From c5971456964290da7e98222892797b71ef793e62 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 3 May 2012 14:48:26 +0100 Subject: [PATCH 001/117] ACPI battery: only refresh the sysfs files when pertinent information changes We only need to regenerate the sysfs files when the capacity units change, avoid the update otherwise. The origin of this issue is dates way back to 2.6.38: da8aeb92d4853f37e281f11fddf61f9c7d84c3cd (ACPI / Battery: Update information on info notification and resume) cc: Signed-off-by: Andy Whitcroft Tested-by: Ralf Jung Signed-off-by: Len Brown --- drivers/acpi/battery.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 86933ca8b472..7dd3f9fb9f3f 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -643,11 +643,19 @@ static int acpi_battery_update(struct acpi_battery *battery) static void acpi_battery_refresh(struct acpi_battery *battery) { + int power_unit; + if (!battery->bat.dev) return; + power_unit = battery->power_unit; + acpi_battery_get_info(battery); - /* The battery may have changed its reporting units. */ + + if (power_unit == battery->power_unit) + return; + + /* The battery has changed its reporting units. */ sysfs_remove_battery(battery); sysfs_add_battery(battery); } From d8e725f356fd5f225ad97f21213fc007e409c9f5 Mon Sep 17 00:00:00 2001 From: Marco Aurelio da Costa Date: Fri, 4 May 2012 18:53:44 +0200 Subject: [PATCH 002/117] ACPI: Ignore invalid _PSS entries, but use valid ones The EliteBook 8560W has non-initialized entries in its _PSS ACPI table. Instead of bailing out when the first non-initialized entry is found, ignore it and use only the valid entries. Only bail out if there is no valid entry at all. [v3: Fixes suggested by Konrad] Signed-off-by: Marco Aurelio da Costa Signed-off-by: Len Brown --- drivers/acpi/processor_perflib.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 0af48a8554cd..a093dc163a42 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -333,6 +333,7 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr) struct acpi_buffer state = { 0, NULL }; union acpi_object *pss = NULL; int i; + int last_invalid = -1; status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer); @@ -394,14 +395,33 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr) ((u32)(px->core_frequency * 1000) != (px->core_frequency * 1000))) { printk(KERN_ERR FW_BUG PREFIX - "Invalid BIOS _PSS frequency: 0x%llx MHz\n", - px->core_frequency); - result = -EFAULT; - kfree(pr->performance->states); - goto end; + "Invalid BIOS _PSS frequency found for processor %d: 0x%llx MHz\n", + pr->id, px->core_frequency); + if (last_invalid == -1) + last_invalid = i; + } else { + if (last_invalid != -1) { + /* + * Copy this valid entry over last_invalid entry + */ + memcpy(&(pr->performance->states[last_invalid]), + px, sizeof(struct acpi_processor_px)); + ++last_invalid; + } } } + if (last_invalid == 0) { + printk(KERN_ERR FW_BUG PREFIX + "No valid BIOS _PSS frequency found for processor %d\n", pr->id); + result = -EFAULT; + kfree(pr->performance->states); + pr->performance->states = NULL; + } + + if (last_invalid > 0) + pr->performance->state_count = last_invalid; + end: kfree(buffer.pointer); From 2aa4ee2a8805ec0260dde971e9e6699917c868a7 Mon Sep 17 00:00:00 2001 From: Jim Kukunas Date: Mon, 28 May 2012 14:10:22 +1000 Subject: [PATCH 003/117] lib/raid6: fix sparse warnings in recovery functions Make the recovery functions static to fix the following sparse warnings: lib/raid6/recov.c:25:6: warning: symbol 'raid6_2data_recov_intx1' was not declared. Should it be static? lib/raid6/recov.c:69:6: warning: symbol 'raid6_datap_recov_intx1' was not declared. Should it be static? lib/raid6/recov_ssse3.c:22:6: warning: symbol 'raid6_2data_recov_ssse3' was not declared. Should it be static? lib/raid6/recov_ssse3.c:197:6: warning: symbol 'raid6_datap_recov_ssse3' was not declared. Should it be static? Reported-by: Fengguang Wu Signed-off-by: Jim Kukunas Signed-off-by: NeilBrown --- lib/raid6/recov.c | 7 ++++--- lib/raid6/recov_ssse3.c | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index 1805a5cc5daa..a95bccb8497d 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -22,8 +22,8 @@ #include /* Recover two failed data blocks. */ -void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, - void **ptrs) +static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, + int failb, void **ptrs) { u8 *p, *q, *dp, *dq; u8 px, qx, db; @@ -66,7 +66,8 @@ void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, } /* Recover failure of one data block plus the P block */ -void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs) +static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, + void **ptrs) { u8 *p, *q, *dq; const u8 *qmul; /* Q multiplier table */ diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index 37ae61930559..ecb710c0b4d9 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -19,8 +19,8 @@ static int raid6_has_ssse3(void) boot_cpu_has(X86_FEATURE_SSSE3); } -void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, - void **ptrs) +static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, + int failb, void **ptrs) { u8 *p, *q, *dp, *dq; const u8 *pbmul; /* P multiplier table for B data */ @@ -194,7 +194,8 @@ void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, } -void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs) +static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, + void **ptrs) { u8 *p, *q, *dq; const u8 *qmul; /* Q multiplier table */ From c3b20037926e607b6cdaecbf9d3103e2ca63bc31 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 May 2012 22:33:02 +0100 Subject: [PATCH 004/117] drm/i915: Reset last_retired_head when resetting ring When we reset the ring control registers, including the HEAD and TAIL of the ring, we also need to reset associated state. In this instance, we were failing to reset the cached value of ring->last_retired_head and so upon the first request for more space following a resume would potentially (depending on a narrow race window) believe that the HEAD had advanced much further than reality. This is a regression from: commit a71d8d94525e8fd855c0466fb586ae1cb008f3a2 Author: Chris Wilson Date: Wed Feb 15 11:25:36 2012 +0000 drm/i915: Record the tail at each request and use it to estimate the head Signed-off-by: Chris Wilson Cc: stable@vger.kernel.org # 3.4 Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index b59b6d5b7583..490df551e312 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -326,6 +326,7 @@ static int init_ring_common(struct intel_ring_buffer *ring) ring->head = I915_READ_HEAD(ring); ring->tail = I915_READ_TAIL(ring) & TAIL_ADDR; ring->space = ring_space(ring); + ring->last_retired_head = -1; } return 0; From cd4fedf9cfe2d453148dce0f105a41725e5107a3 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 23 May 2012 21:02:07 +0530 Subject: [PATCH 005/117] RDMA/ocrdma: Correct queue free count math Correct queue free count math for SQ, RQ for all hardware type. Update user-kernel ABI interface. Signed-off-by: Parav Pandit Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma.h | 1 - drivers/infiniband/hw/ocrdma/ocrdma_abi.h | 5 +---- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 7 ------- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 5 ----- 4 files changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 85a69c958559..037f5cea85bd 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -231,7 +231,6 @@ struct ocrdma_qp_hwq_info { u32 entry_size; u32 max_cnt; u32 max_wqe_idx; - u32 free_delta; u16 dbid; /* qid, where to ring the doorbell. */ u32 len; dma_addr_t pa; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h index a411a4e3193d..517ab20b727c 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h @@ -101,8 +101,6 @@ struct ocrdma_create_qp_uresp { u32 rsvd1; u32 num_wqe_allocated; u32 num_rqe_allocated; - u32 free_wqe_delta; - u32 free_rqe_delta; u32 db_sq_offset; u32 db_rq_offset; u32 db_shift; @@ -126,8 +124,7 @@ struct ocrdma_create_srq_uresp { u32 db_rq_offset; u32 db_shift; - u32 free_rqe_delta; - u32 rsvd2; + u64 rsvd2; u64 rsvd3; } __packed; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 9b204b1ba336..f26314fd5f6e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1990,19 +1990,12 @@ static void ocrdma_get_create_qp_rsp(struct ocrdma_create_qp_rsp *rsp, max_wqe_allocated = 1 << max_wqe_allocated; max_rqe_allocated = 1 << ((u16)rsp->max_wqe_rqe); - if (qp->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { - qp->sq.free_delta = 0; - qp->rq.free_delta = 1; - } else - qp->sq.free_delta = 1; - qp->sq.max_cnt = max_wqe_allocated; qp->sq.max_wqe_idx = max_wqe_allocated - 1; if (!attrs->srq) { qp->rq.max_cnt = max_rqe_allocated; qp->rq.max_wqe_idx = max_rqe_allocated - 1; - qp->rq.free_delta = 1; } } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index e9f74d1b48f6..d16d172b6b6b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -940,8 +940,6 @@ static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; uresp.db_shift = 16; } - uresp.free_wqe_delta = qp->sq.free_delta; - uresp.free_rqe_delta = qp->rq.free_delta; if (qp->dpp_enabled) { uresp.dpp_credit = dpp_credit_lmt; @@ -1307,8 +1305,6 @@ static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q) free_cnt = (q->max_cnt - q->head) + q->tail; else free_cnt = q->tail - q->head; - if (q->free_delta) - free_cnt -= q->free_delta; return free_cnt; } @@ -1501,7 +1497,6 @@ static int ocrdma_copy_srq_uresp(struct ocrdma_srq *srq, struct ib_udata *udata) (srq->pd->id * srq->dev->nic_info.db_page_size); uresp.db_page_size = srq->dev->nic_info.db_page_size; uresp.num_rqe_allocated = srq->rq.max_cnt; - uresp.free_rqe_delta = 1; if (srq->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY) { uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ1_OFFSET; uresp.db_shift = 24; From 804eaf29bac4148aa265bedc62182ed41a4c6120 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 23 May 2012 21:11:17 +0530 Subject: [PATCH 006/117] RDMA/ocrdma: Fix signaled event for SRQ_LIMIT_REACHED Signed-off-by: Parav Pandit Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index f26314fd5f6e..9343a1522977 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -732,7 +732,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, break; case OCRDMA_SRQ_LIMIT_EVENT: ib_evt.element.srq = &qp->srq->ibsrq; - ib_evt.event = IB_EVENT_QP_LAST_WQE_REACHED; + ib_evt.event = IB_EVENT_SRQ_LIMIT_REACHED; srq_event = 1; qp_event = 0; break; From 7ad5e449b96bd82f406ed4657a64c8f72a48896d Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Tue, 29 May 2012 12:51:47 +0530 Subject: [PATCH 007/117] RDMA/ocrdma: Remove unnecessary version.h includes "make versioncheck" shows: drivers/infiniband/hw/ocrdma/ocrdma_main.c: 29 linux/version.h not needed. drivers/infiniband/hw/ocrdma/ocrdma_verbs.h: 31 linux/version.h not needed. Signed-off-by: Devendra Naga Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 - drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index a20d16eaae71..04fef3de6d75 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -26,7 +26,6 @@ *******************************************************************/ #include -#include #include #include #include diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index e6483439f25f..633f03d80274 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -28,7 +28,6 @@ #ifndef __OCRDMA_VERBS_H__ #define __OCRDMA_VERBS_H__ -#include int ocrdma_post_send(struct ib_qp *, struct ib_send_wr *, struct ib_send_wr **bad_wr); int ocrdma_post_recv(struct ib_qp *, struct ib_recv_wr *, From 91557e847dbc715acf2d847a1ffec63f71b00b65 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 12:25:52 -0300 Subject: [PATCH 008/117] perf report: Use the right symbol for annotation In non symbolic views, i.e. --sort without "symbol", as in: perf report --sort comm We're segfaulting in the --tui because we're testing the symbol resolved and then trying to use the symbol on the histogram entry where we're coalescing all hits for a COMM, and the first hist_entry for a comm may have a NULL symbol, i.e. the RIP didn't resolve to any symbol. In this case we're segfaulting, fix it by testing against the symbol in the histogram entry. Reported-by: William Cohen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-8ylwubbcmu27ucc9ffrku3yv@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 8c767c6bca91..2400e009f149 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -162,7 +162,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, * so we don't allocated the extra space needed because the stdio * code will not use it. */ - if (al->sym != NULL && use_browser > 0) { + if (he->ms.sym != NULL && use_browser > 0) { struct annotation *notes = symbol__annotation(he->ms.sym); assert(evsel != NULL); From 107baecaca0b2843f1a0464701b253e51ef6f0e2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 12:31:44 -0300 Subject: [PATCH 009/117] perf annotate browser: Fix help window entry for navigating to hottest line Its 'H', not 'h'. The later is for getting to the help window. Reported-by: Ingo Molnar Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-7zvwphhm815y2zczoxgstzuf@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 4deea6aaf927..34b1c46eaf42 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -668,7 +668,7 @@ static int annotate_browser__run(struct annotate_browser *browser, int evidx, "q/ESC/CTRL+C Exit\n\n" "-> Go to target\n" "<- Exit\n" - "h Cycle thru hottest instructions\n" + "H Cycle thru hottest instructions\n" "j Toggle showing jump to target arrows\n" "J Toggle showing number of jump sources on targets\n" "n Search next string\n" From 79695e1bb65ba0e21488c360a1bed6e358354aaa Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 13:53:54 -0300 Subject: [PATCH 010/117] perf stat: Initialize default events wrt exclude_{guest,host} When no event is specified the tools use perf_evlist__add_default(), that will call event_attr_init to initialize the KVM exclusion bits. When the change was made to the tools so that by default guest samples would be excluded, the changes were made just to the parsing routines and to perf_evlist__add_default(), not to perf_evlist__add_attrs, that is used so far just by perf stat to add multiple events, according to the level of detail specified. Recently the tools were changed to reconstruct the event name from all the details in perf_event_attr, not just from .type and .config, but taking into account all the feature bits (.exclude_{guest,host,user,kernel,etc}, .precise_ip, etc). That is when we noticed that the default for perf stat wasn't the one for the rest of the tools, i.e. the .exclude_guest bit wasn't being set. I.e. the default, that doesn't call event_attr_init was showing the :HG modifier: $ perf stat usleep 1 Performance counter stats for 'usleep 1': 0.942119 task-clock # 0.454 CPUs utilized 1 context-switches # 0.001 M/sec 0 CPU-migrations # 0.000 K/sec 126 page-faults # 0.134 M/sec 693,193 cycles:HG # 0.736 GHz [40.11%] 407,461 stalled-cycles-frontend:HG # 58.78% frontend cycles idle [72.29%] 365,403 stalled-cycles-backend:HG # 52.71% backend cycles idle 465,982 instructions:HG # 0.67 insns per cycle # 0.87 stalled cycles per insn 89,760 branches:HG # 95.275 M/sec 6,178 branch-misses:HG # 6.88% of all branches 0.002077228 seconds time elapsed While if one explicitely specifies the same events, which will make the parsing code to be called and thus event_attr_init is called: $ perf stat -e task-clock,context-switches,migrations,page-faults,cycles,stalled-cycles-frontend,stalled-cycles-backend,instructions,branches,branch-misses usleep 1 Performance counter stats for 'usleep 1': 1.040349 task-clock # 0.500 CPUs utilized 2 context-switches # 0.002 M/sec 0 CPU-migrations # 0.000 K/sec 127 page-faults # 0.122 M/sec 587,966 cycles # 0.565 GHz [13.18%] 459,167 stalled-cycles-frontend # 78.09% frontend cycles idle 390,249 stalled-cycles-backend # 66.37% backend cycles idle 504,006 instructions # 0.86 insns per cycle # 0.91 stalled cycles per insn 96,455 branches # 92.714 M/sec 6,522 branch-misses # 6.76% of all branches [96.12%] 0.002078681 seconds time elapsed Fix it by introducing a perf_evlist__add_default_attrs method that will call evlist_attr_init in all the perf_event_attr entries before adding the events. Reported-by: Ingo Molnar Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-4eysr236r0pgiyum9epwxw7s@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 8 ++++---- tools/perf/util/evlist.c | 11 +++++++++++ tools/perf/util/evlist.h | 4 ++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 62ae30d34fa6..262589991ea4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1129,7 +1129,7 @@ static int add_default_attributes(void) return 0; if (!evsel_list->nr_entries) { - if (perf_evlist__add_attrs_array(evsel_list, default_attrs) < 0) + if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0) return -1; } @@ -1139,21 +1139,21 @@ static int add_default_attributes(void) return 0; /* Append detailed run extra attributes: */ - if (perf_evlist__add_attrs_array(evsel_list, detailed_attrs) < 0) + if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) return -1; if (detailed_run < 2) return 0; /* Append very detailed run extra attributes: */ - if (perf_evlist__add_attrs_array(evsel_list, very_detailed_attrs) < 0) + if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) return -1; if (detailed_run < 3) return 0; /* Append very, very detailed run extra attributes: */ - return perf_evlist__add_attrs_array(evsel_list, very_very_detailed_attrs); + return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); } int cmd_stat(int argc, const char **argv, const char *prefix __used) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4ac5f5ae4ce9..ed277e5627cf 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -159,6 +159,17 @@ out_delete_partial_list: return -1; } +int __perf_evlist__add_default_attrs(struct perf_evlist *evlist, + struct perf_event_attr *attrs, size_t nr_attrs) +{ + size_t i; + + for (i = 0; i < nr_attrs; i++) + event_attr_init(attrs + i); + + return perf_evlist__add_attrs(evlist, attrs, nr_attrs); +} + static int trace_event__id(const char *evname) { char *filename, *colon; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 58abb63ac13a..989bee9624c2 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -54,6 +54,8 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); int perf_evlist__add_attrs(struct perf_evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs); +int __perf_evlist__add_default_attrs(struct perf_evlist *evlist, + struct perf_event_attr *attrs, size_t nr_attrs); int perf_evlist__add_tracepoints(struct perf_evlist *evlist, const char *tracepoints[], size_t nr_tracepoints); int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, @@ -62,6 +64,8 @@ int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, #define perf_evlist__add_attrs_array(evlist, array) \ perf_evlist__add_attrs(evlist, array, ARRAY_SIZE(array)) +#define perf_evlist__add_default_attrs(evlist, array) \ + __perf_evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array)) #define perf_evlist__add_tracepoints_array(evlist, array) \ perf_evlist__add_tracepoints(evlist, array, ARRAY_SIZE(array)) From 52deff71bc2b2c24587ab71f588ff5e4c9279349 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 29 May 2012 22:58:26 -0600 Subject: [PATCH 011/117] perf script: Fix regression in callchain dso name $ perf script -i /tmp/perf.data ... gcc 13623 544315.062858: context-switches: ffffffff815f65c9 __schedule ([kernel.kallsyms]) ffffffff81087cea __cond_resched ([kernel.kallsyms]) ffffffff815f6b92 _cond_resched ([kernel.kallsyms]) ffffffff815fb87a do_page_fault ([kernel.kallsyms]) ffffffff815f8465 page_fault ([kernel.kallsyms]) 2b7a71ea0303 _dl_lookup_symbol_x ([kernel.kallsyms]) 2b7a71ea1eb5 _dl_relocate_object ([kernel.kallsyms]) 2b7a71e99b2e dl_main ([kernel.kallsyms]) 2b7a71eab7f4 _dl_sysdep_start ([kernel.kallsyms]) All DSO's in a callchain are printed as [kernel.kallsyms]. git bisect chased it to: 547a92e0aedb88129e7fbd804697a11949de2e5a is the first bad commit commit 547a92e0aedb88129e7fbd804697a11949de2e5a Author: Akihiro Nagai Date: Mon Jan 30 13:42:57 2012 +0900 perf script: Unify the expressions indicating "unknown" The perf script command uses various expressions to indicate "unknown". It is unfriendly for user scripts to parse it. So, this patch unifies the expressions to "[unknown]". Looks like a copy-paste in that the other references use al.map but this one should be node->map. With this patch you get: $ perf script -i /tmp/perf.data ... gcc 13623 544315.062858: context-switches: ffffffff815f65c9 __schedule ([kernel.kallsyms]) ffffffff81087cea __cond_resched ([kernel.kallsyms]) ffffffff815f6b92 _cond_resched ([kernel.kallsyms]) ffffffff815fb87a do_page_fault ([kernel.kallsyms]) ffffffff815f8465 page_fault ([kernel.kallsyms]) 2b7a71ea0303 _dl_lookup_symbol_x (/lib64/ld-2.14.90.so) 2b7a71ea1eb5 _dl_relocate_object (/lib64/ld-2.14.90.so) 2b7a71e99b2e dl_main (/lib64/ld-2.14.90.so) 2b7a71eab7f4 _dl_sysdep_start (/lib64/ld-2.14.90.so) Signed-off-by: David Ahern Cc: Akihiro Nagai Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1338353906-60706-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 93d355d27109..48206144758e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1460,7 +1460,7 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, } if (print_dso) { printf(" ("); - map__fprintf_dsoname(al.map, stdout); + map__fprintf_dsoname(node->map, stdout); printf(")"); } printf("\n"); From f1439c315b328bbb3f7f6fdd814a7057f9e130d5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 May 2012 15:02:42 -0300 Subject: [PATCH 012/117] perf tools: Fix make tarballs The patch series that introduced the top level tools/ makefile and the libtraceevent broke this feature where files needed to build in a detached tarball were not included in the MANIFEST file and thus not included in the tarball. Fix it by adding the relevant files to the MANIFEST. Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-z3mjj74927xvqwhlmu18kj80@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/MANIFEST | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 5476bc0a1eac..b4b572e8c100 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,4 +1,6 @@ tools/perf +tools/scripts +tools/lib/traceevent include/linux/const.h include/linux/perf_event.h include/linux/rbtree.h From ea1b3ebac9a1c72c4362c784b4ed069a938a4ddb Mon Sep 17 00:00:00 2001 From: Avik Sil Date: Tue, 29 May 2012 16:05:25 +0530 Subject: [PATCH 013/117] perf tools: Fix pager on minimal-install embedded systems Some Distributions may lack "less" package being included by default, e.g., Linaro nano rootfs. In those cases use the portable "pager" command instead of "less". Signed-off-by: Avik Sil Acked-by: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338287725-26382-1-git-send-email-avik.sil@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pager.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c index 1915de20dcac..3322b8446e89 100644 --- a/tools/perf/util/pager.c +++ b/tools/perf/util/pager.c @@ -57,6 +57,10 @@ void setup_pager(void) } if (!pager) pager = getenv("PAGER"); + if (!pager) { + if (!access("/usr/bin/pager", X_OK)) + pager = "/usr/bin/pager"; + } if (!pager) pager = "less"; else if (!*pager || !strcmp(pager, "cat")) From aba336bd1d46d6b0404b06f6915ed76150739057 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 31 May 2012 15:39:11 +1000 Subject: [PATCH 014/117] md: raid1/raid10: fix problem with merge_bvec_fn The new merge_bvec_fn which calls the corresponding function in subsidiary devices requires that mddev->merge_check_needed be set if any child has a merge_bvec_fn. However were were only setting that when a device was hot-added, not when a device was present from the start. This bug was introduced in 3.4 so patch is suitable for 3.4.y kernels. However that are conflicts in raid10.c so a separate patch will be needed for 3.4.y. Cc: stable@vger.kernel.org Reported-by: Sebastian Riemer Signed-off-by: NeilBrown --- drivers/md/raid1.c | 4 ++++ drivers/md/raid10.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 835de7168cd3..a9c7981ddd24 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2550,6 +2550,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); rdev_for_each(rdev, mddev) { + struct request_queue *q; int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2562,6 +2563,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (disk->rdev) goto abort; disk->rdev = rdev; + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; disk->head_position = 0; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 987db37cb875..99ae6068e456 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3475,6 +3475,7 @@ static int run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; + struct request_queue *q; disk_idx = rdev->raid_disk; if (disk_idx < 0) @@ -3493,6 +3494,9 @@ static int run(struct mddev *mddev) goto out_free_conf; disk->rdev = rdev; } + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; diff = (rdev->new_data_offset - rdev->data_offset); if (!mddev->reshape_backwards) diff = -diff; From 9e612a008fa7fe493a473454def56aa321479495 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 31 May 2012 13:08:53 +0100 Subject: [PATCH 015/117] drm/i915/crt: Do not rely upon the HPD presence pin Whilst most monitors do wire up the HPD presence pin, it seems quite a few KVM do not. Therefore if we simply rely on the HPD pin being asserted to indicate a connected monitor we fail miserable, so fall back to performing a DCC query for the EDID. Reported-and-tested-by: Matthieu LAVIE Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=50501 Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_crt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 75a70c46ef1b..844e93e1e395 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -453,13 +453,15 @@ intel_crt_detect(struct drm_connector *connector, bool force) struct intel_load_detect_pipe tmp; if (I915_HAS_HOTPLUG(dev)) { + /* We can not rely on the HPD pin always being correctly wired + * up, for example many KVM do not pass it through, and so + * only trust an assertion that the monitor is connected. + */ if (intel_crt_detect_hotplug(connector)) { DRM_DEBUG_KMS("CRT detected via hotplug\n"); return connector_status_connected; - } else { + } else DRM_DEBUG_KMS("CRT not detected via hotplug\n"); - return connector_status_disconnected; - } } if (intel_crt_detect_ddc(connector)) From 472606458f3e1ced5fe3cc5f04e90a6b5a4732cf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:43:26 +0900 Subject: [PATCH 016/117] perf callchain: Make callchain cursors TLS perf top -G has a race on callchain cursor between main thread and display thread. Since the callchain cursors are used locally make them thread-local data would solve the problem. Signed-off-by: Namhyung Kim Reported-by: Sunjin Yang Suggested-by: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Sunjin Yang Link: http://lkml.kernel.org/r/1338443007-24857-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/util/callchain.c | 2 ++ tools/perf/util/callchain.h | 2 ++ tools/perf/util/hist.c | 7 ++++--- tools/perf/util/hist.h | 2 -- tools/perf/util/session.c | 14 +++++++------- 7 files changed, 17 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2400e009f149..25249f76329d 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -152,7 +152,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, if (symbol_conf.use_callchain) { err = callchain_append(he->callchain, - &evsel->hists.callchain_cursor, + &callchain_cursor, sample->period); if (err) return err; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 871b540293e1..6bb0277b7dfe 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -787,7 +787,7 @@ static void perf_event__process_sample(struct perf_tool *tool, } if (symbol_conf.use_callchain) { - err = callchain_append(he->callchain, &evsel->hists.callchain_cursor, + err = callchain_append(he->callchain, &callchain_cursor, sample->period); if (err) return; diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 9f7106a8d9a4..3a6bff47614f 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -18,6 +18,8 @@ #include "util.h" #include "callchain.h" +__thread struct callchain_cursor callchain_cursor; + bool ip_callchain__valid(struct ip_callchain *chain, const union perf_event *event) { diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 7f9c0f1ae3a9..3bdb407f9cd9 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -76,6 +76,8 @@ struct callchain_cursor { struct callchain_cursor_node *curr; }; +extern __thread struct callchain_cursor callchain_cursor; + static inline void callchain_init(struct callchain_root *root) { INIT_LIST_HEAD(&root->node.siblings); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 1293b5ebea4d..514e2a4b367d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -378,7 +378,7 @@ void hist_entry__free(struct hist_entry *he) * collapse the histogram */ -static bool hists__collapse_insert_entry(struct hists *hists, +static bool hists__collapse_insert_entry(struct hists *hists __used, struct rb_root *root, struct hist_entry *he) { @@ -397,8 +397,9 @@ static bool hists__collapse_insert_entry(struct hists *hists, iter->period += he->period; iter->nr_events += he->nr_events; if (symbol_conf.use_callchain) { - callchain_cursor_reset(&hists->callchain_cursor); - callchain_merge(&hists->callchain_cursor, iter->callchain, + callchain_cursor_reset(&callchain_cursor); + callchain_merge(&callchain_cursor, + iter->callchain, he->callchain); } hist_entry__free(he); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index cfc64e293f90..34bb556d6219 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -67,8 +67,6 @@ struct hists { struct events_stats stats; u64 event_stream; u16 col_len[HISTC_NR_COLS]; - /* Best would be to reuse the session callchain cursor */ - struct callchain_cursor callchain_cursor; }; struct hist_entry *__hists__add_entry(struct hists *self, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 48206144758e..3b6f8e460a31 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -288,7 +288,8 @@ struct branch_info *machine__resolve_bstack(struct machine *self, return bi; } -int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, +int machine__resolve_callchain(struct machine *self, + struct perf_evsel *evsel __used, struct thread *thread, struct ip_callchain *chain, struct symbol **parent) @@ -297,7 +298,7 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, unsigned int i; int err; - callchain_cursor_reset(&evsel->hists.callchain_cursor); + callchain_cursor_reset(&callchain_cursor); for (i = 0; i < chain->nr; i++) { u64 ip; @@ -333,7 +334,7 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, break; } - err = callchain_cursor_append(&evsel->hists.callchain_cursor, + err = callchain_cursor_append(&callchain_cursor, ip, al.map, al.sym); if (err) return err; @@ -1428,7 +1429,6 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, int print_sym, int print_dso, int print_symoffset) { struct addr_location al; - struct callchain_cursor *cursor = &evsel->hists.callchain_cursor; struct callchain_cursor_node *node; if (perf_event__preprocess_sample(event, machine, &al, sample, @@ -1446,10 +1446,10 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, error("Failed to resolve callchain. Skipping\n"); return; } - callchain_cursor_commit(cursor); + callchain_cursor_commit(&callchain_cursor); while (1) { - node = callchain_cursor_current(cursor); + node = callchain_cursor_current(&callchain_cursor); if (!node) break; @@ -1465,7 +1465,7 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, } printf("\n"); - callchain_cursor_advance(cursor); + callchain_cursor_advance(&callchain_cursor); } } else { From 114067b69e7b2c691faace0e33db2f04096f668d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:43:27 +0900 Subject: [PATCH 017/117] perf tools: Check if callchain is corrupted We faced segmentation fault on perf top -G at very high sampling rate due to a corrupted callchain. While the root cause was not revealed (I failed to figure it out), this patch tries to protect us from the segfault on such cases. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Sunjin Yang Link: http://lkml.kernel.org/r/1338443007-24857-2-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 4 ++-- tools/perf/util/session.c | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f32578634d9d..1817d4015e5f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -555,6 +555,8 @@ enum perf_event_type { PERF_RECORD_MAX, /* non-ABI */ }; +#define PERF_MAX_STACK_DEPTH 255 + enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, @@ -609,8 +611,6 @@ struct perf_guest_info_callbacks { #include #include -#define PERF_MAX_STACK_DEPTH 255 - struct perf_callchain_entry { __u64 nr; __u64 ip[PERF_MAX_STACK_DEPTH]; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 3b6f8e460a31..04d1e33f4592 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -300,6 +300,11 @@ int machine__resolve_callchain(struct machine *self, callchain_cursor_reset(&callchain_cursor); + if (chain->nr > PERF_MAX_STACK_DEPTH) { + pr_warning("corrupted callchain. skipping...\n"); + return 0; + } + for (i = 0; i < chain->nr; i++) { u64 ip; struct addr_location al; @@ -318,7 +323,14 @@ int machine__resolve_callchain(struct machine *self, case PERF_CONTEXT_USER: cpumode = PERF_RECORD_MISC_USER; break; default: - break; + pr_debug("invalid callchain context: " + "%"PRId64"\n", (s64) ip); + /* + * It seems the callchain is corrupted. + * Discard all. + */ + callchain_cursor_reset(&callchain_cursor); + return 0; } continue; } From aa5cdd308ddbb08959534be408eba8ef040b5989 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 31 May 2012 11:17:34 -0300 Subject: [PATCH 018/117] perf tools: Make --version show kernel version instead of pull req tag Before: $ perf --version perf version perf.urgent.for.mingo.5.g37da28 After: $ perf --version perf version 3.4.8941.g37da28.dirty Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-vc9b4e6023iegz9kabr3yvyv@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/PERF-VERSION-GEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN index ad73300f7bac..95264f304179 100755 --- a/tools/perf/util/PERF-VERSION-GEN +++ b/tools/perf/util/PERF-VERSION-GEN @@ -12,7 +12,7 @@ LF=' # First check if there is a .git to get the version from git describe # otherwise try to get the version from the kernel makefile if test -d ../../.git -o -f ../../.git && - VN=$(git describe --abbrev=4 HEAD 2>/dev/null) && + VN=$(git describe --match 'v[0-9].[0-9]*' --abbrev=4 HEAD 2>/dev/null) && case "$VN" in *$LF*) (exit 1) ;; v[0-9]*) From a59e64a13a927fb7530bef39e9f5e7de8268137e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:51:45 +0900 Subject: [PATCH 019/117] perf tools: Update ioctl documentation for PERF_IOC_FLAG_GROUP The ioctl interface of perf event fd receives 3 arguments to control event group behavior but it lacked documentation. Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338443506-25009-2-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/design.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/design.txt b/tools/perf/design.txt index bd0bb1b1279b..67e5d0cace85 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -409,14 +409,15 @@ Counters can be enabled and disabled in two ways: via ioctl and via prctl. When a counter is disabled, it doesn't count or generate events but does continue to exist and maintain its count value. -An individual counter or counter group can be enabled with +An individual counter can be enabled with - ioctl(fd, PERF_EVENT_IOC_ENABLE); + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); or disabled with - ioctl(fd, PERF_EVENT_IOC_DISABLE); + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); +For a counter group, pass PERF_IOC_FLAG_GROUP as the third argument. Enabling or disabling the leader of a group enables or disables the whole group; that is, while the group leader is disabled, none of the counters in the group will count. Enabling or disabling a member of a From 55da80059de6c7533724fcd95f16c5d5618ecf4d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:51:46 +0900 Subject: [PATCH 020/117] perf evlist: Pass third argument to ioctl explicitly The ioctl on perf event fd wants 3 arguments but we only passed 2. As the only user of the functions is perf record and it calls them for every event (regardless of group setting), just pass 0 for now. Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338443506-25009-3-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ed277e5627cf..7400fb3fc50c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -274,7 +274,8 @@ void perf_evlist__disable(struct perf_evlist *evlist) for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { list_for_each_entry(pos, &evlist->entries, node) { for (thread = 0; thread < evlist->threads->nr; thread++) - ioctl(FD(pos, cpu, thread), PERF_EVENT_IOC_DISABLE); + ioctl(FD(pos, cpu, thread), + PERF_EVENT_IOC_DISABLE, 0); } } } @@ -287,7 +288,8 @@ void perf_evlist__enable(struct perf_evlist *evlist) for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { list_for_each_entry(pos, &evlist->entries, node) { for (thread = 0; thread < evlist->threads->nr; thread++) - ioctl(FD(pos, cpu, thread), PERF_EVENT_IOC_ENABLE); + ioctl(FD(pos, cpu, thread), + PERF_EVENT_IOC_ENABLE, 0); } } } From 8db4841fc72acc58254029f050226ea5f8103854 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 30 May 2012 14:23:42 +0200 Subject: [PATCH 021/117] perf symbols: Handle different endians properly during symbol load Currently we dont care about the file object's endianness. It's possible we read buildid file object from different architecture than we are currentlly running on. So we need to care about properly reading such object's data - handle different endianness properly. Adding: needs_swap DSO field dso__swap_init function to initialize DSO's needs_swap DSO__SWAP to read the data with proper swaps Together with other endianity patches, this change fixies perf report discrepancies on origin and target systems as described in test 1 below, e.g. following perf report diff: ... 0.12% ps [kernel.kallsyms] [k] clear_page - 0.12% awk bash [.] alloc_word_desc + 0.12% awk bash [.] yyparse 0.11% beah-rhts-task libpython2.6.so.1.0 [.] 0x5560e 0.10% perf libc-2.12.so [.] __ctype_toupper_loc - 0.09% rhts-test-runne bash [.] maybe_make_export_env + 0.09% rhts-test-runne bash [.] 0x385a0 0.09% ps [kernel.kallsyms] [k] page_fault ... Note, running following to test perf endianity handling: test 1) - origin system: # perf record -a -- sleep 10 (any perf record will do) # perf report > report.origin # perf archive perf.data - copy the perf.data, report.origin and perf.data.tar.bz2 to a target system and run: # tar xjvf perf.data.tar.bz2 -C ~/.debug # perf report > report.target # diff -u report.origin report.target - the diff should produce no output (besides some white space stuff and possibly different date/TZ output) test 1) - origin system: # perf record -ag -fo /tmp/perf.data -- sleep 1 - mount origin system root to the target system on /mnt/origin - target system: # perf script --symfs /mnt/origin -I -i /mnt/origin/tmp/perf.data \ --kallsyms /mnt/origin/proc/kallsyms - complete perf.data header is displayed Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338380624-7443-2-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 33 ++++++++++++++++++++++++++++++++- tools/perf/util/symbol.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index e2ba8858f3e1..9d04dcd91815 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -323,6 +323,7 @@ struct dso *dso__new(const char *name) dso->sorted_by_name = 0; dso->has_build_id = 0; dso->kernel = DSO_TYPE_USER; + dso->needs_swap = DSO_SWAP__UNSET; INIT_LIST_HEAD(&dso->node); } @@ -1156,6 +1157,33 @@ static size_t elf_addr_to_index(Elf *elf, GElf_Addr addr) return -1; } +static int dso__swap_init(struct dso *dso, unsigned char eidata) +{ + static unsigned int const endian = 1; + + dso->needs_swap = DSO_SWAP__NO; + + switch (eidata) { + case ELFDATA2LSB: + /* We are big endian, DSO is little endian. */ + if (*(unsigned char const *)&endian != 1) + dso->needs_swap = DSO_SWAP__YES; + break; + + case ELFDATA2MSB: + /* We are little endian, DSO is big endian. */ + if (*(unsigned char const *)&endian != 0) + dso->needs_swap = DSO_SWAP__YES; + break; + + default: + pr_err("unrecognized DSO data encoding %d\n", eidata); + return -EINVAL; + } + + return 0; +} + static int dso__load_sym(struct dso *dso, struct map *map, const char *name, int fd, symbol_filter_t filter, int kmodule, int want_symtab) @@ -1187,6 +1215,9 @@ static int dso__load_sym(struct dso *dso, struct map *map, const char *name, goto out_elf_end; } + if (dso__swap_init(dso, ehdr.e_ident[EI_DATA])) + goto out_elf_end; + /* Always reject images with a mismatched build-id: */ if (dso->has_build_id) { u8 build_id[BUILD_ID_SIZE]; @@ -1272,7 +1303,7 @@ static int dso__load_sym(struct dso *dso, struct map *map, const char *name, if (opdsec && sym.st_shndx == opdidx) { u32 offset = sym.st_value - opdshdr.sh_addr; u64 *opd = opddata->d_buf + offset; - sym.st_value = *opd; + sym.st_value = DSO__SWAP(dso, u64, *opd); sym.st_shndx = elf_addr_to_index(elf, sym.st_value); } diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 5649d63798cb..af0752b1aca1 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef HAVE_CPLUS_DEMANGLE extern char *cplus_demangle(const char *, int); @@ -160,11 +161,18 @@ enum dso_kernel_type { DSO_TYPE_GUEST_KERNEL }; +enum dso_swap_type { + DSO_SWAP__UNSET, + DSO_SWAP__NO, + DSO_SWAP__YES, +}; + struct dso { struct list_head node; struct rb_root symbols[MAP__NR_TYPES]; struct rb_root symbol_names[MAP__NR_TYPES]; enum dso_kernel_type kernel; + enum dso_swap_type needs_swap; u8 adjust_symbols:1; u8 has_build_id:1; u8 hit:1; @@ -182,6 +190,28 @@ struct dso { char name[0]; }; +#define DSO__SWAP(dso, type, val) \ +({ \ + type ____r = val; \ + BUG_ON(dso->needs_swap == DSO_SWAP__UNSET); \ + if (dso->needs_swap == DSO_SWAP__YES) { \ + switch (sizeof(____r)) { \ + case 2: \ + ____r = bswap_16(val); \ + break; \ + case 4: \ + ____r = bswap_32(val); \ + break; \ + case 8: \ + ____r = bswap_64(val); \ + break; \ + default: \ + BUG_ON(1); \ + } \ + } \ + ____r; \ +}) + struct dso *dso__new(const char *name); void dso__delete(struct dso *dso); From 268fb20f832e1eb4afd5113ee31fef9332986b13 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 30 May 2012 14:23:43 +0200 Subject: [PATCH 022/117] perf session: Handle endianity swap on sample_id_all header data Adding endianity swapping for event header attached via sample_id_all. Currently we dont do that and it's causing wrong data to be read when running report on architecture with different endianity than the record. The perf is currently able to process 32-bit PPC samples on 32-bit and 64-bit x86. Together with other endianity patches, this change fixies perf report discrepancies on origin and target systems as described in test 1 below, e.g. following perf report diff: ... 0.12% ps [kernel.kallsyms] [k] clear_page - 0.12% awk bash [.] alloc_word_desc + 0.12% awk bash [.] yyparse 0.11% beah-rhts-task libpython2.6.so.1.0 [.] 0x5560e 0.10% perf libc-2.12.so [.] __ctype_toupper_loc - 0.09% rhts-test-runne bash [.] maybe_make_export_env + 0.09% rhts-test-runne bash [.] 0x385a0 0.09% ps [kernel.kallsyms] [k] page_fault ... Note, running following to test perf endianity handling: test 1) - origin system: # perf record -a -- sleep 10 (any perf record will do) # perf report > report.origin # perf archive perf.data - copy the perf.data, report.origin and perf.data.tar.bz2 to a target system and run: # tar xjvf perf.data.tar.bz2 -C ~/.debug # perf report > report.target # diff -u report.origin report.target - the diff should produce no output (besides some white space stuff and possibly different date/TZ output) test 2) - origin system: # perf record -ag -fo /tmp/perf.data -- sleep 1 - mount origin system root to the target system on /mnt/origin - target system: # perf script --symfs /mnt/origin -I -i /mnt/origin/tmp/perf.data \ --kallsyms /mnt/origin/proc/kallsyms - complete perf.data header is displayed Signed-off-by: Jiri Olsa Reviewed-by: David Ahern Tested-by: David Ahern Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338380624-7443-3-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 67 ++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 04d1e33f4592..2600916efa83 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -454,37 +454,65 @@ void mem_bswap_64(void *src, int byte_size) } } -static void perf_event__all64_swap(union perf_event *event) +static void swap_sample_id_all(union perf_event *event, void *data) +{ + void *end = (void *) event + event->header.size; + int size = end - data; + + BUG_ON(size % sizeof(u64)); + mem_bswap_64(data, size); +} + +static void perf_event__all64_swap(union perf_event *event, + bool sample_id_all __used) { struct perf_event_header *hdr = &event->header; mem_bswap_64(hdr + 1, event->header.size - sizeof(*hdr)); } -static void perf_event__comm_swap(union perf_event *event) +static void perf_event__comm_swap(union perf_event *event, bool sample_id_all) { event->comm.pid = bswap_32(event->comm.pid); event->comm.tid = bswap_32(event->comm.tid); + + if (sample_id_all) { + void *data = &event->comm.comm; + + data += ALIGN(strlen(data) + 1, sizeof(u64)); + swap_sample_id_all(event, data); + } } -static void perf_event__mmap_swap(union perf_event *event) +static void perf_event__mmap_swap(union perf_event *event, + bool sample_id_all) { event->mmap.pid = bswap_32(event->mmap.pid); event->mmap.tid = bswap_32(event->mmap.tid); event->mmap.start = bswap_64(event->mmap.start); event->mmap.len = bswap_64(event->mmap.len); event->mmap.pgoff = bswap_64(event->mmap.pgoff); + + if (sample_id_all) { + void *data = &event->mmap.filename; + + data += ALIGN(strlen(data) + 1, sizeof(u64)); + swap_sample_id_all(event, data); + } } -static void perf_event__task_swap(union perf_event *event) +static void perf_event__task_swap(union perf_event *event, bool sample_id_all) { event->fork.pid = bswap_32(event->fork.pid); event->fork.tid = bswap_32(event->fork.tid); event->fork.ppid = bswap_32(event->fork.ppid); event->fork.ptid = bswap_32(event->fork.ptid); event->fork.time = bswap_64(event->fork.time); + + if (sample_id_all) + swap_sample_id_all(event, &event->fork + 1); } -static void perf_event__read_swap(union perf_event *event) +static void perf_event__read_swap(union perf_event *event, bool sample_id_all) { event->read.pid = bswap_32(event->read.pid); event->read.tid = bswap_32(event->read.tid); @@ -492,6 +520,9 @@ static void perf_event__read_swap(union perf_event *event) event->read.time_enabled = bswap_64(event->read.time_enabled); event->read.time_running = bswap_64(event->read.time_running); event->read.id = bswap_64(event->read.id); + + if (sample_id_all) + swap_sample_id_all(event, &event->read + 1); } static u8 revbyte(u8 b) @@ -543,7 +574,8 @@ void perf_event__attr_swap(struct perf_event_attr *attr) swap_bitfield((u8 *) (&attr->read_format + 1), sizeof(u64)); } -static void perf_event__hdr_attr_swap(union perf_event *event) +static void perf_event__hdr_attr_swap(union perf_event *event, + bool sample_id_all __used) { size_t size; @@ -554,18 +586,21 @@ static void perf_event__hdr_attr_swap(union perf_event *event) mem_bswap_64(event->attr.id, size); } -static void perf_event__event_type_swap(union perf_event *event) +static void perf_event__event_type_swap(union perf_event *event, + bool sample_id_all __used) { event->event_type.event_type.event_id = bswap_64(event->event_type.event_type.event_id); } -static void perf_event__tracing_data_swap(union perf_event *event) +static void perf_event__tracing_data_swap(union perf_event *event, + bool sample_id_all __used) { event->tracing_data.size = bswap_32(event->tracing_data.size); } -typedef void (*perf_event__swap_op)(union perf_event *event); +typedef void (*perf_event__swap_op)(union perf_event *event, + bool sample_id_all); static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_MMAP] = perf_event__mmap_swap, @@ -999,6 +1034,15 @@ static int perf_session__process_user_event(struct perf_session *session, union } } +static void event_swap(union perf_event *event, bool sample_id_all) +{ + perf_event__swap_op swap; + + swap = perf_event__swap_ops[event->header.type]; + if (swap) + swap(event, sample_id_all); +} + static int perf_session__process_event(struct perf_session *session, union perf_event *event, struct perf_tool *tool, @@ -1007,9 +1051,8 @@ static int perf_session__process_event(struct perf_session *session, struct perf_sample sample; int ret; - if (session->header.needs_swap && - perf_event__swap_ops[event->header.type]) - perf_event__swap_ops[event->header.type](event); + if (session->header.needs_swap) + event_swap(event, session->sample_id_all); if (event->header.type >= PERF_RECORD_HEADER_MAX) return -EINVAL; From 37073f9e449bc430e6c40b9cffc2558002a0256a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 30 May 2012 14:23:44 +0200 Subject: [PATCH 023/117] perf evsel: Fix 32 bit values endianity swap for sample_id_all header We swap the sample_id_all header by u64 pointers. Some members of the header happen to be 32 bit values. We need to handle them separatelly. Together with other endianity patches, this change fixies perf report discrepancies on origin and target systems as described in test 1 below, e.g. following perf report diff: ... 0.12% ps [kernel.kallsyms] [k] clear_page - 0.12% awk bash [.] alloc_word_desc + 0.12% awk bash [.] yyparse 0.11% beah-rhts-task libpython2.6.so.1.0 [.] 0x5560e 0.10% perf libc-2.12.so [.] __ctype_toupper_loc - 0.09% rhts-test-runne bash [.] maybe_make_export_env + 0.09% rhts-test-runne bash [.] 0x385a0 0.09% ps [kernel.kallsyms] [k] page_fault ... Note, running following to test perf endianity handling: test 1) - origin system: # perf record -a -- sleep 10 (any perf record will do) # perf report > report.origin # perf archive perf.data - copy the perf.data, report.origin and perf.data.tar.bz2 to a target system and run: # tar xjvf perf.data.tar.bz2 -C ~/.debug # perf report > report.target # diff -u report.origin report.target - the diff should produce no output (besides some white space stuff and possibly different date/TZ output) test 2) - origin system: # perf record -ag -fo /tmp/perf.data -- sleep 1 - mount origin system root to the target system on /mnt/origin - target system: # perf script --symfs /mnt/origin -I -i /mnt/origin/tmp/perf.data \ --kallsyms /mnt/origin/proc/kallsyms - complete perf.data header is displayed Signed-off-by: Jiri Olsa Reviewed-by: David Ahern Tested-by: David Ahern Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338380624-7443-4-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 91d19138f3ec..9f6cebd798ee 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -494,16 +494,24 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel, } static int perf_event__parse_id_sample(const union perf_event *event, u64 type, - struct perf_sample *sample) + struct perf_sample *sample, + bool swapped) { const u64 *array = event->sample.array; + union u64_swap u; array += ((event->header.size - sizeof(event->header)) / sizeof(u64)) - 1; if (type & PERF_SAMPLE_CPU) { - u32 *p = (u32 *)array; - sample->cpu = *p; + u.val64 = *array; + if (swapped) { + /* undo swap of u64, then swap on individual u32s */ + u.val64 = bswap_64(u.val64); + u.val32[0] = bswap_32(u.val32[0]); + } + + sample->cpu = u.val32[0]; array--; } @@ -523,9 +531,16 @@ static int perf_event__parse_id_sample(const union perf_event *event, u64 type, } if (type & PERF_SAMPLE_TID) { - u32 *p = (u32 *)array; - sample->pid = p[0]; - sample->tid = p[1]; + u.val64 = *array; + if (swapped) { + /* undo swap of u64, then swap on individual u32s */ + u.val64 = bswap_64(u.val64); + u.val32[0] = bswap_32(u.val32[0]); + u.val32[1] = bswap_32(u.val32[1]); + } + + sample->pid = u.val32[0]; + sample->tid = u.val32[1]; } return 0; @@ -562,7 +577,7 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, if (event->header.type != PERF_RECORD_SAMPLE) { if (!sample_id_all) return 0; - return perf_event__parse_id_sample(event, type, data); + return perf_event__parse_id_sample(event, type, data, swapped); } array = event->sample.array; From 378474e4b23183002f1791b294a4440b6b7df36a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 31 May 2012 17:16:56 +0530 Subject: [PATCH 024/117] perf symbols: Check for valid dso before creating map dso__new() can return NULL. Hence verify dso before creating a new map. Signed-off-by: Srikar Dronamraju Suggested-by: Arnaldo Carvalho de Melo Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20120531114656.23691.54223.sendpatchset@srdronam.in.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 9d04dcd91815..3e2e5ea0f03f 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -2817,8 +2817,11 @@ int machine__load_vmlinux_path(struct machine *machine, enum map_type type, struct map *dso__new_map(const char *name) { + struct map *map = NULL; struct dso *dso = dso__new(name); - struct map *map = map__new2(0, dso, MAP__FUNCTION); + + if (dso) + map = map__new2(0, dso, MAP__FUNCTION); return map; } From a23c4dc422cff7bd30f40f5dc7c9ac4a6339005a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 31 May 2012 17:16:43 +0530 Subject: [PATCH 025/117] perf uprobes: Remove unnecessary check before strlist__delete Since strlist__delete() itself checks, the additional check before calling strlist__delete() is redundant. No Functional change. Signed-off-by: Srikar Dronamraju Suggested-by: Arnaldo Carvalho de Melo Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20120531114643.23691.38666.sendpatchset@srdronam.in.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 59dccc98b554..0dda25d82d06 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2164,16 +2164,12 @@ int del_perf_probe_events(struct strlist *dellist) error: if (kfd >= 0) { - if (namelist) - strlist__delete(namelist); - + strlist__delete(namelist); close(kfd); } if (ufd >= 0) { - if (unamelist) - strlist__delete(unamelist); - + strlist__delete(unamelist); close(ufd); } From cb7225feec627e91d598198996429e9ee6804f8d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:51:44 +0900 Subject: [PATCH 026/117] perf: Remove duplicate invocation on perf_event_for_each The @func callback was invoked twice for group leader when perf_event_for_each() called. It seems the commit 75f937f24bd9 ("perf_counter: Fix ctx->mutex vs counter ->mutex inversion") made the mistake during the change. Signed-off-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338443506-25009-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b06cbbf6931..f85c0154b333 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3181,7 +3181,6 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); From cfb46f433a4da97c31780e08a259fac2cb6bd61f Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Apr 2012 14:33:33 +0100 Subject: [PATCH 027/117] acpi_video: fix leaking PCI references Signed-off-by: Alan Cox Acked-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/video.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 9577b6fa2650..66e8f7333e9b 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1745,6 +1745,7 @@ static int acpi_video_bus_remove(struct acpi_device *device, int type) static int __init intel_opregion_present(void) { + int i915 = 0; #if defined(CONFIG_DRM_I915) || defined(CONFIG_DRM_I915_MODULE) struct pci_dev *dev = NULL; u32 address; @@ -1757,10 +1758,10 @@ static int __init intel_opregion_present(void) pci_read_config_dword(dev, 0xfc, &address); if (!address) continue; - return 1; + i915 = 1; } #endif - return 0; + return i915; } int acpi_video_register(void) From c6996bdd850fb53319918487d5f674203517fdc5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Apr 2012 14:33:48 +0100 Subject: [PATCH 028/117] acpi_video: Intel video is not always i915 Stop it poking at random registers on the i740 cards that may be out there still. As per Matthew's feedback remove the conditional checks and never enable the opregion handling unless an appropriate driver has been loaded. Signed-off-by: Alan Cox Signed-off-by: Len Brown --- drivers/acpi/video.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 66e8f7333e9b..609262dc1258 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1743,10 +1743,18 @@ static int acpi_video_bus_remove(struct acpi_device *device, int type) return 0; } +static int __init is_i740(struct pci_dev *dev) +{ + if (dev->device == 0x00D1) + return 1; + if (dev->device == 0x7000) + return 1; + return 0; +} + static int __init intel_opregion_present(void) { - int i915 = 0; -#if defined(CONFIG_DRM_I915) || defined(CONFIG_DRM_I915_MODULE) + int opregion = 0; struct pci_dev *dev = NULL; u32 address; @@ -1755,13 +1763,15 @@ static int __init intel_opregion_present(void) continue; if (dev->vendor != PCI_VENDOR_ID_INTEL) continue; + /* We don't want to poke around undefined i740 registers */ + if (is_i740(dev)) + continue; pci_read_config_dword(dev, 0xfc, &address); if (!address) continue; - i915 = 1; + opregion = 1; } -#endif - return i915; + return opregion; } int acpi_video_register(void) From 155689defc782b486a7e6776a57ecc4ebb37ed52 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Apr 2012 14:34:04 +0100 Subject: [PATCH 029/117] gma500: don't register the ACPI video bus We are not yet ready for this and it makes a mess on some devices. Signed-off-by: Alan Cox Signed-off-by: Len Brown --- drivers/gpu/drm/gma500/psb_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/gma500/psb_drv.c b/drivers/gpu/drm/gma500/psb_drv.c index c34adf9d910a..09af2ffbb307 100644 --- a/drivers/gpu/drm/gma500/psb_drv.c +++ b/drivers/gpu/drm/gma500/psb_drv.c @@ -349,7 +349,7 @@ static int psb_driver_load(struct drm_device *dev, unsigned long chipset) PSB_WSGX32(0x30000000, PSB_CR_BIF_3D_REQ_BASE); /* igd_opregion_init(&dev_priv->opregion_dev); */ - acpi_video_register(); +/* acpi_video_register(); */ if (dev_priv->lid_state) psb_lid_timer_init(dev_priv); From 301f33fbcf4ced53b3de114846ecece5d6aafeeb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Apr 2012 14:19:16 +0300 Subject: [PATCH 030/117] ACPI video: use after input_unregister_device() We can't use "input" anymore after calling input_unregister_device(). The call to input_free_device() is a double free. The normal way to deal with this is to make input_register_device() the last function called in the function. Signed-off-by: Dan Carpenter Acked-by: Dmitry Torokhov Signed-off-by: Len Brown --- drivers/acpi/video.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 609262dc1258..a576575617d7 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1687,10 +1687,6 @@ static int acpi_video_bus_add(struct acpi_device *device) set_bit(KEY_BRIGHTNESS_ZERO, input->keybit); set_bit(KEY_DISPLAY_OFF, input->keybit); - error = input_register_device(input); - if (error) - goto err_stop_video; - printk(KERN_INFO PREFIX "%s [%s] (multi-head: %s rom: %s post: %s)\n", ACPI_VIDEO_DEVICE_NAME, acpi_device_bid(device), video->flags.multihead ? "yes" : "no", @@ -1701,12 +1697,16 @@ static int acpi_video_bus_add(struct acpi_device *device) video->pm_nb.priority = 0; error = register_pm_notifier(&video->pm_nb); if (error) - goto err_unregister_input_dev; + goto err_stop_video; + + error = input_register_device(input); + if (error) + goto err_unregister_pm_notifier; return 0; - err_unregister_input_dev: - input_unregister_device(input); + err_unregister_pm_notifier: + unregister_pm_notifier(&video->pm_nb); err_stop_video: acpi_video_bus_stop_devices(video); err_free_input_dev: From ca4620853a5b578725edec1fc8f1345db3a823ab Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 1 Jun 2012 23:28:23 -0700 Subject: [PATCH 031/117] MAINTAINERS: Update my e-mail address Maintainer activities moved to home server. Update e-mail address to reflect reality. Signed-off-by: Guenter Roeck Acked-by: Jean Delvare --- MAINTAINERS | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 55f0fda602ec..5b75b9706d2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2270,7 +2270,7 @@ F: include/linux/device-mapper.h F: include/linux/dm-*.h DIOLAN U2C-12 I2C DRIVER -M: Guenter Roeck +M: Guenter Roeck L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c @@ -3138,7 +3138,7 @@ F: drivers/tty/hvc/ HARDWARE MONITORING M: Jean Delvare -M: Guenter Roeck +M: Guenter Roeck L: lm-sensors@lm-sensors.org W: http://www.lm-sensors.org/ T: quilt kernel.org/pub/linux/kernel/people/jdelvare/linux-2.6/jdelvare-hwmon/ @@ -4411,6 +4411,13 @@ S: Orphan F: drivers/video/matrox/matroxfb_* F: include/linux/matroxfb.h +MAX16065 HARDWARE MONITOR DRIVER +M: Guenter Roeck +L: lm-sensors@lm-sensors.org +S: Maintained +F: Documentation/hwmon/max16065 +F: drivers/hwmon/max16065.c + MAX6650 HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: "Hans J. Koch" L: lm-sensors@lm-sensors.org @@ -5149,7 +5156,7 @@ F: drivers/leds/leds-pca9532.c F: include/linux/leds-pca9532.h PCA9541 I2C BUS MASTER SELECTOR DRIVER -M: Guenter Roeck +M: Guenter Roeck L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/muxes/i2c-mux-pca9541.c @@ -5299,7 +5306,7 @@ F: drivers/video/fb-puv3.c F: drivers/rtc/rtc-puv3.c PMBUS HARDWARE MONITORING DRIVERS -M: Guenter Roeck +M: Guenter Roeck L: lm-sensors@lm-sensors.org W: http://www.lm-sensors.org/ W: http://www.roeck-us.net/linux/drivers/ From d15cf7c129fa4ec4b44c52521e49ffafb9749029 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 3 Jun 2012 23:24:00 -0400 Subject: [PATCH 032/117] tools/power turbostat: fix un-intended affinity of forked program Linux 3.4 included a modification to turbostat to lower cross-call overhead by using scheduler affinity: 15aaa34654831e98dd76f7738b6c7f5d05a66430 (tools turbostat: reduce measurement overhead due to IPIs) In the use-case where turbostat forks a child program, that change had the un-intended side-effect of binding the child to the last cpu in the system. This change removed the binding before forking the child. This is a back-port of a fix already included in turbostat v2. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 28 +++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ab2f682fd44c..d5d6a3dc9bac 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -73,8 +73,8 @@ int backwards_count; char *progname; int num_cpus; -cpu_set_t *cpu_mask; -size_t cpu_mask_size; +cpu_set_t *cpu_present_set, *cpu_mask; +size_t cpu_present_setsize, cpu_mask_size; struct counters { unsigned long long tsc; /* per thread */ @@ -103,6 +103,12 @@ struct timeval tv_even; struct timeval tv_odd; struct timeval tv_delta; +int mark_cpu_present(int pkg, int core, int cpu) +{ + CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set); + return 0; +} + /* * cpu_mask_init(ncpus) * @@ -118,6 +124,18 @@ void cpu_mask_init(int ncpus) } cpu_mask_size = CPU_ALLOC_SIZE(ncpus); CPU_ZERO_S(cpu_mask_size, cpu_mask); + + /* + * Allocate and initialize cpu_present_set + */ + cpu_present_set = CPU_ALLOC(ncpus); + if (cpu_present_set == NULL) { + perror("CPU_ALLOC"); + exit(3); + } + cpu_present_setsize = CPU_ALLOC_SIZE(ncpus); + CPU_ZERO_S(cpu_present_setsize, cpu_present_set); + for_all_cpus(mark_cpu_present); } void cpu_mask_uninit() @@ -125,6 +143,9 @@ void cpu_mask_uninit() CPU_FREE(cpu_mask); cpu_mask = NULL; cpu_mask_size = 0; + CPU_FREE(cpu_present_set); + cpu_present_set = NULL; + cpu_present_setsize = 0; } int cpu_migrate(int cpu) @@ -1047,6 +1068,9 @@ int fork_it(char **argv) int retval; pid_t child_pid; get_counters(cnt_even); + + /* clear affinity side-effect of get_counters() */ + sched_setaffinity(0, cpu_present_setsize, cpu_present_set); gettimeofday(&tv_even, (struct timezone *)NULL); child_pid = fork(); From 650a37f32d2bc16fa802075be579802bc4ec4132 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 3 Jun 2012 23:34:44 -0400 Subject: [PATCH 033/117] tools/power turbostat: fix IVB support Initial IVB support went into turbostat in Linux-3.1: 553575f1ae048aa44682b46b3c51929a0b3ad337 (tools turbostat: recognize and run properly on IVB) However, when running on IVB, turbostat would fail to report the new couters added with SNB, c7, pc2 and pc7. So in scenarios where these counters are non-zero on IVB, turbostat would report erroneous residencey results. In particular c7 time would be added to c1 time, since c1 time is calculated as "that which is left over". Also, turbostat reports MHz capabilities when passed the "-v" option, and it would incorrectly report 133MHz bclk instead of 100MHz bclk for IVB, which would inflate GHz reported with that option. This patch is a backport of a fix already included in turbostat v2. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d5d6a3dc9bac..16de7ad4850f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -933,6 +933,8 @@ int is_snb(unsigned int family, unsigned int model) switch (model) { case 0x2A: case 0x2D: + case 0x3A: /* IVB */ + case 0x3D: /* IVB Xeon */ return 1; } return 0; From 2f07a6134f670755e6ce657d019c26305bfcef89 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 5 Jan 2012 10:47:58 -0200 Subject: [PATCH 034/117] drivers: acpi: Fix dependency for ACPI_HOTPLUG_CPU Fix the following build warning: warning: (ACPI_HOTPLUG_CPU) selects ACPI_CONTAINER which has unmet direct dependencies (ACPI && EXPERIMENTAL) Signed-off-by: Fabio Estevam Signed-off-by: Len Brown --- drivers/acpi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 47768ff87343..80998958cf45 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -208,7 +208,7 @@ config ACPI_IPMI config ACPI_HOTPLUG_CPU bool - depends on ACPI_PROCESSOR && HOTPLUG_CPU + depends on EXPERIMENTAL && ACPI_PROCESSOR && HOTPLUG_CPU select ACPI_CONTAINER default y From 7ae30986dc63d214cb075a40f2cf205f0a7806cd Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 4 Jun 2012 00:29:11 -0400 Subject: [PATCH 035/117] ACPI: fix acpi_bus.h build warnings when ACPI is not enabled introduced in Linux-3.5-rc1 by 66886d6f8c9bcdee3d7fce5796dcffd6b4bc0b48 (ACPI: Add stubs for (un)register_acpi_bus_type) Fix header file warnings when CONFIG_ACPI is not enabled: include/acpi/acpi_bus.h:443:42: warning: 'struct acpi_bus_type' declared inside parameter list include/acpi/acpi_bus.h:443:42: warning: its scope is only this definition or declaration, which is probably not include/acpi/acpi_bus.h:444:44: warning: 'struct acpi_bus_type' declared inside parameter list Signed-off-by: Len Brown --- include/acpi/acpi_bus.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index b0d62820ada1..9e6e1c6eb60a 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -440,8 +440,8 @@ static inline int acpi_pm_device_sleep_wake(struct device *dev, bool enable) #else /* CONFIG_ACPI */ -static int register_acpi_bus_type(struct acpi_bus_type *bus) { return 0; } -static int unregister_acpi_bus_type(struct acpi_bus_type *bus) { return 0; } +static inline int register_acpi_bus_type(void *bus) { return 0; } +static inline int unregister_acpi_bus_type(void *bus) { return 0; } #endif /* CONFIG_ACPI */ From 71b43fd573a60972b2175df4927c4ee10d757004 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 17 May 2012 17:51:53 -0300 Subject: [PATCH 036/117] RDMA/cxgb4: Fix crash when peer address is 0.0.0.0 When using rping -c -a 0.0.0.0 with iw_cxgb4, the system crashes when rdma_connect() is called. ip_dev_find() will return NULL, but pdev is accessed anyway. Checking that pdev is NULL and returning -ENODEV prevents the system from crashing. Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 55ab284e22f2..b18870c455ad 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1593,6 +1593,10 @@ static int import_ep(struct c4iw_ep *ep, __be32 peer_ip, struct dst_entry *dst, struct net_device *pdev; pdev = ip_dev_find(&init_net, peer_ip); + if (!pdev) { + err = -ENODEV; + goto out; + } ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, n, pdev, 0); if (!ep->l2t) From 3aac6ff16a2097be668975fd51084df2e27e4999 Mon Sep 17 00:00:00 2001 From: Shlomo Pongratz Date: Thu, 24 May 2012 16:08:07 +0300 Subject: [PATCH 037/117] IB/mlx4: Fix EQ deallocation in legacy mode Commit e605b743f33d ("IB/mlx4: Increase the number of vectors (EQs) available for ULPs") didn't handle correctly the case where there aren't enough MSI-X vectors to increase the number of EQs, so only the legacy EQs are allocated. This results in an attempt to memset() to zero the EQ table which was never allocated and a kernel crash. Fix this by checking in the teardown flow if the table of EQs was ever allocated. Also remove some unneeded setting to zero of the EQ related fields in struct mlx4_ib_dev. Signed-off-by: Shlomo Pongratz Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ee1c577238f7..8afea12c8d44 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1084,12 +1084,9 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) int total_eqs = 0; int i, j, eq; - /* Init eq table */ - ibdev->eq_table = NULL; - ibdev->eq_added = 0; - - /* Legacy mode? */ - if (dev->caps.comp_pool == 0) + /* Legacy mode or comp_pool is not large enough */ + if (dev->caps.comp_pool == 0 || + dev->caps.num_ports > dev->caps.comp_pool) return; eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ @@ -1135,7 +1132,10 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; - int total_eqs; + + /* no additional eqs were added */ + if (!ibdev->eq_table) + return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; @@ -1148,12 +1148,7 @@ static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) mlx4_release_eq(dev, ibdev->eq_table[i]); } - total_eqs = dev->caps.num_comp_vectors + ibdev->eq_added; - memset(ibdev->eq_table, 0, total_eqs * sizeof(int)); kfree(ibdev->eq_table); - - ibdev->eq_table = NULL; - ibdev->eq_added = 0; } static void *mlx4_ib_add(struct mlx4_dev *dev) From c1bf94ec1e12d76838ad485158aecf208ebd8fb9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 May 2012 17:38:11 +0200 Subject: [PATCH 038/117] iommu/amd: Cache pdev pointer to root-bridge At some point pci_get_bus_and_slot started to enable interrupts. Since this function is used in the amd_iommu_resume path it will enable interrupts on resume which causes a warning. The fix will use a cached pointer to the root-bridge to re-enable the IOMMU in case the BIOS is broken. Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 13 +++++-------- drivers/iommu/amd_iommu_types.h | 3 +++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index c56790375e0f..542024ba6dba 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1029,6 +1029,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->dev) return 1; + iommu->root_pdev = pci_get_bus_and_slot(iommu->dev->bus->number, + PCI_DEVFN(0, 0)); + iommu->cap_ptr = h->cap_ptr; iommu->pci_seg = h->pci_seg; iommu->mmio_phys = h->mmio_phys; @@ -1323,20 +1326,16 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu) { int i, j; u32 ioc_feature_control; - struct pci_dev *pdev = NULL; + struct pci_dev *pdev = iommu->root_pdev; /* RD890 BIOSes may not have completely reconfigured the iommu */ - if (!is_rd890_iommu(iommu->dev)) + if (!is_rd890_iommu(iommu->dev) || !pdev) return; /* * First, we need to ensure that the iommu is enabled. This is * controlled by a register in the northbridge */ - pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); - - if (!pdev) - return; /* Select Northbridge indirect register 0x75 and enable writing */ pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); @@ -1346,8 +1345,6 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu) if (!(ioc_feature_control & 0x1)) pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); - pci_dev_put(pdev); - /* Restore the iommu BAR */ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, iommu->stored_addr_lo); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 2452f3b71736..24355559a2ad 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -481,6 +481,9 @@ struct amd_iommu { /* Pointer to PCI device of this IOMMU */ struct pci_dev *dev; + /* Cache pdev to root device for resume quirks */ + struct pci_dev *root_pdev; + /* physical address of MMIO space */ u64 mmio_phys; /* virtual address of MMIO space */ From eee53537c476c947bf7faa1c916d2f5a0ae8ec93 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 1 Jun 2012 15:20:23 +0200 Subject: [PATCH 039/117] iommu/amd: Fix deadlock in ppr-handling error path In the error path of the ppr_notifer it can happen that the iommu->lock is taken recursivly. This patch fixes the problem by releasing the iommu->lock before any notifier is invoked. This also requires to move the erratum workaround for the ppr-log (interrupt may be faster than data in the log) one function up. Cc: stable@vger.kernel.org # v3.3, v3.4 Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 71 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d90a421e9cac..a2e418cba0ff 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -547,26 +547,12 @@ static void iommu_poll_events(struct amd_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } -static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head) +static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw) { struct amd_iommu_fault fault; - volatile u64 *raw; - int i; INC_STATS_COUNTER(pri_requests); - raw = (u64 *)(iommu->ppr_log + head); - - /* - * Hardware bug: Interrupt may arrive before the entry is written to - * memory. If this happens we need to wait for the entry to arrive. - */ - for (i = 0; i < LOOP_TIMEOUT; ++i) { - if (PPR_REQ_TYPE(raw[0]) != 0) - break; - udelay(1); - } - if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n"); return; @@ -578,12 +564,6 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head) fault.tag = PPR_TAG(raw[0]); fault.flags = PPR_FLAGS(raw[0]); - /* - * To detect the hardware bug we need to clear the entry - * to back to zero. - */ - raw[0] = raw[1] = 0; - atomic_notifier_call_chain(&ppr_notifier, 0, &fault); } @@ -595,25 +575,62 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu) if (iommu->ppr_log == NULL) return; + /* enable ppr interrupts again */ + writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); + spin_lock_irqsave(&iommu->lock, flags); head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); while (head != tail) { + volatile u64 *raw; + u64 entry[2]; + int i; - /* Handle PPR entry */ - iommu_handle_ppr_entry(iommu, head); + raw = (u64 *)(iommu->ppr_log + head); - /* Update and refresh ring-buffer state*/ + /* + * Hardware bug: Interrupt may arrive before the entry is + * written to memory. If this happens we need to wait for the + * entry to arrive. + */ + for (i = 0; i < LOOP_TIMEOUT; ++i) { + if (PPR_REQ_TYPE(raw[0]) != 0) + break; + udelay(1); + } + + /* Avoid memcpy function-call overhead */ + entry[0] = raw[0]; + entry[1] = raw[1]; + + /* + * To detect the hardware bug we need to clear the entry + * back to zero. + */ + raw[0] = raw[1] = 0UL; + + /* Update head pointer of hardware ring-buffer */ head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + + /* + * Release iommu->lock because ppr-handling might need to + * re-aquire it + */ + spin_unlock_irqrestore(&iommu->lock, flags); + + /* Handle PPR entry */ + iommu_handle_ppr_entry(iommu, entry); + + spin_lock_irqsave(&iommu->lock, flags); + + /* Refresh ring-buffer information */ + head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); } - /* enable ppr interrupts again */ - writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); - spin_unlock_irqrestore(&iommu->lock, flags); } From 3eef8918ff440837f6af791942d8dd07e1a268ee Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 4 Jun 2012 17:05:40 +0100 Subject: [PATCH 040/117] drm/i915: Mark the ringbuffers as being in the GTT domain By correctly describing the rinbuffers as being in the GTT domain, it appears that we are more careful with the management of the CPU cache upon resume and so prevent some coherency issue when submitting commands to the GPU later. A secondary effect is that the debug logs are then consistent with the actual usage (i.e. they no longer describe the ringbuffers as being in the CPU write domain when we are accessing them through an wc iomapping.) Reported-and-tested-by: Daniel Gnoutcheff Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=41092 Signed-off-by: Chris Wilson Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 490df551e312..9fbad086cb4b 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -988,6 +988,10 @@ static int intel_init_ring_buffer(struct drm_device *dev, if (ret) goto err_unref; + ret = i915_gem_object_set_to_gtt_domain(obj, true); + if (ret) + goto err_unpin; + ring->virtual_start = ioremap_wc(dev->agp->base + obj->gtt_offset, ring->size); if (ring->virtual_start == NULL) { From b7884eb45ec98c0d34c7f49005ae9d4b4b4e38f6 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 4 Jun 2012 11:18:15 +0200 Subject: [PATCH 041/117] drm/i915: hold forcewake around ring hw init Empirical evidence suggests that we need to: On at least one ivb machine when running the hangman i-g-t test, the rings don't properly initialize properly - the RING_START registers seems to be stuck at all zeros. Holding forcewake around this register init sequences makes chip reset reliable again. Note that this is not the first such issue: commit f01db988ef6f6c70a6cc36ee71e4a98a68901229 Author: Sean Paul Date: Fri Mar 16 12:43:22 2012 -0400 drm/i915: Add wait_for in init_ring_common added delay loops to make RING_START and RING_CTL initialization reliable on the blt ring at boot-up. So I guess it won't hurt if we do this unconditionally for all force_wake needing gpus. To avoid copy&pasting of the HAS_FORCE_WAKE check I've added a new intel_info bit for that. v2: Fixup missing commas in static struct and properly handling the error case in init_ring_common, both noticed by Jani Nikula. Cc: stable@vger.kernel.org Reported-and-tested-by: Yang Guang Reviewed-by: Eugeni Dodonov Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=50522 Signed-Off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_drv.c | 13 +++++++++---- drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/intel_ringbuffer.c | 16 +++++++++++++--- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 238a52165833..9fe9ebe52a7a 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -233,6 +233,7 @@ static const struct intel_device_info intel_sandybridge_d_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_sandybridge_m_info = { @@ -243,6 +244,7 @@ static const struct intel_device_info intel_sandybridge_m_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_ivybridge_d_info = { @@ -252,6 +254,7 @@ static const struct intel_device_info intel_ivybridge_d_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_ivybridge_m_info = { @@ -262,6 +265,7 @@ static const struct intel_device_info intel_ivybridge_m_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_valleyview_m_info = { @@ -289,6 +293,7 @@ static const struct intel_device_info intel_haswell_d_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct intel_device_info intel_haswell_m_info = { @@ -298,6 +303,7 @@ static const struct intel_device_info intel_haswell_m_info = { .has_blt_ring = 1, .has_llc = 1, .has_pch_split = 1, + .has_force_wake = 1, }; static const struct pci_device_id pciidlist[] = { /* aka */ @@ -1139,10 +1145,9 @@ MODULE_LICENSE("GPL and additional rights"); /* We give fast paths for the really cool registers */ #define NEEDS_FORCE_WAKE(dev_priv, reg) \ - (((dev_priv)->info->gen >= 6) && \ - ((reg) < 0x40000) && \ - ((reg) != FORCEWAKE)) && \ - (!IS_VALLEYVIEW((dev_priv)->dev)) + ((HAS_FORCE_WAKE((dev_priv)->dev)) && \ + ((reg) < 0x40000) && \ + ((reg) != FORCEWAKE)) #define __i915_read(x, y) \ u##x i915_read##x(struct drm_i915_private *dev_priv, u32 reg) { \ diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 377c21f531e4..cff630757650 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -285,6 +285,7 @@ struct intel_device_info { u8 is_ivybridge:1; u8 is_valleyview:1; u8 has_pch_split:1; + u8 has_force_wake:1; u8 is_haswell:1; u8 has_fbc:1; u8 has_pipe_cxsr:1; @@ -1098,6 +1099,8 @@ struct drm_i915_file_private { #define HAS_PCH_CPT(dev) (INTEL_PCH_TYPE(dev) == PCH_CPT) #define HAS_PCH_IBX(dev) (INTEL_PCH_TYPE(dev) == PCH_IBX) +#define HAS_FORCE_WAKE(dev) (INTEL_INFO(dev)->has_force_wake) + #include "i915_trace.h" /** diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 9fbad086cb4b..e5b84ff89ca5 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -266,10 +266,15 @@ u32 intel_ring_get_active_head(struct intel_ring_buffer *ring) static int init_ring_common(struct intel_ring_buffer *ring) { - drm_i915_private_t *dev_priv = ring->dev->dev_private; + struct drm_device *dev = ring->dev; + drm_i915_private_t *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj = ring->obj; + int ret = 0; u32 head; + if (HAS_FORCE_WAKE(dev)) + gen6_gt_force_wake_get(dev_priv); + /* Stop the ring if it's running. */ I915_WRITE_CTL(ring, 0); I915_WRITE_HEAD(ring, 0); @@ -317,7 +322,8 @@ static int init_ring_common(struct intel_ring_buffer *ring) I915_READ_HEAD(ring), I915_READ_TAIL(ring), I915_READ_START(ring)); - return -EIO; + ret = -EIO; + goto out; } if (!drm_core_check_feature(ring->dev, DRIVER_MODESET)) @@ -329,7 +335,11 @@ static int init_ring_common(struct intel_ring_buffer *ring) ring->last_retired_head = -1; } - return 0; +out: + if (HAS_FORCE_WAKE(dev)) + gen6_gt_force_wake_put(dev_priv); + + return ret; } static int From fad0c66c4bb836d57a5f125ecd38bed653ca863a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 30 May 2012 10:54:57 -0700 Subject: [PATCH 042/117] timekeeping: Fix CLOCK_MONOTONIC inconsistency during leapsecond Commit 6b43ae8a61 (ntp: Fix leap-second hrtimer livelock) broke the leapsecond update of CLOCK_MONOTONIC. The missing leapsecond update to wall_to_monotonic causes discontinuities in CLOCK_MONOTONIC. Adjust wall_to_monotonic when NTP inserted a leapsecond. Reported-by: Richard Cochran Signed-off-by: John Stultz Tested-by: Richard Cochran Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1338400497-12420-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6e46cacf5969..6f46a00a1e8a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } /* Accumulate raw time */ @@ -1077,6 +1078,7 @@ static void update_wall_time(void) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } timekeeping_update(false); From 363b06aaa59fc20d0a9c5a5a9ce1fa2c45946700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 14 May 2012 11:08:51 +0900 Subject: [PATCH 043/117] drm/exynos: Use DRM_FORMAT_{NV12, YUV420} instead of DRM_FORMAT_{NV12M, YUV420M} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NV12M/YUV420M formats are identical to the already existing standard NV12/YUV420 formats. The M variants will be removed, so convert the driver to use the standard names. Signed-off-by: Ville Syrjälä Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_fb.h | 4 ++-- drivers/gpu/drm/exynos/exynos_mixer.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fb.h b/drivers/gpu/drm/exynos/exynos_drm_fb.h index 3ecb30d93552..50823756cdea 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fb.h +++ b/drivers/gpu/drm/exynos/exynos_drm_fb.h @@ -31,10 +31,10 @@ static inline int exynos_drm_format_num_buffers(uint32_t format) { switch (format) { - case DRM_FORMAT_NV12M: + case DRM_FORMAT_NV12: case DRM_FORMAT_NV12MT: return 2; - case DRM_FORMAT_YUV420M: + case DRM_FORMAT_YUV420: return 3; default: return 1; diff --git a/drivers/gpu/drm/exynos/exynos_mixer.c b/drivers/gpu/drm/exynos/exynos_mixer.c index 68ef01028375..5a46e583c5b5 100644 --- a/drivers/gpu/drm/exynos/exynos_mixer.c +++ b/drivers/gpu/drm/exynos/exynos_mixer.c @@ -365,7 +365,7 @@ static void vp_video_buffer(struct mixer_context *ctx, int win) switch (win_data->pixel_format) { case DRM_FORMAT_NV12MT: tiled_mode = true; - case DRM_FORMAT_NV12M: + case DRM_FORMAT_NV12: crcb_mode = false; buf_num = 2; break; From 13b87b27421e12f82ebbaac018cea30f82e5c33e Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Mon, 14 May 2012 20:04:38 +0900 Subject: [PATCH 044/117] drm/exynos: fixed size type. size type of drm_exynos_gem_mmap struct is changed to uint64_t and it adds pad for the struct to be aligned as 64bit. Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- include/drm/exynos_drm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/drm/exynos_drm.h b/include/drm/exynos_drm.h index b6d7ce92eadd..68733587e700 100644 --- a/include/drm/exynos_drm.h +++ b/include/drm/exynos_drm.h @@ -64,6 +64,7 @@ struct drm_exynos_gem_map_off { * A structure for mapping buffer. * * @handle: a handle to gem object created. + * @pad: just padding to be 64-bit aligned. * @size: memory size to be mapped. * @mapped: having user virtual address mmaped. * - this variable would be filled by exynos gem module @@ -72,7 +73,8 @@ struct drm_exynos_gem_map_off { */ struct drm_exynos_gem_mmap { unsigned int handle; - unsigned int size; + unsigned int pad; + uint64_t size; uint64_t mapped; }; From 293a1c128ecc523e9a74252ca64220d8081be759 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 May 2012 17:08:53 +0200 Subject: [PATCH 045/117] drm/exynos: DRIVER_BUS_PLATFORM is not a driver feature DRIVER_BUS_PLATFORM is a bus type used internally in the DRM core, not a flag for the drm_driver::driver_features field. Signed-off-by: Laurent Pinchart Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index 420953197d0a..d6de2e07fa03 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -244,8 +244,8 @@ static const struct file_operations exynos_drm_driver_fops = { }; static struct drm_driver exynos_drm_driver = { - .driver_features = DRIVER_HAVE_IRQ | DRIVER_BUS_PLATFORM | - DRIVER_MODESET | DRIVER_GEM | DRIVER_PRIME, + .driver_features = DRIVER_HAVE_IRQ | DRIVER_MODESET | + DRIVER_GEM | DRIVER_PRIME, .load = exynos_drm_load, .unload = exynos_drm_unload, .open = exynos_drm_open, From 6037bafa2e676162a86e4f4dee366e394565a0ee Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 May 2012 17:08:55 +0200 Subject: [PATCH 046/117] drm/exynos: Don't cast GEM object to Exynos GEM object when not needed The exynos_drm_gem_dumb_map_offset() doesn't need to access any Exynos-specific GEM object fields, don't cast the GEM object. Signed-off-by: Laurent Pinchart Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_gem.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index fc91293c4560..5c8b683029ea 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -689,7 +689,6 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle, uint64_t *offset) { - struct exynos_drm_gem_obj *exynos_gem_obj; struct drm_gem_object *obj; int ret = 0; @@ -710,15 +709,13 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, goto unlock; } - exynos_gem_obj = to_exynos_gem_obj(obj); - - if (!exynos_gem_obj->base.map_list.map) { - ret = drm_gem_create_mmap_offset(&exynos_gem_obj->base); + if (!obj->map_list.map) { + ret = drm_gem_create_mmap_offset(obj); if (ret) goto out; } - *offset = (u64)exynos_gem_obj->base.map_list.hash.key << PAGE_SHIFT; + *offset = (u64)obj->map_list.hash.key << PAGE_SHIFT; DRM_DEBUG_KMS("offset = 0x%lx\n", (unsigned long)*offset); out: From 07b6835f2c6bc3101ed7cf471f566d53319a6d50 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 May 2012 17:08:56 +0200 Subject: [PATCH 047/117] drm/exynos: Keep a reference to frame buffer GEM objects GEM objects used by frame buffers must be referenced for the whole life of the frame buffer. Release the references in the frame buffer destructor instead of its constructor. Signed-off-by: Laurent Pinchart Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_fb.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fb.c b/drivers/gpu/drm/exynos/exynos_drm_fb.c index f82a299553fb..4ccfe4328fab 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fb.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fb.c @@ -51,11 +51,22 @@ struct exynos_drm_fb { static void exynos_drm_fb_destroy(struct drm_framebuffer *fb) { struct exynos_drm_fb *exynos_fb = to_exynos_fb(fb); + unsigned int i; DRM_DEBUG_KMS("%s\n", __FILE__); drm_framebuffer_cleanup(fb); + for (i = 0; i < ARRAY_SIZE(exynos_fb->exynos_gem_obj); i++) { + struct drm_gem_object *obj; + + if (exynos_fb->exynos_gem_obj[i] == NULL) + continue; + + obj = &exynos_fb->exynos_gem_obj[i]->base; + drm_gem_object_unreference_unlocked(obj); + } + kfree(exynos_fb); exynos_fb = NULL; } @@ -134,11 +145,11 @@ exynos_user_fb_create(struct drm_device *dev, struct drm_file *file_priv, return ERR_PTR(-ENOENT); } - drm_gem_object_unreference_unlocked(obj); - fb = exynos_drm_framebuffer_init(dev, mode_cmd, obj); - if (IS_ERR(fb)) + if (IS_ERR(fb)) { + drm_gem_object_unreference_unlocked(obj); return fb; + } exynos_fb = to_exynos_fb(fb); nr = exynos_drm_format_num_buffers(fb->pixel_format); @@ -152,8 +163,6 @@ exynos_user_fb_create(struct drm_device *dev, struct drm_file *file_priv, return ERR_PTR(-ENOENT); } - drm_gem_object_unreference_unlocked(obj); - exynos_fb->exynos_gem_obj[i] = to_exynos_gem_obj(obj); } From f56fdcef4d8991b0906461fec6494d7f9d401ef3 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 May 2012 17:08:54 +0200 Subject: [PATCH 048/117] drm/exynos: Remove dummy encoder get_crtc operation implementation The encoder get_crtc operation is called to retrieve a pointer to the CRTC the encoder is currenctly connected to, right after setting the encoder::crtc field to the new CRTC. The implementation of this operation returns the pointer to the new CRTC, which is then pointlessly compared to itself. As the operation is not mandatory, don't implement it. Signed-off-by: Laurent Pinchart Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_encoder.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_encoder.c b/drivers/gpu/drm/exynos/exynos_drm_encoder.c index 6e9ac7bd1dcf..23d5ad379f86 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_encoder.c +++ b/drivers/gpu/drm/exynos/exynos_drm_encoder.c @@ -172,19 +172,12 @@ static void exynos_drm_encoder_commit(struct drm_encoder *encoder) manager_ops->commit(manager->dev); } -static struct drm_crtc * -exynos_drm_encoder_get_crtc(struct drm_encoder *encoder) -{ - return encoder->crtc; -} - static struct drm_encoder_helper_funcs exynos_encoder_helper_funcs = { .dpms = exynos_drm_encoder_dpms, .mode_fixup = exynos_drm_encoder_mode_fixup, .mode_set = exynos_drm_encoder_mode_set, .prepare = exynos_drm_encoder_prepare, .commit = exynos_drm_encoder_commit, - .get_crtc = exynos_drm_encoder_get_crtc, }; static void exynos_drm_encoder_destroy(struct drm_encoder *encoder) From 5736603bef2383b6bb07f88596ccc8c387d91121 Mon Sep 17 00:00:00 2001 From: Seung-Woo Kim Date: Tue, 15 May 2012 17:22:08 +0900 Subject: [PATCH 049/117] drm/exynos: fixed blending for hdmi graphic layer Blending for graphic layer 0 of hdmi mixer was not set so video layer cannot be showed if graphic layer 0 is enabled. This patch fixes blending values to support blending between graphic layer 0 and video layer. Signed-off-by: Seung-Woo Kim Signed-off-by: Kyungmin Park Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_mixer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_mixer.c b/drivers/gpu/drm/exynos/exynos_mixer.c index 5a46e583c5b5..e2147a2ddcec 100644 --- a/drivers/gpu/drm/exynos/exynos_mixer.c +++ b/drivers/gpu/drm/exynos/exynos_mixer.c @@ -601,18 +601,20 @@ static void mixer_win_reset(struct mixer_context *ctx) mixer_reg_write(res, MXR_BG_COLOR2, 0x008080); /* setting graphical layers */ - val = MXR_GRP_CFG_COLOR_KEY_DISABLE; /* no blank key */ val |= MXR_GRP_CFG_WIN_BLEND_EN; + val |= MXR_GRP_CFG_BLEND_PRE_MUL; + val |= MXR_GRP_CFG_PIXEL_BLEND_EN; val |= MXR_GRP_CFG_ALPHA_VAL(0xff); /* non-transparent alpha */ /* the same configuration for both layers */ mixer_reg_write(res, MXR_GRAPHIC_CFG(0), val); - - val |= MXR_GRP_CFG_BLEND_PRE_MUL; - val |= MXR_GRP_CFG_PIXEL_BLEND_EN; mixer_reg_write(res, MXR_GRAPHIC_CFG(1), val); + /* setting video layers */ + val = MXR_GRP_CFG_ALPHA_VAL(0); + mixer_reg_write(res, MXR_VIDEO_CFG, val); + /* configuration of Video Processor Registers */ vp_win_reset(ctx); vp_default_filter(res); From f1ea8b66e5bf85802ae59961981f5e0d61510b18 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 3 Jun 2012 01:49:32 -0700 Subject: [PATCH 050/117] [PARISC] fix missing TAINT_WARN problem Al viro broke us with commit edd63a2763bdae0daa4f0a4d4c5d61d1154352a5 Author: Al Viro Date: Fri Apr 27 13:42:45 2012 -0400 set_restore_sigmask() is never called without SIGPENDING (and never should be) Although it's pretty much our fault since parisc's asm/bug.h uses BUGWARN_TAINT but doesn't include the file that defines it. Fix that. Signed-off-by: James Bottomley --- arch/parisc/include/asm/bug.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h index 72cfdb0cfdd1..62a33338549c 100644 --- a/arch/parisc/include/asm/bug.h +++ b/arch/parisc/include/asm/bug.h @@ -1,6 +1,8 @@ #ifndef _PARISC_BUG_H #define _PARISC_BUG_H +#include /* for BUGFLAG_TAINT */ + /* * Tell the user there is some problem. * The offending file and line are encoded in the __bug_table section. From 731455624fe5af906877aae80fb107daccaa9599 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 5 Jun 2012 13:53:04 +0900 Subject: [PATCH 051/117] [PARISC] fix compile break in use of lib/strncopy_from_user.c Linus broke us with commit 36126f8f2ed8168eb13aa0662b9b9585cba100a9 Author: Linus Torvalds Date: Sat May 26 10:43:17 2012 -0700 word-at-a-time: make the interfaces truly generic By moving functions defined in strncopy_from_user.c into the asm-geneic version word-at-a-time.h. Spark and OpenRisc were fixed to use this, but not parisc. Fix by adding to generic-y in asm/Kbuild Signed-off-by: James Bottomley --- arch/parisc/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 19a434f55059..4383707d9801 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,3 +1,4 @@ include include/asm-generic/Kbuild.asm header-y += pdc.h +generic-y += word-at-a-time.h From 4c01acc01d77e8df5727247aa5ef5912c00256bf Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 5 Jun 2012 13:54:09 +0900 Subject: [PATCH 052/117] [PARISC] fix code to find libgcc Sam broke this with commit 1f2bfbd00e466ff3489b2ca5cc75b1cccd14c123 Author: Sam Ravnborg Date: Sat May 5 10:18:41 2012 +0200 kbuild: link of vmlinux moved to a script But we should be deriving the location of libgcc in the same way as all the other archs, so fix by adding a LIBGCC variable which is evaluated in the makefile Signed-off-by: James Bottomley --- arch/parisc/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index dbc3850b1d0d..5707f1a62341 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -21,6 +21,7 @@ KBUILD_DEFCONFIG := default_defconfig NM = sh $(srctree)/arch/parisc/nm CHECKFLAGS += -D__hppa__=1 +LIBGCC = $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) MACHINE := $(shell uname -m) ifeq ($(MACHINE),parisc*) @@ -79,7 +80,7 @@ kernel-y := mm/ kernel/ math-emu/ kernel-$(CONFIG_HPUX) += hpux/ core-y += $(addprefix arch/parisc/, $(kernel-y)) -libs-y += arch/parisc/lib/ `$(CC) -print-libgcc-file-name` +libs-y += arch/parisc/lib/ $(LIBGCC) drivers-$(CONFIG_OPROFILE) += arch/parisc/oprofile/ From cb05d8dedefa3066bf5d74ef88c6ca6cf4bd1c87 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 23 May 2012 14:02:00 +0200 Subject: [PATCH 053/117] drm/i915: fix up ivb plane 3 pageflips Or at least plug another gapping hole. Apparrently hw desingers only moved the bit field, but did not bother ot re-enumerate the planes when adding support for a 3rd pipe. Discovered by i-g-t/flip_test. This may or may not fix the reference bugzilla, because that one smells like we have still larger fish to fry. v2: Fixup the impossible case to catch programming errors, noticed by Chris Wilson. References: https://bugs.freedesktop.org/show_bug.cgi?id=50069 Acked-by: Chris Wilson Tested-by: Eugeni Dodonov Eugeni Dodonov Cc: stable@vger.kernel.org Signed-Off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_reg.h | 8 ++++++++ drivers/gpu/drm/i915/intel_display.c | 19 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 2d49b9507ed0..76bc2756d7c4 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -210,6 +210,14 @@ #define MI_DISPLAY_FLIP MI_INSTR(0x14, 2) #define MI_DISPLAY_FLIP_I915 MI_INSTR(0x14, 1) #define MI_DISPLAY_FLIP_PLANE(n) ((n) << 20) +/* IVB has funny definitions for which plane to flip. */ +#define MI_DISPLAY_FLIP_IVB_PLANE_A (0 << 19) +#define MI_DISPLAY_FLIP_IVB_PLANE_B (1 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_A (2 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19) +#define MI_DISPLAY_FLIP_IVB_PLANE_C (4 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19) + #define MI_SET_CONTEXT MI_INSTR(0x18, 0) #define MI_MM_SPACE_GTT (1<<8) #define MI_MM_SPACE_PHYSICAL (0<<8) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 914789420906..e0aa064def31 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -6158,17 +6158,34 @@ static int intel_gen7_queue_flip(struct drm_device *dev, struct drm_i915_private *dev_priv = dev->dev_private; struct intel_crtc *intel_crtc = to_intel_crtc(crtc); struct intel_ring_buffer *ring = &dev_priv->ring[BCS]; + uint32_t plane_bit = 0; int ret; ret = intel_pin_and_fence_fb_obj(dev, obj, ring); if (ret) goto err; + switch(intel_crtc->plane) { + case PLANE_A: + plane_bit = MI_DISPLAY_FLIP_IVB_PLANE_A; + break; + case PLANE_B: + plane_bit = MI_DISPLAY_FLIP_IVB_PLANE_B; + break; + case PLANE_C: + plane_bit = MI_DISPLAY_FLIP_IVB_PLANE_C; + break; + default: + WARN_ONCE(1, "unknown plane in flip command\n"); + ret = -ENODEV; + goto err; + } + ret = intel_ring_begin(ring, 4); if (ret) goto err_unpin; - intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | (intel_crtc->plane << 19)); + intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit); intel_ring_emit(ring, (fb->pitches[0] | obj->tiling_mode)); intel_ring_emit(ring, (obj->gtt_offset)); intel_ring_emit(ring, (MI_NOOP)); From 958fb3c51295764599d6abce87e1a01ace897a3e Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Tue, 5 Jun 2012 10:35:02 +0800 Subject: [PATCH 054/117] x86/mce: Fix the MCE poll timer logic In commit 82f7af09 ("x86/mce: Cleanup timer mess), Thomas just forgot the "/ 2" there while cleaning up. Signed-off-by: Chen Gong Acked-by: Thomas Gleixner Cc: bp@amd64.org Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1338863702-9245-1-git-send-email-gong.chen@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0a687fd185e6..a97f3c4a3946 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1274,7 +1274,7 @@ static void mce_timer_fn(unsigned long data) */ iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - iv = max(iv, (unsigned long) HZ/100); + iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); __this_cpu_write(mce_next_interval, iv); From 436d03faf6961b30e13b2d0967aea9d772d6cf44 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 00:09:11 +0900 Subject: [PATCH 055/117] x86/decoder: Fix bsr/bsf/jmpe decoding with operand-size prefix Fix the x86 instruction decoder to decode bsr/bsf/jmpe with operand-size prefix (66h). This fixes the test case failure reported by Linus, attached below. bsf/bsr/jmpe have a special encoding. Opcode map in Intel Software Developers Manual vol2 says they have TZCNT/LZCNT variants if it has F3h prefix. However, there is no information if it has other 66h or F2h prefixes. Current instruction decoder supposes that those are bad instructions, but it actually accepts at least operand-size prefixes. H. Peter Anvin further explains: " TZCNT/LZCNT are F3 + BSF/BSR exactly because the F2 and F3 prefixes have historically been no-ops with most instructions. This allows software to unconditionally use the prefixed versions and get TZCNT/LZCNT on the processors that have them if they don't care about the difference. " This fixes errors reported by test_get_len: Warning: arch/x86/tools/test_get_len found difference at :ffffffff81036d87 Warning: ffffffff81036de5: 66 0f bc c2 bsf %dx,%ax Warning: objdump says 4 bytes, but insn_get_length() says 3 Warning: arch/x86/tools/test_get_len found difference at :ffffffff81036ea6 Warning: ffffffff81036f04: 66 0f bd c2 bsr %dx,%ax Warning: objdump says 4 bytes, but insn_get_length() says 3 Warning: decoded and checked 13298882 instructions with 2 warnings Reported-by: Linus Torvalds Reported-by: Pekka Enberg Signed-off-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Link: http://lkml.kernel.org/r/20120604150911.22338.43296.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 8 ++++---- arch/x86/tools/gen-insn-attr-x86.awk | 14 +++++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 819137904428..5d7e51f3fd28 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -28,7 +28,7 @@ # - (66): the last prefix is 0x66 # - (F3): the last prefix is 0xF3 # - (F2): the last prefix is 0xF2 -# +# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) Table: one byte opcode Referrer: @@ -515,12 +515,12 @@ b4: LFS Gv,Mp b5: LGS Gv,Mp b6: MOVZX Gv,Eb b7: MOVZX Gv,Ew -b8: JMPE | POPCNT Gv,Ev (F3) +b8: JMPE (!F3) | POPCNT Gv,Ev (F3) b9: Grp10 (1A) ba: Grp8 Ev,Ib (1A) bb: BTC Ev,Gv -bc: BSF Gv,Ev | TZCNT Gv,Ev (F3) -bd: BSR Gv,Ev | LZCNT Gv,Ev (F3) +bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3) +bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3) be: MOVSX Gv,Eb bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 5f6a5b6c3a15..ddcf39b1a18d 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -66,9 +66,10 @@ BEGIN { rex_expr = "^REX(\\.[XRWB]+)*" fpu_expr = "^ESC" # TODO - lprefix1_expr = "\\(66\\)" + lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\(F2\\)" + lprefix3_expr = "\\((F2|!F3)\\)" + lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 # All opcodes starting with lower-case 'v' or with (v1) superscript @@ -333,13 +334,16 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, lprefix1_expr)) { lptable1[idx] = add_flags(lptable1[idx],flags) variant = "INAT_VARIANT" - } else if (match(ext, lprefix2_expr)) { + } + if (match(ext, lprefix2_expr)) { lptable2[idx] = add_flags(lptable2[idx],flags) variant = "INAT_VARIANT" - } else if (match(ext, lprefix3_expr)) { + } + if (match(ext, lprefix3_expr)) { lptable3[idx] = add_flags(lptable3[idx],flags) variant = "INAT_VARIANT" - } else { + } + if (!match(ext, lprefix_expr)){ table[idx] = add_flags(table[idx],flags) } } From 1a87fc1ec7b05b9bc60df9dc52297d4c225d7f1a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Jun 2012 11:33:21 +0200 Subject: [PATCH 056/117] x86: mce: Add the dropped timer interval init back commit 82f7af09 ("x86/mce: Cleanup timer mess) dropped the initialization of the per cpu timer interval. Duh :( Restore the previous behaviour. Reported-by: Chen Gong Cc: bp@amd64.org Cc: tony.luck@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a97f3c4a3946..da27c5d2168a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1557,7 +1557,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); + unsigned long iv = check_interval * HZ; setup_timer(t, mce_timer_fn, smp_processor_id()); From aff5a62d52ff03956ff6992b9fe4b561fd855804 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 5 Jun 2012 15:00:31 -0400 Subject: [PATCH 057/117] x86/gart: Fix kmemleak warning aperture_64.c now is using memblock, the previous kmemleak_ignore() for alloc_bootmem() should be removed then. Otherwise, with kmemleak enabled, kernel will throw warnings like: [ 0.000000] kmemleak: Trying to color unknown object at 0xffff8800c4000000 as Black [ 0.000000] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc1-next-20120605+ #130 [ 0.000000] Call Trace: [ 0.000000] [] paint_ptr+0x66/0xc0 [ 0.000000] [] kmemleak_ignore+0x2b/0x60 [ 0.000000] [] kmemleak_init+0x217/0x2c1 [ 0.000000] [] start_kernel+0x32d/0x3eb [ 0.000000] [] ? repair_env_string+0x5a/0x5a [ 0.000000] [] x86_64_start_reservations+0x131/0x135 [ 0.000000] [] ? early_idt_handlers+0x120/0x120 [ 0.000000] [] x86_64_start_kernel+0x102/0x111 [ 0.000000] kmemleak: Early log backtrace: [ 0.000000] [] kmemleak_ignore+0x4b/0x60 [ 0.000000] [] gart_iommu_hole_init+0x3e7/0x547 [ 0.000000] [] pci_iommu_alloc+0x44/0x6f [ 0.000000] [] mem_init+0x19/0xec [ 0.000000] [] start_kernel+0x1ea/0x3eb [ 0.000000] [] x86_64_start_reservations+0x131/0x135 [ 0.000000] [] x86_64_start_kernel+0x102/0x111 [ 0.000000] [] 0xffffffffffffffff Signed-off-by: Xiaotian Feng Cc: Xiaotian Feng Cc: Tejun Heo Link: http://lkml.kernel.org/r/1338922831-2847-1-git-send-email-xtfeng@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e76c191a835..d5fd66f0d4cd 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void) return 0; } memblock_reserve(addr, aper_size); - /* - * Kmemleak should not scan this block as it may not be mapped via the - * kernel direct mapping. - */ - kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, addr); insert_aperture_resource((u32)addr, aper_size); From 4af463d28f1a026e25c0b879fac2a0d2b7bff599 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 4 Jun 2012 11:42:32 +0900 Subject: [PATCH 058/117] x86/numa: Set numa_nodes_parsed at acpi_numa_memory_affinity_init() When hot-adding a CPU, the system outputs following messages since node_to_cpumask_map[2] was not allocated memory. Booting Node 2 Processor 32 APIC 0xc0 node_to_cpumask_map[2] NULL Pid: 0, comm: swapper/32 Tainted: G A 3.3.5-acd #21 Call Trace: [] debug_cpumask_set_cpu+0x155/0x160 [] ? add_timer_on+0xaa/0x120 [] numa_add_cpu+0x1e/0x22 [] identify_cpu+0x1df/0x1e4 [] identify_econdary_cpu+0x16/0x1d [] smp_store_cpu_info+0x3c/0x3e [] smp_callin+0x139/0x1be [] start_secondary+0x13/0xeb The reason is that the bit of node 2 was not set at numa_nodes_parsed. numa_nodes_parsed is set by only acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init. Thus even if hot-added memory which is same PXM as hot-added CPU is written in ACPI SRAT Table, if the hot-added CPU is not written in ACPI SRAT table, numa_nodes_parsed is not set. But according to ACPI Spec Rev 5.0, it says about ACPI SRAT table as follows: This optional table provides information that allows OSPM to associate processors and memory ranges, including ranges of memory provided by hot-added memory devices, with system localities / proximity domains and clock domains. It means that ACPI SRAT table only provides information for CPUs present at boot time and for memory including hot-added memory. So hot-added memory is written in ACPI SRAT table, but hot-added CPU is not written in it. Thus numa_nodes_parsed should be set by not only acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init but also acpi_numa_memory_affinity_init for the case. Additionally, if system has cpuless memory node, acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init cannot set numa_nodes_parseds since these functions cannot find cpu description for the node. In this case, numa_nodes_parsed needs to be set by acpi_numa_memory_affinity_init. Signed-off-by: Yasuaki Ishimatsu Acked-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: liuj97@gmail.com Cc: kosaki.motohiro@gmail.com Link: http://lkml.kernel.org/r/4FCC2098.4030007@jp.fujitsu.com [ merged it ] Signed-off-by: Ingo Molnar --- arch/x86/mm/srat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 732af3a96183..4599c3e8bcb6 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -176,6 +176,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } + node_set(node, numa_nodes_parsed); + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); From 7071f6b2889bb41bea61891d8a3e6e70517ef5e6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 31 May 2012 23:20:25 +0200 Subject: [PATCH 059/117] x86/intel/moorestown: Change intel_scu_devices_create() to __devinit The allmodconfig hits: WARNING: vmlinux.o(.text+0x6553d): Section mismatch in reference from the function intel_scu_devices_create() to the function .devinit.text: spi_register_board_info() [...] This patch marks intel_scu_devices_create() as devinit because it only calls a devinit function, spi_register_board_info(). Signed-off-by: Sebastian Andrzej Siewior Cc: Alan Cox Cc: Kirill A. Shutemov Cc: Mika Westerberg Cc: Samuel Ortiz Cc: Feng Tang Link: http://lkml.kernel.org/r/20120531212025.GA8519@breakpoint.cc Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index e31bcd8f2eee..fd41a9262d65 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -782,7 +782,7 @@ BLOCKING_NOTIFIER_HEAD(intel_scu_notifier); EXPORT_SYMBOL_GPL(intel_scu_notifier); /* Called by IPC driver */ -void intel_scu_devices_create(void) +void __devinit intel_scu_devices_create(void) { int i; From 55c844a4dd16a4d1fdc0cf2a283ec631a02ec448 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 30 May 2012 23:15:41 +0800 Subject: [PATCH 060/117] x86/reboot: Fix a warning message triggered by stop_other_cpus() When rebooting our 24 CPU Westmere servers with 3.4-rc6, we always see this warning msg: Restarting system. machine restart ------------[ cut here ]------------ WARNING: at arch/x86/kernel/smp.c:125 native_smp_send_reschedule+0x74/0xa7() Hardware name: X8DTN Modules linked in: igb [last unloaded: scsi_wait_scan] Pid: 1, comm: systemd-shutdow Not tainted 3.4.0-rc6+ #22 Call Trace: [] warn_slowpath_common+0x7e/0x96 [] warn_slowpath_null+0x15/0x17 [] native_smp_send_reschedule+0x74/0xa7 [] trigger_load_balance+0x279/0x2a6 [] scheduler_tick+0xe0/0xe9 [] update_process_times+0x60/0x70 [] tick_sched_timer+0x68/0x92 [] __run_hrtimer+0xb3/0x13c [] ? tick_nohz_handler+0xd0/0xd0 [] hrtimer_interrupt+0xdb/0x198 [] smp_apic_timer_interrupt+0x81/0x94 [] apic_timer_interrupt+0x67/0x70 [] ? default_send_IPI_mask_allbutself_phys+0xb4/0xc4 [] physflat_send_IPI_allbutself+0x12/0x14 [] native_nmi_stop_other_cpus+0x8a/0xd6 [] native_machine_shutdown+0x50/0x67 [] machine_shutdown+0xa/0xc [] native_machine_restart+0x20/0x32 [] machine_restart+0xa/0xc [] kernel_restart+0x47/0x4c [] sys_reboot+0x13e/0x17c [] ? _raw_spin_unlock_bh+0x10/0x12 [] ? bdi_queue_work+0xcf/0xd8 [] ? __bdi_start_writeback+0xae/0xb7 [] ? iterate_supers+0xa3/0xb7 [] system_call_fastpath+0x16/0x1b ---[ end trace 320af5cb1cb60c5b ]--- The root cause seems to be the default_send_IPI_mask_allbutself_phys() takes quite some time (I measured it could be several ms) to complete sending NMIs to all the other 23 CPUs, and for HZ=250/1000 system, the time is long enough for a timer interrupt to happen, which will in turn trigger to kick load balance to a stopped CPU and cause this warning in native_smp_send_reschedule(). So disabling the local irq before stop_other_cpu() can fix this problem (tested 25 times reboot ok), and it is fine as there should be nobody caring the timer interrupt in such reboot stage. The latest 3.4 kernel slightly changes this behavior by sending REBOOT_VECTOR first and only send NMI_VECTOR if the REBOOT_VCTOR fails, and this patch is still needed to prevent the problem. Signed-off-by: Feng Tang Acked-by: Don Zickus Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120530231541.4c13433a@feng-i7 Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af81604..25b48edb847c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -639,9 +639,11 @@ void native_machine_shutdown(void) set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); /* - * O.K Now that I'm on the appropriate processor, - * stop all of the others. + * O.K Now that I'm on the appropriate processor, stop all of the + * others. Also disable the local irq to not receive the per-cpu + * timer interrupt which may trigger scheduler's load balance. */ + local_irq_disable(); stop_other_cpus(); #endif From f6175f5bfb4c9f2ed32758c95f765b529b1a7f15 Mon Sep 17 00:00:00 2001 From: Tomoki Sekiyama Date: Mon, 28 May 2012 18:09:18 +0900 Subject: [PATCH 061/117] x86/ioapic: Fix NULL pointer dereference on CPU hotplug after disabling irqs In current Linux, percpu variable `vector_irq' is not cleared on offlined cpus while disabling devices' irqs. If the cpu that has the disabled irqs in vector_irq is hotplugged, __setup_vector_irq() hits invalid irq vector and may crash. This bug can be reproduced as following; # echo 0 > /sys/devices/system/cpu/cpu7/online # modprobe -r some_driver_using_interrupts # vector_irq@cpu7 uncleared # echo 1 > /sys/devices/system/cpu/cpu7/online # kernel may crash This patch fixes this bug by clearing vector_irq in __clear_irq_vector() even if the cpu is offlined. Signed-off-by: Tomoki Sekiyama Acked-by: Thomas Gleixner Cc: yrl.pp-manager.tt@hitachi.com Cc: ltc-kernel@ml.yrl.intra.hitachi.co.jp Cc: Suresh Siddha Cc: Yinghai Lu Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/4FC340BE.7080101@hitachi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac96561d1a99..5f0ff597437c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1195,7 +1195,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) BUG_ON(!cfg->vector); vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + for_each_cpu(cpu, cfg->domain) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; @@ -1203,7 +1203,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu(cpu, cfg->old_domain) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) From ceb1cbac8eda66cf0f889def226b4e82f8ff857b Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 31 May 2012 13:07:38 +0530 Subject: [PATCH 062/117] sched/x86: Calculate booted cores after construction of sibling_mask Commit 316ad248307fb ("sched/x86: Rewrite set_cpu_sibling_map()") broke the booted_cores accounting. The problem is that the booted_cores accounting needs all the sibling links set up. So restore the second loop and add a comment as to why its needed. On qemu booted with -smp sockets=1,cores=2,threads=2; Before: $ grep cores /proc/cpuinfo cpu cores : 2 cpu cores : 1 cpu cores : 4 cpu cores : 3 With the patch: $ grep cores /proc/cpuinfo cpu cores : 2 cpu cores : 2 cpu cores : 2 cpu cores : 2 Reported-by: Prarit Bhargava Reported-by: Borislav Petkov Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120531073738.GH7511@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fd019d78b1f4..3fab55bea29b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -382,6 +382,15 @@ void __cpuinit set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mc && match_llc(c, o))) link_mask(llc_shared, cpu, i); + } + + /* + * This needs a separate iteration over the cpus because we rely on all + * cpu_sibling_mask links to be set-up. + */ + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + if ((i == cpu) || (has_mc && match_mc(c, o))) { link_mask(core, cpu, i); From 10717dcde10d09f9fcee53a12a4236af1a82b484 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 6 Jun 2012 14:52:51 +0800 Subject: [PATCH 063/117] sched/numa: Load balance between remote nodes Commit cb83b629b ("sched/numa: Rewrite the CONFIG_NUMA sched domain support") removed the NODE sched domain and started checking if the node distance in SLIT table is farther than REMOTE_DISTANCE, if so, it will lose the load balance chance at exec/fork/wake_affine points. But actually, even the node distance is farther than REMOTE_DISTANCE. Modern CPUs also has QPI like connections, which ensures that memory access is not too slow between nodes. So the above change in behavior on NUMA machine causes a performance regression on various benchmarks: hackbench, tbench, netperf, oltp, etc. This patch will recover the scheduler behavior to old mode on all my Intel platforms: NHM EP/EX, WSM EP, SNB EP/EP4S, and thus fixes the perfromance regressions. (all of them just have 2 kinds distance, 10, 21) Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338965571-9812-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c46958e26121..6546083af3e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6321,7 +6321,7 @@ static int sched_domains_curr_level; static inline int sd_local_flags(int level) { - if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) return 0; return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; From 7f1b43936f0ecad14770634c021cf4a929aec74d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 21:19:46 +0200 Subject: [PATCH 064/117] sched/rt: Fix lockdep annotation within find_lock_lowest_rq() Roland Dreier reported spurious, hard to trigger lockdep warnings within the scheduler - without any real lockup. This bit gives us the right clue: > [89945.640512] [] double_lock_balance+0x5a/0x90 > [89945.640568] [] push_rt_task+0xc6/0x290 if you look at that code you'll find the double_lock_balance() in question is the one in find_lock_lowest_rq() [yay for inlining]. Now find_lock_lowest_rq() has a bug.. it fails to use double_unlock_balance() in one exit path, if this results in a retry in push_rt_task() we'll call double_lock_balance() again, at which point we'll run into said lockdep confusion. Reported-by: Roland Dreier Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1337282386.4281.77.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2a4e8dffbd6b..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1562,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->on_rq)) { - raw_spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; break; } From c1174876874dcf8986806e4dad3d7d07af20b439 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 14:47:33 +0200 Subject: [PATCH 065/117] sched: Fix domain iteration Weird topologies can lead to asymmetric domain setups. This needs further consideration since these setups are typically non-minimal too. For now, make it work by adding an extra mask selecting which CPUs are allowed to iterate up. The topology that triggered it is the one from David Rientjes: 10 20 20 30 20 10 20 20 20 20 10 20 30 20 20 10 resulting in boxes that wouldn't even boot. Reported-by: David Rientjes Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3p86l9cuaqnxz7uxsojmz5rm@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++ kernel/sched/core.c | 64 +++++++++++++++++++++++++++++++++++++------ kernel/sched/fair.c | 5 ++-- kernel/sched/sched.h | 2 ++ 4 files changed, 72 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..ac321d753470 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -876,6 +876,8 @@ struct sched_group_power { * Number of busy cpus in this group. */ atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ }; struct sched_group { @@ -900,6 +902,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + /** * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. * @group: The group whose first cpu is to be returned. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6546083af3e0..781acb91a50a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5994,6 +5994,44 @@ struct sched_domain_topology_level { struct sd_data data; }; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -6012,6 +6050,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); @@ -6019,8 +6063,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); @@ -6030,13 +6072,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); sg->sgp = *per_cpu_ptr(sdd->sgp, i); - atomic_inc(&sg->sgp->ref); + if (atomic_inc_return(&sg->sgp->ref) == 1) + build_group_mask(sd, sg); + + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - cpumask_first(sg_span) == cpu) { - WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); + group_balance_cpu(sg) == cpu) groups = sg; - } if (!first) first = sg; @@ -6109,6 +6156,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; + cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -6150,7 +6198,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sg = sg->next; } while (sg != sd->groups); - if (cpu != group_first_cpu(sg)) + if (cpu != group_balance_cpu(sg)) return; update_group_power(sd, cpu); @@ -6525,7 +6573,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power), + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sgp) return -ENOMEM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27b..54cbaa4e7b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, int i; if (local_group) - balance_cpu = group_first_cpu(group); + balance_cpu = group_balance_cpu(group); /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; @@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { + if (idle_cpu(i) && !first_idle_cpu && + cpumask_test_cpu(i, sched_group_mask(group))) { first_idle_cpu = 1; balance_cpu = i; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24ce..6d52cea7f33d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +extern int group_balance_cpu(struct sched_group *sg); + #endif /* CONFIG_SMP */ #include "stats.h" From c3decf0dfbc95736b7c0ab68fa4e5854c4734da9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 12:05:32 +0200 Subject: [PATCH 066/117] sched: Always initialize cpu-power Often when we run into mis-shapen topologies the balance iteration fails to update the cpu power properly and we'll end up in /0 traps. Always initialize the cpu-power to a semi-sane value so that we can at least boot the machine, even if the load-balancer might not function correctly. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3lbhyj25sr169ha7z3qht5na@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++++++++++- kernel/sched/fair.c | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 781acb91a50a..725ee7c1c8cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5604,7 +5604,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->sgp->power) { + /* + * Even though we initialize ->power to something semi-sane, + * we leave power_orig unset. This allows us to detect if + * domain iteration is still funny without causing /0 traps. + */ + if (!group->sgp->power_orig) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6075,6 +6080,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (atomic_inc_return(&sg->sgp->ref) == 1) build_group_mask(sd, sg); + /* + * Initialize sgp->power such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 54cbaa4e7b37..c9fd6d673d05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3602,7 +3602,7 @@ void update_group_power(struct sched_domain *sd, int cpu) } while (group != child->groups); } - sdg->sgp->power = power; + sdg->sgp->power_orig = sdg->sgp->power = power; } /* From d039ac60800fe8ed8522ec3b9ca796aaf748c18b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 21:20:16 +0200 Subject: [PATCH 067/117] sched: Validate assumptions in sched_init_numa() Add some code to validate assumptions we're making and output warnings if they are not. If this trigger we want to know about it. Signed-off-by: Peter Zijlstra Cc: Alex Shi Link: http://lkml.kernel.org/n/tip-6uc3wk5s9udxtdl9cnku0vtt@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 99 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 19 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 725ee7c1c8cf..2bdd17616437 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_domain_debug_enabled; +static __read_mostly int sched_debug_enabled; -static int __init sched_domain_debug_setup(char *str) +static int __init sched_debug_setup(char *str) { - sched_domain_debug_enabled = 1; + sched_debug_enabled = 1; return 0; } -early_param("sched_debug", sched_domain_debug_setup); +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) @@ -5657,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_domain_debug_enabled) + if (!sched_debug_enabled) return; if (!sd) { @@ -5678,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} #endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) @@ -6373,7 +6382,6 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol #ifdef CONFIG_NUMA static int sched_domains_numa_levels; -static int sched_domains_numa_scale; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; @@ -6438,6 +6446,42 @@ static const struct cpumask *sd_numa_mask(int cpu) return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; } +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6445,7 +6489,6 @@ static void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_scale = curr_distance; sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); if (!sched_domains_numa_distance) return; @@ -6456,23 +6499,41 @@ static void sched_init_numa(void) * * Assumes node_distance(0,j) includes all distances in * node_distance(i,j) in order to avoid cubic time. - * - * XXX: could be optimized to O(n log n) by using sort() */ next_distance = curr_distance; for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - int distance = node_distance(0, j); - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; } /* * 'level' contains the number of unique distances, excluding the From b430f7c4706aeba4270c7ab7744fc504b9315e1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 15:30:31 +0200 Subject: [PATCH 068/117] perf/x86: Fix Intel shared extra MSR allocation Zheng Yan reported that event group validation can wreck event state when Intel extra_reg allocation changes event state. Validation shouldn't change any persistent state. Cloning events in validate_{event,group}() isn't really pretty either, so add a few special cases to avoid modifying the event state. The code is restructured to minimize the special case impact. Reported-by: Zheng Yan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338903031.28282.175.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 92 ++++++++++++++++++-------- 3 files changed, 66 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da0183..cb608383e4f6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) if (!cpuc->shared_regs) goto error; } + cpuc->is_fake = 1; return cpuc; error: free_fake_cpuc(cpuc); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6638aaf54493..83794d8e6af0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -117,6 +117,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ unsigned int group_flag; + int is_fake; /* * Intel DebugStore bits diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6aef..965baa2fa790 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +static int intel_alt_er(int idx) { if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) - return false; + return idx; - if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; - event->hw.extra_reg.idx = EXTRA_REG_RSP_1; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; event->hw.config |= 0x01b7; - event->hw.extra_reg.idx = EXTRA_REG_RSP_0; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } - - if (event->hw.extra_reg.idx == orig_idx) - return false; - - return true; } /* @@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, struct event_constraint *c = &emptyconstraint; struct er_account *era; unsigned long flags; - int orig_idx = reg->idx; + int idx = reg->idx; - /* already allocated shared msr */ - if (reg->alloc) + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) return NULL; /* call x86_get_event_constraint() */ again: - era = &cpuc->shared_regs->regs[reg->idx]; + era = &cpuc->shared_regs->regs[idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when * passing a fake cpuc @@ -1173,6 +1183,29 @@ again: if (!atomic_read(&era->ref) || era->config == reg->config) { + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + /* lock in msr value */ era->config = reg->config; era->reg = reg->reg; @@ -1180,17 +1213,17 @@ again: /* one more user */ atomic_inc(&era->ref); - /* no need to reallocate during incremental event scheduling */ - reg->alloc = 1; - /* * need to call x86_get_event_constraint() * to check if associated event has constraints */ c = NULL; - } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, struct er_account *era; /* - * only put constraint if extra reg was actually - * allocated. Also takes care of event which do - * not use an extra shared reg + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. */ - if (!reg->alloc) + if (!reg->alloc || cpuc->is_fake) return; era = &cpuc->shared_regs->regs[reg->idx]; From cccb9ba9e4ee0d750265f53de9258df69655c40b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 069/117] perf/x86: Implement cycles:p for SNB/IVB Now that there's finally a chip with working PEBS (IvyBridge), we can enable the hardware and implement cycles:p for SNB/IVB. Cc: Stephane Eranian Requested-and-tested-by: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 50 +++++++++++++++++++++----- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 83794d8e6af0..7241e2fc3c17 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -365,6 +365,7 @@ struct x86_pmu { int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 965baa2fa790..2312c1ff1b19 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1336,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_shared_regs_event_constraints(cpuc, event); } -static int intel_pmu_hw_config(struct perf_event *event) +static void intel_pebs_aliases_core2(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip && - (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. @@ -1365,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event) */ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); if (intel_pmu_needs_lbr_smpl(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -1643,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, @@ -1885,6 +1918,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; From b6db437ba8322f5cee0bd355ad2ef9f73c413754 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 070/117] perf/x86: Enable/Add IvyBridge hardware support Implement rudimentary IVB perf support. The SDM states its identical to SNB with exception of the exact event tables, but a quick look suggests they're similar enough. Also mark SNB-EP as broken for now. Requested-and-tested-by: Linus Torvalds Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2312c1ff1b19..187c294bc658 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1909,8 +1909,9 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); From 8440ccb43fc0ecffcf1acee0273d766e6a8cd51d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: [PATCH 071/117] perf/x86: Update SNB PEBS constraints Afaict there's no need to (incompletely) iterate the MEM_UOPS_RETIRED.* umask state. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 5a3edc27f6e5..35e2192df9f4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ - INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ - INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ - INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ From 67384fe3fd450536342330f684ea1f7dcaef8130 Mon Sep 17 00:00:00 2001 From: Eugeni Dodonov Date: Wed, 6 Jun 2012 11:59:06 -0300 Subject: [PATCH 072/117] char/agp: add another Ironlake host bridge This seems to come on Gigabyte H55M-S2V and was discovered through the https://bugs.freedesktop.org/show_bug.cgi?id=50381 debugging. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=50381 Signed-off-by: Eugeni Dodonov Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter --- drivers/char/agp/intel-agp.c | 1 + drivers/char/agp/intel-agp.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 764f70c5e690..0a4185279417 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -898,6 +898,7 @@ static struct pci_device_id agp_intel_pci_table[] = { ID(PCI_DEVICE_ID_INTEL_B43_HB), ID(PCI_DEVICE_ID_INTEL_B43_1_HB), ID(PCI_DEVICE_ID_INTEL_IRONLAKE_D_HB), + ID(PCI_DEVICE_ID_INTEL_IRONLAKE_D2_HB), ID(PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB), ID(PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB), ID(PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB), diff --git a/drivers/char/agp/intel-agp.h b/drivers/char/agp/intel-agp.h index c0091753a0d1..8e2d9140f300 100644 --- a/drivers/char/agp/intel-agp.h +++ b/drivers/char/agp/intel-agp.h @@ -212,6 +212,7 @@ #define PCI_DEVICE_ID_INTEL_G41_HB 0x2E30 #define PCI_DEVICE_ID_INTEL_G41_IG 0x2E32 #define PCI_DEVICE_ID_INTEL_IRONLAKE_D_HB 0x0040 +#define PCI_DEVICE_ID_INTEL_IRONLAKE_D2_HB 0x0069 #define PCI_DEVICE_ID_INTEL_IRONLAKE_D_IG 0x0042 #define PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB 0x0044 #define PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB 0x0062 From a841f8cef4bb124f0f5563314d0beaf2e1249d72 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 5 Jun 2012 13:44:36 -0500 Subject: [PATCH 073/117] sched: Fix the relax_domain_level boot parameter It does not get processed because sched_domain_level_max is 0 at the time that setup_relax_domain_level() is run. Simply accept the value as it is, as we don't know the value of sched_domain_level_max until sched domain construction is completed. Fix sched_relax_domain_level in cpuset. The build_sched_domain() routine calls the set_domain_attribute() routine prior to setting the sd->level, however, the set_domain_attribute() routine relies on the sd->level to decide whether idle load balancing will be off/on. Signed-off-by: Dimitri Sivanich Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120605184436.GA15668@sgi.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2bdd17616437..d5594a4268d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6268,11 +6268,8 @@ int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -6698,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; @@ -6706,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, child->parent = sd; } sd->child = child; + set_domain_attribute(sd, attr); return sd; } From 302fa4b58ac754a6da13f4f5546f710fecc3b945 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:33 -0700 Subject: [PATCH 074/117] perf/x86: Allow multiple stacks Without this patch, applications with two different stack regions (eg: native stack vs JIT stack) get truncated callchains even when RBP chaining is present. GDB shows proper stack traces and the frame pointer chaining is intact. This patch disables the (fp < RSP) check, hoping that other checks in the code save the day for us. In our limited testing, this didn't seem to break anything. In the long term, we could potentially have userspace advise the kernel on the range of valid stack addresses, so we don't spend a lot of time unwinding from bogus addresses. Signed-off-by: Arun Sharma CC: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Stephane Eranian Cc: Namhyung Kim Cc: Tom Zanussi Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-2-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cb608383e4f6..e78bc256aea8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1781,9 +1781,6 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (bytes != sizeof(frame)) break; - if (fp < compat_ptr(regs->sp)) - break; - perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } @@ -1827,9 +1824,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (bytes != sizeof(frame)) break; - if ((unsigned long)fp < regs->sp) - break; - perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } From 0b0d9cf6ec7bab91977da2d71c09157f110f7c2e Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:34 -0700 Subject: [PATCH 075/117] perf: Limit callchains to 127 Stack depth of 255 seems excessive, given that copy_from_user_nmi() could be slow. Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-3-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1817d4015e5f..45db49f64bb4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -555,7 +555,7 @@ enum perf_event_type { PERF_RECORD_MAX, /* non-ABI */ }; -#define PERF_MAX_STACK_DEPTH 255 +#define PERF_MAX_STACK_DEPTH 127 enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, From bc6ca7b342d5ae15c3ba3081fd40271b8039fb25 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:35 -0700 Subject: [PATCH 076/117] perf/x86: Check if user fp is valid Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-4-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 12 ++++++------ arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 04cd6882308e..e1f3a17034fc 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -33,9 +33,8 @@ #define segment_eq(a, b) ((a).seg == (b).seg) #define user_addr_max() (current_thread_info()->addr_limit.seg) -#define __addr_ok(addr) \ - ((unsigned long __force)(addr) < \ - (current_thread_info()->addr_limit.seg)) +#define __addr_ok(addr) \ + ((unsigned long __force)(addr) < user_addr_max()) /* * Test whether a block of memory is a valid user space address. @@ -47,14 +46,14 @@ * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ -#define __range_not_ok(addr, size) \ +#define __range_not_ok(addr, size, limit) \ ({ \ unsigned long flag, roksum; \ __chk_user_ptr(addr); \ asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ : "=&r" (flag), "=r" (roksum) \ : "1" (addr), "g" ((long)(size)), \ - "rm" (current_thread_info()->addr_limit.seg)); \ + "rm" (limit)); \ flag; \ }) @@ -77,7 +76,8 @@ * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0)) +#define access_ok(type, addr, size) \ + (likely(__range_not_ok(addr, size, user_addr_max()) == 0)) /* * The exception table consists of pairs of addresses relative to the diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e78bc256aea8..c4706cf9c011 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1757,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + #ifdef CONFIG_COMPAT #include @@ -1781,6 +1787,9 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (bytes != sizeof(frame)) break; + if (!valid_user_frame(fp, sizeof(frame))) + break; + perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } @@ -1824,6 +1833,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (bytes != sizeof(frame)) break; + if (!valid_user_frame(fp, sizeof(frame))) + break; + perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } From db0dc75d6403b6663c0eab4c6ccb672eb9b2ed72 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:36 -0700 Subject: [PATCH 077/117] perf/x86: Check user address explicitly in copy_from_user_nmi() Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-5-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index f61ee67ec00f..677b1ed184c9 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -8,6 +8,7 @@ #include #include +#include /* * best effort, GUP based copy_from_user() that is NMI-safe @@ -21,6 +22,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) void *map; int ret; + if (__range_not_ok(from, n, TASK_SIZE) == 0) + return len; + do { ret = __get_user_pages_fast(addr, 1, 0, &page); if (!ret) From 10db9e009af5c3647b9ee742dcb14f6ff447a2a5 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 6 Jun 2012 11:21:44 -0400 Subject: [PATCH 078/117] tile: remove cpu_idle_on_new_stack This routine isn't used unless CONFIG_HOMECACHE is enabled, which isn't even available as a public configuration option yet. Since it no longer links correctly in 3.4, just remove it for now. Signed-off-by: Chris Metcalf --- arch/tile/include/asm/thread_info.h | 5 ----- arch/tile/kernel/entry.S | 14 -------------- 2 files changed, 19 deletions(-) diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 7e1fef36bde6..e9c670d7a7fe 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -91,11 +91,6 @@ extern void smp_nap(void); /* Enable interrupts racelessly and nap forever: helper for cpu_idle(). */ extern void _cpu_idle(void); -/* Switch boot idle thread to a freshly-allocated stack and free old stack. */ -extern void cpu_idle_on_new_stack(struct thread_info *old_ti, - unsigned long new_sp, - unsigned long new_ss10); - #else /* __ASSEMBLY__ */ /* diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S index 133c4b56a99e..c31637baff28 100644 --- a/arch/tile/kernel/entry.S +++ b/arch/tile/kernel/entry.S @@ -68,20 +68,6 @@ STD_ENTRY(KBacktraceIterator_init_current) jrp lr /* keep backtracer happy */ STD_ENDPROC(KBacktraceIterator_init_current) -/* - * Reset our stack to r1/r2 (sp and ksp0+cpu respectively), then - * free the old stack (passed in r0) and re-invoke cpu_idle(). - * We update sp and ksp0 simultaneously to avoid backtracer warnings. - */ -STD_ENTRY(cpu_idle_on_new_stack) - { - move sp, r1 - mtspr SPR_SYSTEM_SAVE_K_0, r2 - } - jal free_thread_info - j cpu_idle - STD_ENDPROC(cpu_idle_on_new_stack) - /* Loop forever on a nap during SMP boot. */ STD_ENTRY(smp_nap) nap From 2ded5c2484d7375860bdb5f3df1954a346a2da26 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 6 Jun 2012 11:23:19 -0400 Subject: [PATCH 079/117] tile: add #include to unbreak build after generic init_task conversion Some code was moved from init_task.c to setup.c but the appropriate header needed to be moved as well. Signed-off-by: Chris Metcalf --- arch/tile/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 6098ccc59be2..dd87f3420390 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include From edc4a67e15e34d2b3a2ab968625fd157751125d8 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 24 May 2012 16:08:09 +0300 Subject: [PATCH 080/117] mlx4_core: Fix setting VL_cap in mlx4_SET_PORT wrapper flow Commit 096335b3f983 ("mlx4_core: Allow dynamic MTU configuration for IB ports") modifies the port VL setting. This exposes a bug in mlx4_common_set_port(), where the VL cap value passed in (inside the command mailbox) is incorrectly zeroed-out: mlx4_SET_PORT modifies the VL_cap field (byte 3 of the mailbox). Since the SET_PORT command is paravirtualized on the master as well as on the slaves, mlx4_SET_PORT_wrapper() is invoked on the master. This calls mlx4_common_set_port() where mailbox byte 3 gets overwritten by code which should only set a single bit in that byte (for the reset qkey counter flag) -- but instead overwrites the entire byte. The result is that when running in SR-IOV mode, the VL_cap will be set to zero -- fix this. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx4/port.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 1fe2c7a8b40c..a8fb52992c64 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -697,10 +697,10 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, if (slave != dev->caps.function) memset(inbox->buf, 0, 256); if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { - *(u8 *) inbox->buf = !!reset_qkey_viols << 6; + *(u8 *) inbox->buf |= !!reset_qkey_viols << 6; ((__be32 *) inbox->buf)[2] = agg_cap_mask; } else { - ((u8 *) inbox->buf)[3] = !!reset_qkey_viols; + ((u8 *) inbox->buf)[3] |= !!reset_qkey_viols; ((__be32 *) inbox->buf)[1] = agg_cap_mask; } From fc2d004419abebcfd740f46c32dd8201ce1b33c9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 24 May 2012 16:08:08 +0300 Subject: [PATCH 081/117] IB/mlx4: Fix max_wqe capacity reported from query device 1. Limit the max number of WQEs per QP reported when querying the device, so that ib_create_qp() will not fail for a QP size that the device claimed to support due to additional headroom WQEs being allocated. 2. Limit qp resources accepted for ib_create_qp() to the limits reported in ib_query_device(). In kernel space, make sure that the limits returned to the caller following qp creation also lie within the reported device limits. For userspace, report as before, and do adjustment in libmlx4 (so as not to break ABI). Signed-off-by: Jack Morgenstein Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx4/mlx4_ib.h | 8 ++++++++ drivers/infiniband/hw/mlx4/qp.c | 21 +++++++++++++++------ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 8afea12c8d44..3530c41fcd1f 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -140,7 +140,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; - props->max_qp_wr = dev->dev->caps.max_wqes; + props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e62297cc77cc..ff36655d23d3 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -44,6 +44,14 @@ #include #include +enum { + MLX4_IB_SQ_MIN_WQE_SHIFT = 6, + MLX4_IB_MAX_HEADROOM = 2048 +}; + +#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) +#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) + struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index ceb33327091a..8d4ed24aef93 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -310,8 +310,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, int is_user, int has_rq, struct mlx4_ib_qp *qp) { /* Sanity check RQ size before proceeding */ - if (cap->max_recv_wr > dev->dev->caps.max_wqes || - cap->max_recv_sge > dev->dev->caps.max_rq_sg) + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) return -EINVAL; if (!has_rq) { @@ -329,8 +329,17 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); } - cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; - cap->max_recv_sge = qp->rq.max_gs; + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } return 0; } @@ -341,8 +350,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, int s; /* Sanity check SQ size before proceeding */ - if (cap->max_send_wr > dev->dev->caps.max_wqes || - cap->max_send_sge > dev->dev->caps.max_sq_sg || + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || cap->max_inline_data + send_wqe_overhead(type, qp->flags) + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) return -EINVAL; From 23e81d691a813839020f6e516b398d0f9369fe8b Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Wed, 6 Jun 2012 15:45:44 -0400 Subject: [PATCH 082/117] drm/i915: pch_irq_handler -> {ibx, cpt}_irq_handler Cougar/Panther Point redefine the bits in SDEIIR pretty completely. This function is just debugging, but if we're debugging we probably want to be told accurate things instead of lies. I'm told Lynx Point changes this yet more, but I have no idea how... Note from Eugeni's review: "For the record and for future enabling efforts, for LPT, bits 28-31 and 1-14 are gone since CPT/PPT (e.g., those must be zero). And there is the bit 15 as a new addition, but we are not using it yet and probably won't be using in foreseeable future." Signed-off-by: Adam Jackson Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=35103 Reviewed-by: Eugeni Dodonov Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_irq.c | 38 ++++++++++++++++++++++++++++++--- drivers/gpu/drm/i915/i915_reg.h | 35 +++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 1417660a93ec..b1fe0edda955 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -510,7 +510,7 @@ out: return ret; } -static void pch_irq_handler(struct drm_device *dev, u32 pch_iir) +static void ibx_irq_handler(struct drm_device *dev, u32 pch_iir) { drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; int pipe; @@ -550,6 +550,35 @@ static void pch_irq_handler(struct drm_device *dev, u32 pch_iir) DRM_DEBUG_DRIVER("PCH transcoder A underrun interrupt\n"); } +static void cpt_irq_handler(struct drm_device *dev, u32 pch_iir) +{ + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; + int pipe; + + if (pch_iir & SDE_AUDIO_POWER_MASK_CPT) + DRM_DEBUG_DRIVER("PCH audio power change on port %d\n", + (pch_iir & SDE_AUDIO_POWER_MASK_CPT) >> + SDE_AUDIO_POWER_SHIFT_CPT); + + if (pch_iir & SDE_AUX_MASK_CPT) + DRM_DEBUG_DRIVER("AUX channel interrupt\n"); + + if (pch_iir & SDE_GMBUS_CPT) + DRM_DEBUG_DRIVER("PCH GMBUS interrupt\n"); + + if (pch_iir & SDE_AUDIO_CP_REQ_CPT) + DRM_DEBUG_DRIVER("Audio CP request interrupt\n"); + + if (pch_iir & SDE_AUDIO_CP_CHG_CPT) + DRM_DEBUG_DRIVER("Audio CP change interrupt\n"); + + if (pch_iir & SDE_FDI_MASK_CPT) + for_each_pipe(pipe) + DRM_DEBUG_DRIVER(" pipe %c FDI IIR: 0x%08x\n", + pipe_name(pipe), + I915_READ(FDI_RX_IIR(pipe))); +} + static irqreturn_t ivybridge_irq_handler(DRM_IRQ_ARGS) { struct drm_device *dev = (struct drm_device *) arg; @@ -591,7 +620,7 @@ static irqreturn_t ivybridge_irq_handler(DRM_IRQ_ARGS) if (pch_iir & SDE_HOTPLUG_MASK_CPT) queue_work(dev_priv->wq, &dev_priv->hotplug_work); - pch_irq_handler(dev, pch_iir); + cpt_irq_handler(dev, pch_iir); /* clear PCH hotplug event before clear CPU irq */ I915_WRITE(SDEIIR, pch_iir); @@ -684,7 +713,10 @@ static irqreturn_t ironlake_irq_handler(DRM_IRQ_ARGS) if (de_iir & DE_PCH_EVENT) { if (pch_iir & hotplug_mask) queue_work(dev_priv->wq, &dev_priv->hotplug_work); - pch_irq_handler(dev, pch_iir); + if (HAS_PCH_CPT(dev)) + cpt_irq_handler(dev, pch_iir); + else + ibx_irq_handler(dev, pch_iir); } if (de_iir & DE_PCU_EVENT) { diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 76bc2756d7c4..48d5e8e051cf 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3321,7 +3321,7 @@ /* PCH */ -/* south display engine interrupt */ +/* south display engine interrupt: IBX */ #define SDE_AUDIO_POWER_D (1 << 27) #define SDE_AUDIO_POWER_C (1 << 26) #define SDE_AUDIO_POWER_B (1 << 25) @@ -3357,15 +3357,44 @@ #define SDE_TRANSA_CRC_ERR (1 << 1) #define SDE_TRANSA_FIFO_UNDER (1 << 0) #define SDE_TRANS_MASK (0x3f) -/* CPT */ -#define SDE_CRT_HOTPLUG_CPT (1 << 19) + +/* south display engine interrupt: CPT/PPT */ +#define SDE_AUDIO_POWER_D_CPT (1 << 31) +#define SDE_AUDIO_POWER_C_CPT (1 << 30) +#define SDE_AUDIO_POWER_B_CPT (1 << 29) +#define SDE_AUDIO_POWER_SHIFT_CPT 29 +#define SDE_AUDIO_POWER_MASK_CPT (7 << 29) +#define SDE_AUXD_CPT (1 << 27) +#define SDE_AUXC_CPT (1 << 26) +#define SDE_AUXB_CPT (1 << 25) +#define SDE_AUX_MASK_CPT (7 << 25) #define SDE_PORTD_HOTPLUG_CPT (1 << 23) #define SDE_PORTC_HOTPLUG_CPT (1 << 22) #define SDE_PORTB_HOTPLUG_CPT (1 << 21) +#define SDE_CRT_HOTPLUG_CPT (1 << 19) #define SDE_HOTPLUG_MASK_CPT (SDE_CRT_HOTPLUG_CPT | \ SDE_PORTD_HOTPLUG_CPT | \ SDE_PORTC_HOTPLUG_CPT | \ SDE_PORTB_HOTPLUG_CPT) +#define SDE_GMBUS_CPT (1 << 17) +#define SDE_AUDIO_CP_REQ_C_CPT (1 << 10) +#define SDE_AUDIO_CP_CHG_C_CPT (1 << 9) +#define SDE_FDI_RXC_CPT (1 << 8) +#define SDE_AUDIO_CP_REQ_B_CPT (1 << 6) +#define SDE_AUDIO_CP_CHG_B_CPT (1 << 5) +#define SDE_FDI_RXB_CPT (1 << 4) +#define SDE_AUDIO_CP_REQ_A_CPT (1 << 2) +#define SDE_AUDIO_CP_CHG_A_CPT (1 << 1) +#define SDE_FDI_RXA_CPT (1 << 0) +#define SDE_AUDIO_CP_REQ_CPT (SDE_AUDIO_CP_REQ_C_CPT | \ + SDE_AUDIO_CP_REQ_B_CPT | \ + SDE_AUDIO_CP_REQ_A_CPT) +#define SDE_AUDIO_CP_CHG_CPT (SDE_AUDIO_CP_CHG_C_CPT | \ + SDE_AUDIO_CP_CHG_B_CPT | \ + SDE_AUDIO_CP_CHG_A_CPT) +#define SDE_FDI_MASK_CPT (SDE_FDI_RXC_CPT | \ + SDE_FDI_RXB_CPT | \ + SDE_FDI_RXA_CPT) #define SDEISR 0xc4000 #define SDEIMR 0xc4004 From e9b4cf2094a65a05a831f070e46c554260632330 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 6 Jun 2012 15:22:41 +0300 Subject: [PATCH 083/117] UBI: fix debugfs-less systems support Commit "aa44d1d UBI: remove Kconfig debugging option" broke UBI and it refuses to initialize if debugfs (CONFIG_DEBUG_FS) is disabled. I incorrectly assumed that debugfs files creation function will return success if debugfs is disabled, but they actually return -ENODEV. This patch fixes the issue. Reported-by: Paul Parsons Signed-off-by: Artem Bityutskiy Tested-by: Paul Parsons --- drivers/mtd/ubi/debug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index 9f957c2d48e9..09d4f8d9d592 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -264,6 +264,9 @@ static struct dentry *dfs_rootdir; */ int ubi_debugfs_init(void) { + if (!IS_ENABLED(DEBUG_FS)) + return 0; + dfs_rootdir = debugfs_create_dir("ubi", NULL); if (IS_ERR_OR_NULL(dfs_rootdir)) { int err = dfs_rootdir ? -ENODEV : PTR_ERR(dfs_rootdir); @@ -281,7 +284,8 @@ int ubi_debugfs_init(void) */ void ubi_debugfs_exit(void) { - debugfs_remove(dfs_rootdir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove(dfs_rootdir); } /* Read an UBI debugfs file */ @@ -403,6 +407,9 @@ int ubi_debugfs_init_dev(struct ubi_device *ubi) struct dentry *dent; struct ubi_debug_info *d = ubi->dbg; + if (!IS_ENABLED(DEBUG_FS)) + return 0; + n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN + 1, UBI_DFS_DIR_NAME, ubi->ubi_num); if (n == UBI_DFS_DIR_LEN) { @@ -470,5 +477,6 @@ out: */ void ubi_debugfs_exit_dev(struct ubi_device *ubi) { - debugfs_remove_recursive(ubi->dbg->dfs_dir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove_recursive(ubi->dbg->dfs_dir); } From 818039c7d597db3b1d30964a8f9489ac42c0642d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 6 Jun 2012 16:03:10 +0300 Subject: [PATCH 084/117] UBIFS: fix debugfs-less systems support Commit "f70b7e5 UBIFS: remove Kconfig debugging option" broke UBIFS and it refuses to initialize if debugfs (CONFIG_DEBUG_FS) is disabled. I incorrectly assumed that debugfs files creation function will return success if debugfs is disabled, but they actually return -ENODEV. This patch fixes the issue. Reported-by: Paul Parsons Signed-off-by: Artem Bityutskiy Tested-by: Paul Parsons --- fs/ubifs/debug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 685a83756b2b..84a7e6f3c046 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2918,6 +2918,9 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) struct dentry *dent; struct ubifs_debug_info *d = c->dbg; + if (!IS_ENABLED(DEBUG_FS)) + return 0; + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); if (n == UBIFS_DFS_DIR_LEN) { @@ -3010,7 +3013,8 @@ out: */ void dbg_debugfs_exit_fs(struct ubifs_info *c) { - debugfs_remove_recursive(c->dbg->dfs_dir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove_recursive(c->dbg->dfs_dir); } struct ubifs_global_debug_info ubifs_dbg; @@ -3095,6 +3099,9 @@ int dbg_debugfs_init(void) const char *fname; struct dentry *dent; + if (!IS_ENABLED(DEBUG_FS)) + return 0; + fname = "ubifs"; dent = debugfs_create_dir(fname, NULL); if (IS_ERR_OR_NULL(dent)) @@ -3159,7 +3166,8 @@ out: */ void dbg_debugfs_exit(void) { - debugfs_remove_recursive(dfs_rootdir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove_recursive(dfs_rootdir); } /** From 12027f1b3fd69a4e9017e6b13c72547a99c6cf54 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 7 Jun 2012 15:15:30 +0300 Subject: [PATCH 085/117] UBI: correct ubi_wl_flush locking Commit "62f38455 UBI: modify ubi_wl_flush function to clear work queue for a lnum" takes the 'work_sem' semaphore in write mode for the entire loop, which is not very good because it will block other workers for potentially long time. We do not need to have it in write mode - read mode is enough, and we do not need to hole it over the entire loop. So this patch turns changes the locking: takes 'work_sem' in read mode and pushes it down to the loop. Signed-off-by: Artem Bityutskiy --- drivers/mtd/ubi/wl.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 9df100a4ec38..b6be644e7b85 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1262,11 +1262,11 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum) dbg_wl("flush pending work for LEB %d:%d (%d pending works)", vol_id, lnum, ubi->works_count); - down_write(&ubi->work_sem); while (found) { struct ubi_work *wrk; found = 0; + down_read(&ubi->work_sem); spin_lock(&ubi->wl_lock); list_for_each_entry(wrk, &ubi->works, list) { if ((vol_id == UBI_ALL || wrk->vol_id == vol_id) && @@ -1277,18 +1277,27 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum) spin_unlock(&ubi->wl_lock); err = wrk->func(ubi, wrk, 0); - if (err) - goto out; + if (err) { + up_read(&ubi->work_sem); + return err; + } + spin_lock(&ubi->wl_lock); found = 1; break; } } spin_unlock(&ubi->wl_lock); + up_read(&ubi->work_sem); } -out: + /* + * Make sure all the works which have been done in parallel are + * finished. + */ + down_write(&ubi->work_sem); up_write(&ubi->work_sem); + return err; } From 743628e868c5992354fc80b4d1e9a6143da1c0e6 Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Thu, 7 Jun 2012 09:05:21 -0700 Subject: [PATCH 086/117] x86, efi stub: Add .reloc section back into image Some UEFI firmware will not load a .efi with a .reloc section with a size of 0. Therefore, we create a .efi image with 4 main areas and 3 sections. 1. PE/COFF file header 2. .setup section (covers all setup code following the first sector) 3. .reloc section (contains 1 dummy reloc entry, created in build.c) 4. .text section (covers the remaining kernel image) To make room for the new .setup section data, the header bugger_off_msg had to be shortened. Reported-by: Henrik Rydberg Signed-off-by: Jordan Justen Link: http://lkml.kernel.org/r/1339085121-12760-1-git-send-email-jordan.l.justen@intel.com Tested-by: Lee G Rosenbaum Tested-by: Henrik Rydberg Cc: Matt Fleming Signed-off-by: H. Peter Anvin --- arch/x86/boot/header.S | 42 ++++++--- arch/x86/boot/tools/build.c | 172 +++++++++++++++++++++++------------- 2 files changed, 140 insertions(+), 74 deletions(-) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 8bbea6aa40d9..efe5acfc79c3 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -94,10 +94,10 @@ bs_die: .section ".bsdata", "a" bugger_off_msg: - .ascii "Direct booting from floppy is no longer supported.\r\n" - .ascii "Please use a boot loader program instead.\r\n" + .ascii "Direct floppy boot is not supported. " + .ascii "Use a boot loader program instead.\r\n" .ascii "\n" - .ascii "Remove disk and press any key to reboot . . .\r\n" + .ascii "Remove disk and press any key to reboot ...\r\n" .byte 0 #ifdef CONFIG_EFI_STUB @@ -111,7 +111,7 @@ coff_header: #else .word 0x8664 # x86-64 #endif - .word 2 # nr_sections + .word 3 # nr_sections .long 0 # TimeDateStamp .long 0 # PointerToSymbolTable .long 1 # NumberOfSymbols @@ -158,8 +158,8 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long 0x1000 # SectionAlignment - .long 0x200 # FileAlignment + .long 0x20 # SectionAlignment + .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion .word 0 # MajorImageVersion @@ -200,8 +200,10 @@ extra_header_fields: # Section table section_table: - .ascii ".text" - .byte 0 + # + # The offset & size fields are filled in by build.c. + # + .ascii ".setup" .byte 0 .byte 0 .long 0 @@ -217,9 +219,8 @@ section_table: # # The EFI application loader requires a relocation section - # because EFI applications must be relocatable. But since - # we don't need the loader to fixup any relocs for us, we - # just create an empty (zero-length) .reloc section header. + # because EFI applications must be relocatable. The .reloc + # offset & size fields are filled in by build.c. # .ascii ".reloc" .byte 0 @@ -233,6 +234,25 @@ section_table: .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers .long 0x42100040 # Characteristics (section flags) + + # + # The offset & size fields are filled in by build.c. + # + .ascii ".text" + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 # startup_{32,64} + .long 0 # Size of initialized data + # on disk + .long 0x0 # startup_{32,64} + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x60500020 # Characteristics (section flags) + #endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. This is part 1 of the diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 3f61f6e2b46f..4b8e165ee572 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -50,6 +50,8 @@ typedef unsigned int u32; u8 buf[SETUP_SECT_MAX*512]; int is_big_kernel; +#define PECOFF_RELOC_RESERVE 0x20 + /*----------------------------------------------------------------------*/ static const u32 crctab32[] = { @@ -133,11 +135,103 @@ static void usage(void) die("Usage: build setup system [> image]"); } +#ifdef CONFIG_EFI_STUB + +static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +{ + unsigned int pe_header; + unsigned short num_sections; + u8 *section; + + pe_header = get_unaligned_le32(&buf[0x3c]); + num_sections = get_unaligned_le16(&buf[pe_header + 6]); + +#ifdef CONFIG_X86_32 + section = &buf[pe_header + 0xa8]; +#else + section = &buf[pe_header + 0xb8]; +#endif + + while (num_sections > 0) { + if (strncmp((char*)section, section_name, 8) == 0) { + /* section header size field */ + put_unaligned_le32(size, section + 0x8); + + /* section header vma field */ + put_unaligned_le32(offset, section + 0xc); + + /* section header 'size of initialised data' field */ + put_unaligned_le32(size, section + 0x10); + + /* section header 'file offset' field */ + put_unaligned_le32(offset, section + 0x14); + + break; + } + section += 0x28; + num_sections--; + } +} + +static void update_pecoff_setup_and_reloc(unsigned int size) +{ + u32 setup_offset = 0x200; + u32 reloc_offset = size - PECOFF_RELOC_RESERVE; + u32 setup_size = reloc_offset - setup_offset; + + update_pecoff_section_header(".setup", setup_offset, setup_size); + update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE); + + /* + * Modify .reloc section contents with a single entry. The + * relocation is applied to offset 10 of the relocation section. + */ + put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]); + put_unaligned_le32(10, &buf[reloc_offset + 4]); +} + +static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) +{ + unsigned int pe_header; + unsigned int text_sz = file_sz - text_start; + + pe_header = get_unaligned_le32(&buf[0x3c]); + + /* Size of image */ + put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); + + /* + * Size of code: Subtract the size of the first sector (512 bytes) + * which includes the header. + */ + put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]); + +#ifdef CONFIG_X86_32 + /* + * Address of entry point. + * + * The EFI stub entry point is +16 bytes from the start of + * the .text section. + */ + put_unaligned_le32(text_start + 16, &buf[pe_header + 0x28]); +#else + /* + * Address of entry point. startup_32 is at the beginning and + * the 64-bit entry point (startup_64) is always 512 bytes + * after. The EFI stub entry point is 16 bytes after that, as + * the first instruction allows legacy loaders to jump over + * the EFI stub initialisation + */ + put_unaligned_le32(text_start + 528, &buf[pe_header + 0x28]); +#endif /* CONFIG_X86_32 */ + + update_pecoff_section_header(".text", text_start, text_sz); +} + +#endif /* CONFIG_EFI_STUB */ + int main(int argc, char ** argv) { -#ifdef CONFIG_EFI_STUB - unsigned int file_sz, pe_header; -#endif unsigned int i, sz, setup_sectors; int c; u32 sys_size; @@ -163,6 +257,12 @@ int main(int argc, char ** argv) die("Boot block hasn't got boot flag (0xAA55)"); fclose(file); +#ifdef CONFIG_EFI_STUB + /* Reserve 0x20 bytes for .reloc section */ + memset(buf+c, 0, PECOFF_RELOC_RESERVE); + c += PECOFF_RELOC_RESERVE; +#endif + /* Pad unused space with zeros */ setup_sectors = (c + 511) / 512; if (setup_sectors < SETUP_SECT_MIN) @@ -170,6 +270,10 @@ int main(int argc, char ** argv) i = setup_sectors*512; memset(buf+c, 0, i-c); +#ifdef CONFIG_EFI_STUB + update_pecoff_setup_and_reloc(i); +#endif + /* Set the default root device */ put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); @@ -194,66 +298,8 @@ int main(int argc, char ** argv) put_unaligned_le32(sys_size, &buf[0x1f4]); #ifdef CONFIG_EFI_STUB - file_sz = sz + i + ((sys_size * 16) - sz); - - pe_header = get_unaligned_le32(&buf[0x3c]); - - /* Size of image */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); - - /* - * Subtract the size of the first section (512 bytes) which - * includes the header and .reloc section. The remaining size - * is that of the .text section. - */ - file_sz -= 512; - - /* Size of code */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x1c]); - -#ifdef CONFIG_X86_32 - /* - * Address of entry point. - * - * The EFI stub entry point is +16 bytes from the start of - * the .text section. - */ - put_unaligned_le32(i + 16, &buf[pe_header + 0x28]); - - /* .text size */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xb0]); - - /* .text vma */ - put_unaligned_le32(0x200, &buf[pe_header + 0xb4]); - - /* .text size of initialised data */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xb8]); - - /* .text file offset */ - put_unaligned_le32(0x200, &buf[pe_header + 0xbc]); -#else - /* - * Address of entry point. startup_32 is at the beginning and - * the 64-bit entry point (startup_64) is always 512 bytes - * after. The EFI stub entry point is 16 bytes after that, as - * the first instruction allows legacy loaders to jump over - * the EFI stub initialisation - */ - put_unaligned_le32(i + 528, &buf[pe_header + 0x28]); - - /* .text size */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xc0]); - - /* .text vma */ - put_unaligned_le32(0x200, &buf[pe_header + 0xc4]); - - /* .text size of initialised data */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xc8]); - - /* .text file offset */ - put_unaligned_le32(0x200, &buf[pe_header + 0xcc]); -#endif /* CONFIG_X86_32 */ -#endif /* CONFIG_EFI_STUB */ + update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); +#endif crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, stdout) != i) From 0142ef6cdca5f9784eb0762ac50fe378d98d71d4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 7 Jun 2012 14:21:09 -0700 Subject: [PATCH 087/117] shmem: replace_page must flush_dcache and others Commit bde05d1ccd51 ("shmem: replace page if mapping excludes its zone") is not at all likely to break for anyone, but it was an earlier version from before review feedback was incorporated. Fix that up now. * shmem_replace_page must flush_dcache_page after copy_highpage [akpm] * Expand comment on why shmem_unuse_inode needs page_swapcount [akpm] * Remove excess of VM_BUG_ONs from shmem_replace_page [wangcong] * Check page_private matches swap before calling shmem_replace_page [hughd] * shmem_replace_page allow for unexpected race in radix_tree lookup [hughd] Signed-off-by: Hugh Dickins Cc: Cong Wang Cc: Christoph Hellwig Cc: KAMEZAWA Hiroyuki Cc: Alan Cox Cc: Stephane Marchesin Cc: Andi Kleen Cc: Dave Airlie Cc: Daniel Vetter Cc: Rob Clark Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 57 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 585bd220a21e..a15a466d0d1d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -683,10 +683,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, mutex_lock(&shmem_swaplist_mutex); /* * We needed to drop mutex to make that restrictive page - * allocation; but the inode might already be freed by now, - * and we cannot refer to inode or mapping or info to check. - * However, we do hold page lock on the PageSwapCache page, - * so can check if that still has our reference remaining. + * allocation, but the inode might have been freed while we + * dropped it: although a racing shmem_evict_inode() cannot + * complete without emptying the radix_tree, our page lock + * on this swapcache page is not enough to prevent that - + * free_swap_and_cache() of our swap entry will only + * trylock_page(), removing swap from radix_tree whatever. + * + * We must not proceed to shmem_add_to_page_cache() if the + * inode has been freed, but of course we cannot rely on + * inode or mapping or info to check that. However, we can + * safely check if our swap entry is still in use (and here + * it can't have got reused for another page): if it's still + * in use, then the inode cannot have been freed yet, and we + * can safely proceed (if it's no longer in use, that tells + * nothing about the inode, but we don't need to unuse swap). */ if (!page_swapcount(*pagep)) error = -ENOENT; @@ -730,9 +741,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) /* * There's a faint possibility that swap page was replaced before - * caller locked it: it will come back later with the right page. + * caller locked it: caller will come back later with the right page. */ - if (unlikely(!PageSwapCache(page))) + if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) goto out; /* @@ -995,21 +1006,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, newpage = shmem_alloc_page(gfp, info, index); if (!newpage) return -ENOMEM; - VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); - *pagep = newpage; page_cache_get(newpage); copy_highpage(newpage, oldpage); + flush_dcache_page(newpage); - VM_BUG_ON(!PageLocked(oldpage)); __set_page_locked(newpage); - VM_BUG_ON(!PageUptodate(oldpage)); SetPageUptodate(newpage); - VM_BUG_ON(!PageSwapBacked(oldpage)); SetPageSwapBacked(newpage); - VM_BUG_ON(!swap_index); set_page_private(newpage, swap_index); - VM_BUG_ON(!PageSwapCache(oldpage)); SetPageSwapCache(newpage); /* @@ -1019,13 +1024,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, spin_lock_irq(&swap_mapping->tree_lock); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); - __inc_zone_page_state(newpage, NR_FILE_PAGES); - __dec_zone_page_state(oldpage, NR_FILE_PAGES); + if (!error) { + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + } spin_unlock_irq(&swap_mapping->tree_lock); - BUG_ON(error); - mem_cgroup_replace_page_cache(oldpage, newpage); - lru_cache_add_anon(newpage); + if (unlikely(error)) { + /* + * Is this possible? I think not, now that our callers check + * both PageSwapCache and page_private after getting page lock; + * but be defensive. Reverse old to newpage for clear and free. + */ + oldpage = newpage; + } else { + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + *pagep = newpage; + } ClearPageSwapCache(oldpage); set_page_private(oldpage, 0); @@ -1033,7 +1049,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, unlock_page(oldpage); page_cache_release(oldpage); page_cache_release(oldpage); - return 0; + return error; } /* @@ -1107,7 +1123,8 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); - if (!PageSwapCache(page) || page->mapping) { + if (!PageSwapCache(page) || page_private(page) != swap.val || + page->mapping) { error = -EEXIST; /* try again */ goto failed; } From 6305902c2f871fd6db60af367bd7120fa977fa74 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 7 Jun 2012 14:21:10 -0700 Subject: [PATCH 088/117] MAINTAINERS: whitespace fixes Remove trailing spaces at EOL. Always use a tab after the type : Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index dafcba7e2312..14bc7071f9df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1077,7 +1077,7 @@ F: drivers/media/video/s5p-fimc/ ARM/SAMSUNG S5P SERIES Multi Format Codec (MFC) SUPPORT M: Kyungmin Park M: Kamil Debski -M: Jeongtae Park +M: Jeongtae Park L: linux-arm-kernel@lists.infradead.org L: linux-media@vger.kernel.org S: Maintained @@ -1743,10 +1743,10 @@ F: include/linux/can/platform/ CAPABILITIES M: Serge Hallyn L: linux-security-module@vger.kernel.org -S: Supported +S: Supported F: include/linux/capability.h F: security/capability.c -F: security/commoncap.c +F: security/commoncap.c F: kernel/capability.c CELL BROADBAND ENGINE ARCHITECTURE @@ -2146,11 +2146,11 @@ S: Orphan F: drivers/net/wan/pc300* CYTTSP TOUCHSCREEN DRIVER -M: Javier Martinez Canillas -L: linux-input@vger.kernel.org -S: Maintained -F: drivers/input/touchscreen/cyttsp* -F: include/linux/input/cyttsp.h +M: Javier Martinez Canillas +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/input/touchscreen/cyttsp* +F: include/linux/input/cyttsp.h DAMA SLAVE for AX.25 M: Joerg Reuter @@ -5185,7 +5185,7 @@ S: Maintained F: drivers/firmware/pcdp.* PCI ERROR RECOVERY -M: Linas Vepstas +M: Linas Vepstas L: linux-pci@vger.kernel.org S: Supported F: Documentation/PCI/pci-error-recovery.txt From bafb282df29c1524b1617019adebd6d0c3eb7a47 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: [PATCH 089/117] c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal A fix for commit b32dfe377102 ("c/r: prctl: add ability to set new mm_struct::exe_file"). After removing mm->num_exe_file_vmas kernel keeps mm->exe_file until final mmput(), it never becomes NULL while task is alive. We can check for other mapped files in mm instead of checking mm->num_exe_file_vmas, and mark mm with flag MMF_EXE_FILE_CHANGED in order to forbid second changing of mm->exe_file. Signed-off-by: Konstantin Khlebnikov Reviewed-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/sys.c | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..c688d4cc2e40 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -439,6 +439,7 @@ extern int get_dumpable(struct mm_struct *mm); /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ +#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/sys.c b/kernel/sys.c index 9ff89cb9657a..54f20fdee93c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1796,17 +1796,11 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma, static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { + struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; - /* - * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's - * remain. So perform a quick test first. - */ - if (mm->num_exe_file_vmas) - return -EBUSY; - exe_file = fget(fd); if (!exe_file) return -EBADF; @@ -1827,17 +1821,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; + down_write(&mm->mmap_sem); + + /* + * Forbid mm->exe_file change if there are mapped other files. + */ + err = -EBUSY; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file && !path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_unlock; + } + /* * The symlink can be changed only once, just to disallow arbitrary * transitions malicious software might bring in. This means one * could make a snapshot over all processes running and monitor * /proc/pid/exe changes to notice unusual activity if needed. */ - down_write(&mm->mmap_sem); - if (likely(!mm->exe_file)) - set_mm_exe_file(mm, exe_file); - else - err = -EBUSY; + err = -EPERM; + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) + goto exit_unlock; + + set_mm_exe_file(mm, exe_file); +exit_unlock: up_write(&mm->mmap_sem); exit: From 1ad75b9e16280ca4e2501a629a225319cf2eef2e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: [PATCH 090/117] c/r: prctl: add minimal address test to PR_SET_MM Make sure the address being set is greater than mmap_min_addr (as suggested by Kees Cook). Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Serge Hallyn Cc: Tejun Heo Cc: Pavel Emelyanov Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sys.c b/kernel/sys.c index 54f20fdee93c..19a2c7139960 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1869,7 +1869,7 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; From 300f786b2683f8bb1ec0afb6e1851183a479c86d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: [PATCH 091/117] c/r: prctl: add ability to get clear_tid_address Zero is written at clear_tid_address when the process exits. This functionality is used by pthread_join(). We already have sys_set_tid_address() to change this address for the current task but there is no way to obtain it from user space. Without the ability to find this address and dump it we can't restore pthread'ed apps which call pthread_join() once they have been restored. This patch introduces the PR_GET_TID_ADDRESS prctl option which allows the current process to obtain own clear_tid_address. This feature is available iif CONFIG_CHECKPOINT_RESTORE is set. [akpm@linux-foundation.org: fix prctl numbering] Signed-off-by: Andrew Vagin Signed-off-by: Cyrill Gorcunov Cc: Pedro Alves Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Tejun Heo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 10 ++++++---- kernel/sys.c | 13 +++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 711e0a30aacc..3988012255dc 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -127,8 +127,8 @@ #define PR_SET_PTRACER 0x59616d61 # define PR_SET_PTRACER_ANY ((unsigned long)-1) -#define PR_SET_CHILD_SUBREAPER 36 -#define PR_GET_CHILD_SUBREAPER 37 +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 /* * If no_new_privs is set, then operations that grant new privileges (i.e. @@ -142,7 +142,9 @@ * asking selinux for a specific new context (e.g. with runcon) will result * in execve returning -EPERM. */ -#define PR_SET_NO_NEW_PRIVS 38 -#define PR_GET_NO_NEW_PRIVS 39 +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 + +#define PR_GET_TID_ADDRESS 40 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 19a2c7139960..0ec1942ba7ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1988,12 +1988,22 @@ out: up_read(&mm->mmap_sem); return error; } + +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return put_user(me->clear_child_tid, tid_addr); +} + #else /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { return -EINVAL; } +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return -EINVAL; +} #endif SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -2131,6 +2141,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; default: return -EINVAL; } From 736f24d5e59d699c6e300c5da7e3bb882eddda67 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: [PATCH 092/117] c/r: prctl: drop VMA flags test on PR_SET_MM_ stack data assignment In commit b76437579d13 ("procfs: mark thread stack correctly in proc//maps") the stack allocated via clone() is marked in /proc//maps as [stack:%d] thus it might be out of the former mm->start_stack/end_stack values (and even has some custom VMA flags set). So to be able to restore mm->start_stack/end_stack drop vma flags test, but still require the underlying VMA to exist. As always note this feature is under CONFIG_CHECKPOINT_RESTORE and requires CAP_SYS_RESOURCE to be granted. Signed-off-by: Cyrill Gorcunov Cc: Oleg Nesterov Acked-by: Kees Cook Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 0ec1942ba7ea..f0ec44dcd415 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1786,14 +1786,6 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE -static bool vma_flags_mismatch(struct vm_area_struct *vma, - unsigned long required, - unsigned long banned) -{ - return (vma->vm_flags & required) != required || - (vma->vm_flags & banned); -} - static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct vm_area_struct *vma; @@ -1931,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EFAULT; goto out; } -#ifdef CONFIG_STACK_GROWSUP - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) -#else - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) -#endif - goto out; if (opt == PR_SET_MM_START_STACK) mm->start_stack = addr; else if (opt == PR_SET_MM_ARG_START) From 4e791c98ae7ff889121ca93b7bd97206e4a8d793 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: [PATCH 093/117] drivers/platform/x86/acerhdf.c: correct Boris' mail address Correct mail address reference to a mail account which I actually read. Signed-off-by: Borislav Petkov Cc: Peter Feuerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/platform/x86/acerhdf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c index 639db4d0aa76..2fd9d36acd15 100644 --- a/drivers/platform/x86/acerhdf.c +++ b/drivers/platform/x86/acerhdf.c @@ -5,7 +5,7 @@ * * (C) 2009 - Peter Feuerer peter (a) piie.net * http://piie.net - * 2009 Borislav Petkov + * 2009 Borislav Petkov bp (a) alien8.de * * Inspired by and many thanks to: * o acerfand - Rachel Greenham From 7d8a45695cc8f9fcdf4121fcbd897ecb63f758e4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Jun 2012 14:21:13 -0700 Subject: [PATCH 094/117] ipc: shm: restore MADV_REMOVE functionality on shared memory segments Commit 17cf28afea2a ("mm/fs: remove truncate_range") removed the truncate_range inode operation in favour of the fallocate file operation. When using SYSV IPC shared memory segments, calling madvise with the MADV_REMOVE advice on an area of shared memory will attempt to invoke the .fallocate function for the shm_file_operations, which is NULL and therefore returns -EOPNOTSUPP to userspace. The previous behaviour would inherit the inode_operations from the underlying tmpfs file and invoke truncate_range there. This patch restores the previous behaviour by wrapping the underlying fallocate function in shm_fallocate, as we do for fsync. [hughd@google.com: use -ENOTSUPP in shm_fallocate()] Signed-off-by: Will Deacon Acked-by: Hugh Dickins Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ipc/shm.c b/ipc/shm.c index 5e2cbfdab6fc..41c1285d697a 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -393,6 +393,16 @@ static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) return sfd->file->f_op->fsync(sfd->file, start, end, datasync); } +static long shm_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct shm_file_data *sfd = shm_file_data(file); + + if (!sfd->file->f_op->fallocate) + return -EOPNOTSUPP; + return sfd->file->f_op->fallocate(file, mode, offset, len); +} + static unsigned long shm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -410,6 +420,7 @@ static const struct file_operations shm_file_operations = { .get_unmapped_area = shm_get_unmapped_area, #endif .llseek = noop_llseek, + .fallocate = shm_fallocate, }; static const struct file_operations shm_file_operations_huge = { @@ -418,6 +429,7 @@ static const struct file_operations shm_file_operations_huge = { .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, + .fallocate = shm_fallocate, }; int is_file_shm_hugepages(struct file *file) From cbf8ae32f66a9ceb8907ad9e16663c2a29e48990 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 7 Jun 2012 14:21:13 -0700 Subject: [PATCH 095/117] btree: fix tree corruption in btree_get_prev() The memory the parameter __key points to is used as an iterator in btree_get_prev(), so if we save off a bkey() pointer in retry_key and then assign that to __key, we'll end up corrupting the btree internals when we do eg longcpy(__key, bkey(geo, node, i), geo->keylen); to return the key value. What we should do instead is use longcpy() to copy the key value that retry_key points to __key. This can cause a btree to get corrupted by seemingly read-only operations such as btree_for_each_safe. [akpm@linux-foundation.org: avoid the double longcpy()] Signed-off-by: Roland Dreier Acked-by: Joern Engel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/btree.c b/lib/btree.c index e5ec1e9c1aa5..5cf9e74ec3f3 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -319,8 +319,8 @@ void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, if (head->height == 0) return NULL; -retry: longcpy(key, __key, geo->keylen); +retry: dec_key(geo, key); node = head->node; @@ -351,7 +351,7 @@ retry: } miss: if (retry_key) { - __key = retry_key; + longcpy(key, retry_key, geo->keylen); retry_key = NULL; goto retry; } From 39caa0916ef27cf1da5026eb708a2b8413156f75 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Thu, 7 Jun 2012 14:21:14 -0700 Subject: [PATCH 096/117] btree: catch NULL value before it does harm Storing NULL values in the btree is illegal and can lead to memory corruption and possible other fun as well. Catch it on insert, instead of waiting for the inevitable. Signed-off-by: Joern Engel Signed-off-by: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/btree.c b/lib/btree.c index 5cf9e74ec3f3..f9a484676cb6 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -509,6 +509,7 @@ retry: int btree_insert(struct btree_head *head, struct btree_geo *geo, unsigned long *key, void *val, gfp_t gfp) { + BUG_ON(!val); return btree_insert_level(head, geo, key, val, 1, gfp); } EXPORT_SYMBOL_GPL(btree_insert); From 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:14 -0700 Subject: [PATCH 097/117] mm: correctly synchronize rss-counters at exit/exec mm->rss_stat counters have per-task delta: task->rss_stat. Before changing task->mm pointer the kernel must flush this delta with sync_mm_rss(). do_exit() already calls sync_mm_rss() to flush the rss-counters before committing the rss statistics into task->signal->maxrss, taskstats, audit and other stuff. Unfortunately the kernel does this before calling mm_release(), which can call put_user() for processing task->clear_child_tid. So at this point we can trigger page-faults and task->rss_stat becomes non-zero again. As a result mm->rss_stat becomes inconsistent and check_mm() will print something like this: | BUG: Bad rss-counter state mm:ffff88020813c380 idx:1 val:-1 | BUG: Bad rss-counter state mm:ffff88020813c380 idx:2 val:1 This patch moves sync_mm_rss() into mm_release(), and moves mm_release() out of do_exit() and calls it earlier. After mm_release() there should be no pagefaults. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Konstantin Khlebnikov Reported-by: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: [3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - kernel/exit.c | 13 ++++++++----- kernel/fork.c | 8 ++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index a79786a8d2c8..b926ed19301e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,7 +819,6 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..804fb6bb8161 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,6 +423,7 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ + mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -640,7 +641,6 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; - mm_release(tsk, mm); if (!mm) return; /* @@ -960,9 +960,13 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); + + /* Set exit_code before complete_vfork_done() in mm_release() */ + tsk->exit_code = code; + + /* Release mm and sync mm's RSS info before statistics gathering */ + mm_release(tsk, tsk->mm); + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -975,7 +979,6 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); - tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..0560781c6904 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,6 +619,14 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } + + /* + * Final rss-counter synchronization. After this point there must be + * no pagefaults into this mm from the current context. Otherwise + * mm->rss_stat will be inconsistent. + */ + if (mm) + sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); From b0dd6b70f0fda17ae9762fbb72d98e40a4f66556 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 7 Jun 2012 18:56:06 -0400 Subject: [PATCH 098/117] ext4: fix the free blocks calculation for ext3 file systems w/ uninit_bg Ext3 filesystems that are converted to use as many ext4 file system features as possible will enable uninit_bg to speed up e2fsck times. These file systems will have a native ext3 layout of inode tables and block allocation bitmaps (as opposed to ext4's flex_bg layout). Unfortunately, in these cases, when first allocating a block in an uninitialized block group, ext4 would incorrectly calculate the number of free blocks in that block group, and then errorneously report that the file system was corrupt: EXT4-fs error (device vdd): ext4_mb_generate_buddy:741: group 30, 32254 clusters in bitmap, 32258 in gd This problem can be reproduced via: mke2fs -q -t ext4 -O ^flex_bg /dev/vdd 5g mount -t ext4 /dev/vdd /mnt fallocate -l 4600m /mnt/test The problem was caused by a bone headed mistake in the check to see if a particular metadata block was part of the block group. Many thanks to Kees Cook for finding and bisecting the buggy commit which introduced this bug (commit fd034a84e1, present since v3.2). Reported-by: Sander Eikelenboom Reported-by: Kees Cook Signed-off-by: "Theodore Ts'o" Tested-by: Kees Cook Cc: stable@kernel.org --- fs/ext4/balloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 99b6324290db..cee7812cc3cf 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -90,8 +90,8 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { - block_cluster = EXT4_B2C(sbi, (start - - ext4_block_bitmap(sb, gdp))); + block_cluster = EXT4_B2C(sbi, + ext4_block_bitmap(sb, gdp) - start); if (block_cluster < num_clusters) block_cluster = -1; else if (block_cluster == num_clusters) { @@ -102,7 +102,7 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, - start - ext4_inode_bitmap(sb, gdp)); + ext4_inode_bitmap(sb, gdp) - start); if (inode_cluster < num_clusters) inode_cluster = -1; else if (inode_cluster == num_clusters) { @@ -114,7 +114,7 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, itbl_blk = ext4_inode_table(sb, gdp); for (i = 0; i < sbi->s_itb_per_group; i++) { if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { - c = EXT4_B2C(sbi, start - itbl_blk + i); + c = EXT4_B2C(sbi, itbl_blk + i - start); if ((c < num_clusters) || (c == inode_cluster) || (c == block_cluster) || (c == itbl_cluster)) continue; From b22b1f178f6799278d3178d894f37facb2085765 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 7 Jun 2012 19:04:19 -0400 Subject: [PATCH 099/117] ext4: don't set i_flags in EXT4_IOC_SETFLAGS Commit 7990696 uses the ext4_{set,clear}_inode_flags() functions to change the i_flags automatically but fails to remove the error setting of i_flags. So we still have the problem of trashing state flags. Fix this by removing the assignment. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ioctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8ad112ae0ade..e34deac3f366 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -123,7 +123,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) else ext4_clear_inode_flag(inode, i); } - ei->i_flags = flags; ext4_set_inode_flags(inode); inode->i_ctime = ext4_current_time(inode); From 48d212a2eecaca2e1875925837ad27b2f43f48a3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Jun 2012 17:54:07 -0700 Subject: [PATCH 100/117] Revert "mm: correctly synchronize rss-counters at exit/exec" This reverts commit 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94. It's horribly and utterly broken for at least the following reasons: - calling sync_mm_rss() from mmput() is fundamentally wrong, because there's absolutely no reason to believe that the task that does the mmput() always does it on its own VM. Example: fork, ptrace, /proc - you name it. - calling it *after* having done mmdrop() on it is doubly insane, since the mm struct may well be gone now. - testing mm against NULL before you call it is insane too, since a NULL mm there would have caused oopses long before. .. and those are just the three bugs I found before I decided to give up looking for me and revert it asap. I should have caught it before I even took it, but I trusted Andrew too much. Cc: Konstantin Khlebnikov Cc: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + kernel/exit.c | 13 +++++-------- kernel/fork.c | 8 -------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b926ed19301e..a79786a8d2c8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,6 +819,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/kernel/exit.c b/kernel/exit.c index 804fb6bb8161..34867cc5b42a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,7 +423,6 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ - mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -641,6 +640,7 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; + mm_release(tsk, mm); if (!mm) return; /* @@ -960,13 +960,9 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - - /* Set exit_code before complete_vfork_done() in mm_release() */ - tsk->exit_code = code; - - /* Release mm and sync mm's RSS info before statistics gathering */ - mm_release(tsk, tsk->mm); - + /* sync mm's RSS info before statistics gathering */ + if (tsk->mm) + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -979,6 +975,7 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 0560781c6904..ab5211b9e622 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,14 +619,6 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } - - /* - * Final rss-counter synchronization. After this point there must be - * no pagefaults into this mm from the current context. Otherwise - * mm->rss_stat will be inconsistent. - */ - if (mm) - sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); From 860aed25a1f0936d4852ab936252b47cd1e630f1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 1 Jun 2012 18:13:43 +1000 Subject: [PATCH 101/117] powerpc/time: Sanity check of decrementer expiration is necessary This reverts 68568add2c ("powerpc/time: Remove unnecessary sanity check of decrementer expiration"). We do need to check whether we have reached the expiration time of the next event, because we sometimes get an early decrementer interrupt, most notably when we set the decrementer to 1 in arch_irq_work_raise(). The effect of not having the sanity check is that if timer_interrupt() gets called early, we leave the decrementer set to its maximum value, which means we then don't get any more decrementer interrupts for about 4 seconds (or longer, depending on timebase frequency). I saw these pauses as a consequence of getting a stray hypervisor decrementer interrupt left over from exiting a KVM guest. This isn't quite a straight revert because of changes to the surrounding code, but it restores the same algorithm as was previously used. Cc: stable@vger.kernel.org Acked-by: Anton Blanchard Acked-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/time.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 99a995c2a3f2..be171ee73bf8 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -475,6 +475,7 @@ void timer_interrupt(struct pt_regs * regs) struct pt_regs *old_regs; u64 *next_tb = &__get_cpu_var(decrementers_next_tb); struct clock_event_device *evt = &__get_cpu_var(decrementers); + u64 now; /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. @@ -509,9 +510,16 @@ void timer_interrupt(struct pt_regs * regs) irq_work_run(); } - *next_tb = ~(u64)0; - if (evt->event_handler) - evt->event_handler(evt); + now = get_tb_or_rtc(); + if (now >= *next_tb) { + *next_tb = ~(u64)0; + if (evt->event_handler) + evt->event_handler(evt); + } else { + now = *next_tb - now; + if (now <= DECREMENTER_MAX) + set_dec((int)now); + } #ifdef CONFIG_PPC64 /* collect purr register values often, for accurate calculations */ From ae82fdb1406ad41d68f07027fe31f2d35ba22a90 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 8 Jun 2012 14:58:13 +0930 Subject: [PATCH 102/117] module_param: stop double-calling parameters. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 026cee0086fe1df4cf74691cf273062cc769617d "params: _initcall-like kernel parameters" set old-style module parameters to level 0. And we call those level 0 calls where we used to, early in start_kernel(). We also loop through the initcall levels and call the levelled module_params before the corresponding initcall. Unfortunately level 0 is early_init(), so we call the standard module_param calls twice. (Turns out most things don't care, but at least ubi.mtd does). Change the level to -1 for standard module_param calls. Reported-by: Benoît Thébaudeau Signed-off-by: Rusty Russell Cc: stable@kernel.org --- include/linux/moduleparam.h | 10 +++++----- init/main.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 1b14d25162cb..d6a58065c09c 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -128,7 +128,7 @@ struct kparam_array * The ops can have NULL set or get functions. */ #define module_param_cb(name, ops, arg, perm) \ - __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, 0) + __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, -1) /** * _param_cb - general callback for a module/cmdline parameter @@ -192,7 +192,7 @@ struct kparam_array { (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ name, &__param_ops_##name, arg, \ - (perm) + sizeof(__check_old_set_param(set))*0, 0) + (perm) + sizeof(__check_old_set_param(set))*0, -1) /* We don't get oldget: it's often a new-style param_get_uint, etc. */ static inline int @@ -272,7 +272,7 @@ static inline void __kernel_param_unlock(void) */ #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ - __module_param_call("", name, ¶m_ops_##type, &var, perm, 0) + __module_param_call("", name, ¶m_ops_##type, &var, perm, -1) #endif /* !MODULE */ /** @@ -290,7 +290,7 @@ static inline void __kernel_param_unlock(void) = { len, string }; \ __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_ops_string, \ - .str = &__param_string_##name, perm, 0); \ + .str = &__param_string_##name, perm, -1); \ __MODULE_PARM_TYPE(name, "string") /** @@ -432,7 +432,7 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_array_ops, \ .arr = &__param_arr_##name, \ - perm, 0); \ + perm, -1); \ __MODULE_PARM_TYPE(name, "array of " #type) extern struct kernel_param_ops param_array_ops; diff --git a/init/main.c b/init/main.c index 1ca6b32c4828..37e12098eac1 100644 --- a/init/main.c +++ b/init/main.c @@ -508,7 +508,7 @@ asmlinkage void __init start_kernel(void) parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, - 0, 0, &unknown_bootoption); + -1, -1, &unknown_bootoption); jump_label_init(); From 19efb72fdc3c3bbfb929d91e34312b0494f14409 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 1 Jun 2012 18:56:00 +0200 Subject: [PATCH 103/117] init: Drop initcall level output 9fb48c744ba6a ("params: add 3rd arg to option handler callback signature") added similar lines to dmesg: initlevel:0=early, 4 registered initcalls initlevel:1=core, 31 registered initcalls initlevel:2=postcore, 11 registered initcalls initlevel:3=arch, 7 registered initcalls initlevel:4=subsys, 40 registered initcalls initlevel:5=fs, 30 registered initcalls initlevel:6=device, 250 registered initcalls initlevel:7=late, 35 registered initcalls but they don't contain any info for the general user staring at dmesg. I'm very doubtful the count of initcalls registered per level helps anyone so drop that output completely. Cc: Jim Cromie Cc: Rusty Russell Cc: Jason Baron Signed-off-by: Borislav Petkov Signed-off-by: Rusty Russell --- init/main.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/init/main.c b/init/main.c index 37e12098eac1..b5cc0a7c4708 100644 --- a/init/main.c +++ b/init/main.c @@ -755,13 +755,8 @@ static void __init do_initcalls(void) { int level; - for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) { - pr_info("initlevel:%d=%s, %d registered initcalls\n", - level, initcall_level_names[level], - (int) (initcall_levels[level+1] - - initcall_levels[level])); + for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) do_initcall_level(level); - } } /* From bd2753b2dda7bb43c7468826de75f49c6a7e8965 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 Jun 2012 10:55:40 -0700 Subject: [PATCH 104/117] x86/mm: Only add extra pages count for the first memory range during pre-allocation early page table space Robin found this regression: | I just tried to boot an 8TB system. It fails very early in boot with: | Kernel panic - not syncing: Cannot find space for the kernel page tables git bisect commit 722bc6b16771ed80871e1fd81c86d3627dda2ac8. A git revert of that commit does boot past that point on the 8TB configuration. That commit will add up extra pages for all memory range even above 4g. Try to limit that extra page count adding to first entry only. Bisected-by: Robin Holt Tested-by: Robin Holt Signed-off-by: Yinghai Lu Cc: WANG Cong Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/CAE9FiQUj3wyzQxtq9yzBNc9u220p8JZ1FYHG7t%3DMOzJ%3D9BZMYA@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 97141c26a13a..bc4e9d84157f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -62,7 +62,8 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en extra += PMD_SIZE; #endif /* The first 2/4M doesn't use large pages. */ - extra += mr->end - mr->start; + if (mr->start < PMD_SIZE) + extra += mr->end - mr->start; ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else From d5d2d2eea84b0d8450b082edbc3dbde41fb8bfd8 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 7 Jun 2012 08:31:40 -0500 Subject: [PATCH 105/117] x86/uv: Fix UV2 BAU legacy mode The SGI Altix UV2 BAU (Broadcast Assist Unit) as used for tlb-shootdown (selective broadcast mode) always uses UV2 broadcast descriptor format. There is no need to clear the 'legacy' (UV1) mode, because the hardware always uses UV2 mode for selective broadcast. But the BIOS uses general broadcast and legacy mode, and the hardware pays attention to the legacy mode bit for general broadcast. So the kernel must not clear that mode bit. Signed-off-by: Cliff Wickman Cc: Link: http://lkml.kernel.org/r/E1SccoO-0002Lh-Cb@eag09.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 - arch/x86/platform/uv/tlb_uv.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index becf47b81735..6149b476d9df 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -149,7 +149,6 @@ /* 4 bits of software ack period */ #define UV2_ACK_MASK 0x7UL #define UV2_ACK_UNITS_SHFT 3 -#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT #define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT /* diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3ae0e61abd23..59880afa851f 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1295,7 +1295,6 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image &= ~(1L << UV2_LEG_SHFT); mmr_image |= (1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); From 3c75296562f43e6fbc6cddd3de948a7b3e4e9bcf Mon Sep 17 00:00:00 2001 From: Steffen Rumler Date: Wed, 6 Jun 2012 16:37:17 +0200 Subject: [PATCH 106/117] powerpc: Fix kernel panic during kernel module load This fixes a problem which can causes kernel oopses while loading a kernel module. According to the PowerPC EABI specification, GPR r11 is assigned the dedicated function to point to the previous stack frame. In the powerpc-specific kernel module loader, do_plt_call() (in arch/powerpc/kernel/module_32.c), GPR r11 is also used to generate trampoline code. This combination crashes the kernel, in the case where the compiler chooses to use a helper function for saving GPRs on entry, and the module loader has placed the .init.text section far away from the .text section, meaning that it has to generate a trampoline for functions in the .init.text section to call the GPR save helper. Because the trampoline trashes r11, references to the stack frame using r11 can cause an oops. The fix just uses GPR r12 instead of GPR r11 for generating the trampoline code. According to the statements from Freescale, this is safe from an EABI perspective. I've tested the fix for kernel 2.6.33 on MPC8541. Cc: stable@vger.kernel.org Signed-off-by: Steffen Rumler [paulus@samba.org: reworded the description] Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/module_32.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 0b6d79617d7b..2e3200ca485f 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -176,8 +176,8 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr, static inline int entry_matches(struct ppc_plt_entry *entry, Elf32_Addr val) { - if (entry->jump[0] == 0x3d600000 + ((val + 0x8000) >> 16) - && entry->jump[1] == 0x396b0000 + (val & 0xffff)) + if (entry->jump[0] == 0x3d800000 + ((val + 0x8000) >> 16) + && entry->jump[1] == 0x398c0000 + (val & 0xffff)) return 1; return 0; } @@ -204,10 +204,9 @@ static uint32_t do_plt_call(void *location, entry++; } - /* Stolen from Paul Mackerras as well... */ - entry->jump[0] = 0x3d600000+((val+0x8000)>>16); /* lis r11,sym@ha */ - entry->jump[1] = 0x396b0000 + (val&0xffff); /* addi r11,r11,sym@l*/ - entry->jump[2] = 0x7d6903a6; /* mtctr r11 */ + entry->jump[0] = 0x3d800000+((val+0x8000)>>16); /* lis r12,sym@ha */ + entry->jump[1] = 0x398c0000 + (val&0xffff); /* addi r12,r12,sym@l*/ + entry->jump[2] = 0x7d8903a6; /* mtctr r12 */ entry->jump[3] = 0x4e800420; /* bctr */ DEBUGP("Initialized plt for 0x%x at %p\n", val, entry); From eeaaa96a3a2134a174100afd129bb0891d05f4b2 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 6 Jun 2012 10:05:42 -0400 Subject: [PATCH 107/117] x86/nmi: Fix section mismatch warnings on 32-bit It was reported that compiling for 32-bit caused a bunch of section mismatch warnings: VDSOSYM arch/x86/vdso/vdso32-syms.lds LD arch/x86/vdso/built-in.o LD arch/x86/built-in.o WARNING: arch/x86/built-in.o(.data+0x5af0): Section mismatch in reference from the variable test_nmi_ipi_callback_na.10451 to the function .init.text:test_nmi_ipi_callback() [...] WARNING: arch/x86/built-in.o(.data+0x5b04): Section mismatch in reference from the variable nmi_unk_cb_na.10399 to the function .init.text:nmi_unk_cb() The variable nmi_unk_cb_na.10399 references the function __init nmi_unk_cb() [...] Both of these are attributed to the internal representation of the nmiaction struct created during register_nmi_handler. The reason for this is that those structs are not defined in the init section whereas the rest of the code in nmi_selftest.c is. To resolve this, I created a new #define, register_nmi_handler_initonly, that tags the struct as __initdata to resolve the mismatch. This #define should only be used in rare situations where the register/unregister is called during init of the kernel. Big thanks to Jan Beulich for decoding this for me as I didn't have a clue what was going on. Reported-by: Witold Baryluk Tested-by: Witold Baryluk Cc: Jan Beulich Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1338991542-23000-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 14 ++++++++++++++ arch/x86/kernel/nmi_selftest.c | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 0e3793b821ef..dc580c42851c 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -54,6 +54,20 @@ struct nmiaction { __register_nmi_handler((t), &fn##_na); \ }) +/* + * For special handlers that register/unregister in the + * init section only. This should be considered rare. + */ +#define register_nmi_handler_initonly(t, fn, fg, n) \ +({ \ + static struct nmiaction fn##_na __initdata = { \ + .handler = (fn), \ + .name = (n), \ + .flags = (fg), \ + }; \ + __register_nmi_handler((t), &fn##_na); \ +}) + int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index e31bf8d5c4d2..149b8d9c6ad4 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,7 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); } static void __init cleanup_nmi_testsuite(void) @@ -64,7 +64,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; - if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, NMI_FLAG_FIRST, "nmi_selftest")) { nmi_fail = FAILURE; return; From 32ba9c3fcab960f0b0d332c86ebcd2c4870d9bb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Jun 2012 10:34:03 -0700 Subject: [PATCH 108/117] Revert "vfs: stop d_splice_alias creating directory aliases" This reverts commit 7732a557b1342c6e6966efb5f07effcf99f56167 (and commit 3f50fff4dace23d3cfeb195d5cd4ee813cee68b7, which was a follow-up cleanup). We're chasing an elusive bug that Dave Jones can apparently reproduce using his system call fuzzer tool, and that looks like some kind of locking ordering problem on the directory i_mutex chain. Our i_mutex locking is rather complex, and depends on the topological ordering of the directories, which is why we have been very wary of splicing directory entries around. Of course, we really don't want to ever see aliased unconnected directories anyway, so none of this should ever happen, but this revert aims to basically get us back to a known older state. Bruce points to some of the previous discussion at http://marc.info/?i=<20110310105821.GE22723@ZenIV.linux.org.uk> and in particular a long post from Neil: http://marc.info/?i=<20110311150749.2fa2be66@notabene.brown> It should be noted that it's possible that Dave's problems come from other changes altohgether, including possibly just the fact that Dave constantly is teachning his fuzzer new tricks. So what appears to be a new bug could in fact be an old one that just gets newly triggered, but reverting these patches as "still under heavy discussion" is the right thing regardless. Requested-by: Al Viro Acked-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- fs/dcache.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 85c9e2bff8e6..40469044088d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -683,6 +683,8 @@ EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question + * @want_discon: flag, used by d_splice_alias, to request + * that only a DISCONNECTED alias be returned. * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. @@ -691,9 +693,10 @@ EXPORT_SYMBOL(dget_parent); * of a filesystem. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that. + * any other hashed alias over that one unless @want_discon is set, + * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. */ -static struct dentry *__d_find_alias(struct inode *inode) +static struct dentry *__d_find_alias(struct inode *inode, int want_discon) { struct dentry *alias, *discon_alias; @@ -705,7 +708,7 @@ again: if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; - } else { + } else if (!want_discon) { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; @@ -736,7 +739,7 @@ struct dentry *d_find_alias(struct inode *inode) if (!list_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); - de = __d_find_alias(inode); + de = __d_find_alias(inode, 0); spin_unlock(&inode->i_lock); } return de; @@ -1647,8 +1650,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (inode && S_ISDIR(inode->i_mode)) { spin_lock(&inode->i_lock); - new = __d_find_any_alias(inode); + new = __d_find_alias(inode, 1); if (new) { + BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); d_move(new, dentry); @@ -2478,7 +2482,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) struct dentry *alias; /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode); + alias = __d_find_alias(inode, 0); if (alias) { actual = alias; write_seqlock(&rename_lock); From 8f53369b753f5f4c7684c2eb0b592152abb1dd00 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Jun 2012 14:53:06 -0700 Subject: [PATCH 109/117] Revert "drm/i915/crt: Do not rely upon the HPD presence pin" This reverts commit 9e612a008fa7fe493a473454def56aa321479495. It incorrectly finds VGA connectors where none are attached, apparently not noticing that nothing replied to the EDID queries, and happily using the default EDID modes that have nothing to do with actual hardware. That in turn then causes X to fall down to the lowest common denominator, which is usually the default 1024x768 mode that is in the default EDID and pretty much anything supports). I'd suggest that if not relying on the HDP pin, the code should at least check whether it gets valid EDID data back, rather than just assume there's something on the VGA connector. Cc: Dave Airlie Cc: Chris Wilson Cc: Daniel Vetter Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/intel_crt.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 844e93e1e395..75a70c46ef1b 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -453,15 +453,13 @@ intel_crt_detect(struct drm_connector *connector, bool force) struct intel_load_detect_pipe tmp; if (I915_HAS_HOTPLUG(dev)) { - /* We can not rely on the HPD pin always being correctly wired - * up, for example many KVM do not pass it through, and so - * only trust an assertion that the monitor is connected. - */ if (intel_crt_detect_hotplug(connector)) { DRM_DEBUG_KMS("CRT detected via hotplug\n"); return connector_status_connected; - } else + } else { DRM_DEBUG_KMS("CRT not detected via hotplug\n"); + return connector_status_disconnected; + } } if (intel_crt_detect_ddc(connector)) From cd96891d48a945ca2011fbeceda73813d6286195 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Jun 2012 13:18:33 -0700 Subject: [PATCH 110/117] sched/fair: fix lots of kernel-doc warnings Fix lots of new kernel-doc warnings in kernel/sched/fair.c: Warning(kernel/sched/fair.c:3625): No description found for parameter 'env' Warning(kernel/sched/fair.c:3625): Excess function parameter 'sd' description in 'update_sg_lb_stats' Warning(kernel/sched/fair.c:3735): No description found for parameter 'env' Warning(kernel/sched/fair.c:3735): Excess function parameter 'sd' description in 'update_sd_pick_busiest' Warning(kernel/sched/fair.c:3735): Excess function parameter 'this_cpu' description in 'update_sd_pick_busiest' .. more warnings Signed-off-by: Randy Dunlap Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27b..d5583f9588e7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3632,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. + * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. @@ -3741,11 +3741,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, /** * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked + * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics - * @this_cpu: the current cpu * * Determine if @sg is a busier group than the previously selected * busiest group. @@ -3783,9 +3782,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu + * @env: The load balancing environment. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -3874,10 +3871,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, * Returns 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * - * @sd: The sched_domain whose packing is to be checked. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing. */ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) { @@ -3903,9 +3898,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during * load balancing. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. */ static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) @@ -4048,11 +4042,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * Also calculates the amount of weighted load which should be moved * to restore balance. * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. + * @env: The load balancing environment. * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. From 1e11ad8dc42975d5c2bab7d478f6cd875602eda4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 8 Jun 2012 13:21:26 -0700 Subject: [PATCH 111/117] mm, oom: fix badness score underflow If the privileges given to root threads (3% of allowable memory) or a negative value of /proc/pid/oom_score_adj happen to exceed the amount of rss of a thread, its badness score overflows as a result of commit a7f638f999ff ("mm, oom: normalize oom scores to oom_score_adj scale only for userspace"). Fix this by making the type signed and return 1, meaning the thread is still eligible for kill, if the value is negative. Reported-by: Dave Jones Acked-by: Oleg Nesterov Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ed0e19677360..416637f0e924 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -183,7 +183,7 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points; + long points; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -223,7 +223,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * Never return 0 for an eligible task regardless of the root bonus and * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). */ - return points ? points : 1; + return points > 0 ? points : 1; } /* From cfaf025112d3856637ff34a767ef785ef5cf2ca9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Jun 2012 18:40:09 -0700 Subject: [PATCH 112/117] Linux 3.5-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d718ede9ea5..d845c2a1aa68 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Saber-toothed Squirrel # *DOCUMENTATION* From 19f5f7364a1cc770b14692f609bb9b802fc334d5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 27 Jul 2011 17:29:28 +0200 Subject: [PATCH 113/117] nohz: Separate idle sleeping time accounting from nohz logic As we plan to be able to stop the tick outside the idle task, we need to prepare for separating nohz logic from idle. As a start, this pulls the idle sleeping time accounting out of the tick stop/restart API to the callers on idle entry/exit. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 77 ++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da70c6db496c..81409bba2425 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,10 +271,10 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, now; + ktime_t last_update, expires; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; @@ -282,8 +282,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); - now = tick_nohz_start_idle(cpu, ts); - /* * If this cpu is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by @@ -444,6 +442,14 @@ out: ts->sleep_length = ktime_sub(dev->next_event, now); } +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ + ktime_t now; + + now = tick_nohz_start_idle(smp_processor_id(), ts); + tick_nohz_stop_sched_tick(ts, now); +} + /** * tick_nohz_idle_enter - stop the idle tick from the idle task * @@ -479,7 +485,7 @@ void tick_nohz_idle_enter(void) * update of the idle time accounting in tick_nohz_start_idle(). */ ts->inidle = 1; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); local_irq_enable(); } @@ -499,7 +505,7 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); } /** @@ -540,39 +546,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } } -/** - * tick_nohz_idle_exit - restart the idle tick from the idle task - * - * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. - */ -void tick_nohz_idle_exit(void) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); #ifndef CONFIG_VIRT_CPU_ACCOUNTING unsigned long ticks; #endif - ktime_t now; - - local_irq_disable(); - - WARN_ON_ONCE(!ts->inidle); - - ts->inidle = 0; - - if (ts->idle_active || ts->tick_stopped) - now = ktime_get(); - - if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); - - if (!ts->tick_stopped) { - local_irq_enable(); - return; - } - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); @@ -600,6 +578,35 @@ void tick_nohz_idle_exit(void) ts->idle_exittime = now; tick_nohz_restart(ts, now); +} + +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. + */ +void tick_nohz_idle_exit(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; + + local_irq_disable(); + + WARN_ON_ONCE(!ts->inidle); + + ts->inidle = 0; + + if (ts->idle_active || ts->tick_stopped) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + + if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, now); local_irq_enable(); } From 2ac0d98fd624ae50f5e6ae9c800977a9dbbfcde6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jul 2011 04:00:47 +0200 Subject: [PATCH 114/117] nohz: Make nohz API agnostic against idle ticks cputime accounting When the timer tick fires, it accounts the new jiffy as either part of system, user or idle time. This is how we record the cputime statistics. But when the tick is stopped from the idle task, we still need to record the number of jiffies spent tickless until we restart the tick and fall back to traditional tick-based cputime accounting. To do this, we take a snapshot of jiffies when the tick is stopped and compute the difference against the new value of jiffies when the tick is restarted. Then we account this whole difference to the idle cputime. However we are preparing to be able to stop the tick from other places than idle. So this idle time accounting needs to be performed from the callers of nohz APIs, not from the nohz APIs themselves because we now want them to be agnostic against places that stop/restart tick. Therefore, we pull the tickless idle time accounting out of generic nohz helpers up to idle entry/exit callers. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81409bba2425..911834b33b8a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -402,7 +402,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; } ts->idle_sleeps++; @@ -445,9 +444,13 @@ out: static void __tick_nohz_idle_enter(struct tick_sched *ts) { ktime_t now; + int was_stopped = ts->tick_stopped; now = tick_nohz_start_idle(smp_processor_id(), ts); tick_nohz_stop_sched_tick(ts, now); + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; } /** @@ -548,15 +551,25 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - unsigned long ticks; -#endif /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); update_cpu_load_nohz(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ #ifndef CONFIG_VIRT_CPU_ACCOUNTING + unsigned long ticks; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -569,15 +582,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); #endif - - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); } /** @@ -605,8 +609,10 @@ void tick_nohz_idle_exit(void) if (ts->idle_active) tick_nohz_stop_idle(cpu, now); - if (ts->tick_stopped) + if (ts->tick_stopped) { tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); + } local_irq_enable(); } @@ -811,7 +817,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - ts->idle_jiffies++; + if (idle_cpu(cpu)) + ts->idle_jiffies++; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); From f5d411c91ede162240f34e05a233f2759412988e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 31 Jul 2011 17:44:12 +0200 Subject: [PATCH 115/117] nohz: Rename ts->idle_tick to ts->last_tick Now that idle and nohz logics are going to be independant each others, ts->idle_tick becomes too much a biased name to describe the field that saves the last scheduled tick on top of which we re-calculate the next tick to schedule when the timer is restarted. We want to reuse this even to stop the tick outside idle cases. So let's rename it to some more generic name: ts->last_tick. This changes a bit the timer list stat export so we need to increase its version. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- include/linux/tick.h | 8 ++++---- kernel/time/tick-sched.c | 4 ++-- kernel/time/timer_list.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index ab8be90b5cc9..f37fceb69b73 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -31,10 +31,10 @@ enum tick_nohz_mode { * struct tick_sched - sched tick emulation and no idle tick control/stats * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode - * @idle_tick: Store the last idle tick expiry time when the tick - * timer is modified for idle sleeps. This is necessary + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline - * when the CPU returns from idle + * when the CPU returns from nohz sleep. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -51,7 +51,7 @@ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; - ktime_t idle_tick; + ktime_t last_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 911834b33b8a..73cc4901336d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -400,7 +400,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) if (!ts->tick_stopped) { select_nohz_load_balancer(1); - ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; } @@ -526,7 +526,7 @@ ktime_t tick_nohz_get_sleep_length(void) static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); while (1) { /* Forward the time to expire in the future */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3258455549f4..af5a7e9f164b 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); - P_ns(idle_tick); + P_ns(last_tick); P(tick_stopped); P(idle_jiffies); P(idle_calls); @@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.6\n"); + SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); From 5b39939a40801f0c17e31adaf643d6e974227856 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Aug 2011 00:06:10 +0200 Subject: [PATCH 116/117] nohz: Move ts->idle_calls incrementation into strict idle logic Since we want to prepare for making the nohz API to work further the idle case, we need to pull ts->idle_calls incrementation up to the callers in idle. To perform this, we split tick_nohz_stop_sched_tick() in two parts: a first one that checks if we can really stop the tick for idle, and another that actually stops it. Then from the callers in idle, we check if we can stop the tick and only then we increment idle_calls and finally relay to the nohz API that won't care about these details anymore. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 86 ++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 73cc4901336d..430e1b6901cc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,47 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; - int cpu; - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - } - - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - return; - - if (need_resched()) - return; - - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } - return; - } - - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&xtime_lock); @@ -441,16 +409,56 @@ out: ts->sleep_length = ktime_sub(dev->next_event, now); } +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + return false; + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + return true; +} + static void __tick_nohz_idle_enter(struct tick_sched *ts) { ktime_t now; - int was_stopped = ts->tick_stopped; + int cpu = smp_processor_id(); - now = tick_nohz_start_idle(smp_processor_id(), ts); - tick_nohz_stop_sched_tick(ts, now); + now = tick_nohz_start_idle(cpu, ts); - if (!was_stopped && ts->tick_stopped) - ts->idle_jiffies = ts->last_jiffies; + if (can_stop_idle_tick(cpu, ts)) { + int was_stopped = ts->tick_stopped; + + ts->idle_calls++; + tick_nohz_stop_sched_tick(ts, now, cpu); + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; + } } /** From 84bf1bccc60cc64376125ea2eae05e4ba12f795b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Aug 2011 01:25:38 +0200 Subject: [PATCH 117/117] nohz: Move next idle expiry time record into idle logic area The next idle expiry time record and idle sleeps tracking are statistics that only concern idle. Since we want the nohz APIs to become usable further idle context, let's pull up the handling of these statistics to the callers in idle. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 430e1b6901cc..60c9c60e9108 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,11 +271,11 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires; + ktime_t last_update, expires, ret = { .tv64 = 0 }; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -358,6 +358,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; + ret = expires; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -372,11 +374,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->tick_stopped = 1; } - ts->idle_sleeps++; - - /* Mark expires */ - ts->idle_expires = expires; - /* * If the expiration time == KTIME_MAX, then * in this case we simply stop the tick timer. @@ -407,6 +404,8 @@ out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); + + return ret; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) @@ -445,7 +444,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) static void __tick_nohz_idle_enter(struct tick_sched *ts) { - ktime_t now; + ktime_t now, expires; int cpu = smp_processor_id(); now = tick_nohz_start_idle(cpu, ts); @@ -454,7 +453,12 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) int was_stopped = ts->tick_stopped; ts->idle_calls++; - tick_nohz_stop_sched_tick(ts, now, cpu); + + expires = tick_nohz_stop_sched_tick(ts, now, cpu); + if (expires.tv64 > 0LL) { + ts->idle_sleeps++; + ts->idle_expires = expires; + } if (!was_stopped && ts->tick_stopped) ts->idle_jiffies = ts->last_jiffies;