From 0e78a87306a6f55b1c7bbafad1de62c3975953ca Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 3 May 2017 08:44:27 +0200 Subject: [PATCH 001/254] esp4: Fix udpencap for local TCP packets. Locally generated TCP packets are usually cloned, so we do skb_cow_data() on this packets. After that we need to reload the pointer to the esp header. On udpencap this header has an offset to skb_transport_header, so take this offset into account. Fixes: 67d349ed603 ("net/esp4: Fix invalid esph pointer crash") Fixes: fca11ebde3f0 ("esp4: Reorganize esp_output") Reported-by: Don Bowman Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 65cc02bd82bc..93322f895eab 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; @@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); - esp->esph = ip_esp_hdr(skb); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); From 9b3eb54106cf6acd03f07cf0ab01c13676a226c2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 3 May 2017 16:43:19 +0200 Subject: [PATCH 002/254] xfrm: fix stack access out of bounds with CONFIG_XFRM_SUB_POLICY When CONFIG_XFRM_SUB_POLICY=y, xfrm_dst stores a copy of the flowi for that dst. Unfortunately, the code that allocates and fills this copy doesn't care about what type of flowi (flowi, flowi4, flowi6) gets passed. In multiple code paths (from raw_sendmsg, from TCP when replying to a FIN, in vxlan, geneve, and gre), the flowi that gets passed to xfrm is actually an on-stack flowi4, so we end up reading stuff from the stack past the end of the flowi4 struct. Since xfrm_dst->origin isn't used anywhere following commit ca116922afa8 ("xfrm: Eliminate "fl" and "pol" args to xfrm_bundle_ok()."), just get rid of it. xfrm_dst->partner isn't used either, so get rid of that too. Fixes: 9d6ec938019c ("ipv4: Use flowi4 in public route lookup interfaces.") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 10 --------- net/xfrm/xfrm_policy.c | 47 ------------------------------------------ 2 files changed, 57 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6793a30c66b1..7e7e2b0d2915 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -979,10 +979,6 @@ struct xfrm_dst { struct flow_cache_object flo; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; -#ifdef CONFIG_XFRM_SUB_POLICY - struct flowi *origin; - struct xfrm_selector *partner; -#endif u32 xfrm_genid; u32 policy_genid; u32 route_mtu_cached; @@ -998,12 +994,6 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); -#ifdef CONFIG_XFRM_SUB_POLICY - kfree(xdst->origin); - xdst->origin = NULL; - kfree(xdst->partner); - xdst->partner = NULL; -#endif } #endif diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b00a1d5a7f52..ed4e52d95172 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1797,43 +1797,6 @@ free_dst: goto out; } -#ifdef CONFIG_XFRM_SUB_POLICY -static int xfrm_dst_alloc_copy(void **target, const void *src, int size) -{ - if (!*target) { - *target = kmalloc(size, GFP_ATOMIC); - if (!*target) - return -ENOMEM; - } - - memcpy(*target, src, size); - return 0; -} -#endif - -static int xfrm_dst_update_parent(struct dst_entry *dst, - const struct xfrm_selector *sel) -{ -#ifdef CONFIG_XFRM_SUB_POLICY - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - return xfrm_dst_alloc_copy((void **)&(xdst->partner), - sel, sizeof(*sel)); -#else - return 0; -#endif -} - -static int xfrm_dst_update_origin(struct dst_entry *dst, - const struct flowi *fl) -{ -#ifdef CONFIG_XFRM_SUB_POLICY - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); -#else - return 0; -#endif -} - static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) @@ -1905,16 +1868,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; - if (num_pols > 1) - err = xfrm_dst_update_parent(dst, &pols[1]->selector); - else - err = xfrm_dst_update_origin(dst, fl); - if (unlikely(err)) { - dst_free(dst); - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); - return ERR_PTR(err); - } - xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); From d86ab9cff8b936aadde444d0e263a8db5ff0349b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 3 May 2017 14:30:48 +0100 Subject: [PATCH 003/254] cpufreq: schedutil: use now as reference when aggregating shared policy requests Currently, sugov_next_freq_shared() uses last_freq_update_time as a reference to decide when to start considering CPU contributions as stale. However, since last_freq_update_time is set by the last CPU that issued a frequency transition, this might cause problems in certain cases. In practice, the detection of stale utilization values fails whenever the CPU with such values was the last to update the policy. For example (and please note again that the SCHED_CPUFREQ_RT flag is not the problem here, but only the detection of after how much time that flag has to be considered stale), suppose a policy with 2 CPUs: CPU0 | CPU1 | | RT task scheduled | SCHED_CPUFREQ_RT is set | CPU1->last_update = now | freq transition to max | last_freq_update_time = now | more than TICK_NSEC nsecs | a small CFS wakes up | CPU0->last_update = now1 | delta_ns(CPU0) < TICK_NSEC* | CPU0's util is considered | delta_ns(CPU1) = | last_freq_update_time - | CPU1->last_update = 0 | < TICK_NSEC | CPU1 is still considered | CPU1->SCHED_CPUFREQ_RT is set | we stay at max (until CPU1 | exits from idle) | * delta_ns is actually negative as now1 > last_freq_update_time While last_freq_update_time is a sensible reference for rate limiting, it doesn't seem to be useful for working around stale CPU states. Fix the problem by always considering now (time) as the reference for deciding when CPUs have stale contributions. Signed-off-by: Juri Lelli Acked-by: Vincent Guittot Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..622eed1b7658 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -245,11 +245,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned long util = 0, max = 1; unsigned int j; @@ -265,7 +264,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) * enough, don't take the CPU into account as it probably is * idle now (and clear iowait_boost for it). */ - delta_ns = last_freq_update_time - j_sg_cpu->last_update; + delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; continue; @@ -309,7 +308,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) next_f = sg_policy->policy->cpuinfo.max_freq; else - next_f = sugov_next_freq_shared(sg_cpu); + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } From d90c902449a7561f1b1d58ba5a0d11728ce8b0b2 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 5 May 2017 07:40:42 +0200 Subject: [PATCH 004/254] af_key: Fix slab-out-of-bounds in pfkey_compile_policy. The sadb_x_sec_len is stored in the unit 'byte divided by eight'. So we have to multiply this value by eight before we can do size checks. Otherwise we may get a slab-out-of-bounds when we memcpy the user sec_ctx. Fixes: df71837d502 ("[LSM-IPSec]: Security association restriction.") Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Steffen Klassert --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index c1950bb14735..512dc43d0ce6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3285,7 +3285,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + - sec_ctx->sadb_x_sec_len) { + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } From 2c1497bbc8fdee897341ab48ee9c9209b421b8c0 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Mon, 8 May 2017 10:30:18 +0300 Subject: [PATCH 005/254] xfrm: Fix NETDEV_DOWN with IPSec offload Upon NETDEV_DOWN event, all xfrm_state objects which are bound to the device are flushed. The condition for this is wrong, though, testing dev->hw_features instead of dev->features. If a device has non-user-modifiable NETIF_F_HW_ESP, then its xfrm_state objects are not flushed, causing a crash later on after the device is deleted. Check dev->features instead of dev->hw_features. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Ilan Tayari Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 8ec8a3fcf8d4..574e6f32f94f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -170,7 +170,7 @@ static int xfrm_dev_feat_change(struct net_device *dev) static int xfrm_dev_down(struct net_device *dev) { - if (dev->hw_features & NETIF_F_HW_ESP) + if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); xfrm_garbage_collect(dev_net(dev)); From 3c5ab3f395d66a9e4e937fcfdf6ebc63894f028b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 29 Apr 2017 20:33:09 +0300 Subject: [PATCH 006/254] ipvs: SNAT packet replies only for NATed connections We do not check if packet from real server is for NAT connection before performing SNAT. This causes problems for setups that use DR/TUN and allow local clients to access the real server directly, for example: - local client in director creates IPVS-DR/TUN connection CIP->VIP and the request packets are routed to RIP. Talks are finished but IPVS connection is not expired yet. - second local client creates non-IPVS connection CIP->RIP with same reply tuple RIP->CIP and when replies are received on LOCAL_IN we wrongly assign them for the first client connection because RIP->CIP matches the reply direction. As result, IPVS SNATs replies for non-IPVS connections. The problem is more visible to local UDP clients but in rare cases it can happen also for TCP or remote clients when the real server sends the reply traffic via the director. So, better to be more precise for the reply traffic. As replies are not expected for DR/TUN connections, better to not touch them. Reported-by: Nick Moriarty Tested-by: Nick Moriarty Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d2d7bdf1d510..ad99c1ceea6f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -849,10 +849,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, { unsigned int verdict = NF_DROP; - if (IP_VS_FWD_METHOD(cp) != 0) { - pr_err("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto ignore_cp; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { @@ -886,6 +884,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); + +ignore_cp: verdict = NF_ACCEPT; out: @@ -1385,8 +1385,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in */ cp = pp->conn_out_get(ipvs, af, skb, &iph); - if (likely(cp)) + if (likely(cp)) { + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto ignore_cp; return handle_response(af, skb, pd, cp, &iph, hooknum); + } /* Check for real-server-started requests */ if (atomic_read(&ipvs->conn_out_counter)) { @@ -1444,9 +1447,15 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in } } } + +out: IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; + +ignore_cp: + __ip_vs_conn_put(cp); + goto out; } /* From 29f6ca6916e29fc46f1418885374d9ed50430687 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 May 2017 14:59:02 +0900 Subject: [PATCH 007/254] scsi: sd: Unlock zone in case of error in sd_setup_write_same_cmnd() scsi_io_init() may fail, leaving a zone of a zoned block device locked. Fix this by properly unlocking the write same request target zone if scsi_io_init() fails. Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f9d1432d7cc5..e60a309b26bf 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -948,6 +948,10 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) rq->__data_len = sdp->sector_size; ret = scsi_init_io(cmd); rq->__data_len = nr_bytes; + + if (sd_is_zoned(sdkp) && ret != BLKPREP_OK) + sd_zbc_write_unlock_zone(cmd); + return ret; } From ed44fd7fd8a6785b73cfc6d44594c434e578d724 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 May 2017 15:48:19 +0900 Subject: [PATCH 008/254] scsi: sd: Write lock zone for REQ_OP_WRITE_ZEROES For a zoned block device, sd_zbc_complete() handles zone write unlock on completion of a REQ_OP_WRITE_ZEROES command but the zone write locking is missing from sd_setup_write_zeroes_cmnd(). This patch fixes this problem by locking the target zone of a REQ_OP_WRITE_ZEROES request. Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e60a309b26bf..de9e2f2ef662 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -827,21 +827,32 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9); u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9); + int ret; if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { case SD_ZERO_WS16_UNMAP: - return sd_setup_write_same16_cmnd(cmd, true); + ret = sd_setup_write_same16_cmnd(cmd, true); + goto out; case SD_ZERO_WS10_UNMAP: - return sd_setup_write_same10_cmnd(cmd, true); + ret = sd_setup_write_same10_cmnd(cmd, true); + goto out; } } if (sdp->no_write_same) return BLKPREP_INVALID; + if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff) - return sd_setup_write_same16_cmnd(cmd, false); - return sd_setup_write_same10_cmnd(cmd, false); + ret = sd_setup_write_same16_cmnd(cmd, false); + else + ret = sd_setup_write_same10_cmnd(cmd, false); + +out: + if (sd_is_zoned(sdkp) && ret == BLKPREP_OK) + return sd_zbc_write_lock_zone(cmd); + + return ret; } static void sd_config_write_same(struct scsi_disk *sdkp) From 48ae8484e9fc324b4968d33c585e54bc98e44d61 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 10 May 2017 09:53:40 +0200 Subject: [PATCH 009/254] scsi: sg: don't return bogus Sg_requests If the list search in sg_get_rq_mark() fails to find a valid request, we return a bogus element. This then can later lead to a GPF in sg_remove_scat(). So don't return bogus Sg_requests in sg_get_rq_mark() but NULL in case the list search doesn't find a valid request. Signed-off-by: Johannes Thumshirn Reported-by: Andrey Konovalov Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Doug Gilbert Reviewed-by: Hannes Reinecke Acked-by: Doug Gilbert Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0a38ba01b7b4..82c33a6edbea 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -2074,11 +2074,12 @@ sg_get_rq_mark(Sg_fd * sfp, int pack_id) if ((1 == resp->done) && (!resp->sg_io_owned) && ((-1 == pack_id) || (resp->header.pack_id == pack_id))) { resp->done = 2; /* guard against other readers */ - break; + write_unlock_irqrestore(&sfp->rq_list_lock, iflags); + return resp; } } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); - return resp; + return NULL; } /* always adds to end of list */ From bdce57e7ae9cac56cda958029104c752afaf0894 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Fri, 3 Mar 2017 09:44:11 -0500 Subject: [PATCH 010/254] tools/power/acpi: Add .gitignore file Add a .gitignore file so that git commands do not pick up the resulting binaries and directories. Signed-off-by: Prarit Bhargava Acked-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- tools/power/acpi/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 tools/power/acpi/.gitignore diff --git a/tools/power/acpi/.gitignore b/tools/power/acpi/.gitignore new file mode 100644 index 000000000000..cba3d994995c --- /dev/null +++ b/tools/power/acpi/.gitignore @@ -0,0 +1,4 @@ +acpidbg +acpidump +ec +include From f369fdf4f661322b73f3307e9f3cd55fb3a20123 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Tue, 9 May 2017 15:02:22 +0800 Subject: [PATCH 011/254] Revert "ACPI / button: Remove lid_init_state=method mode" This reverts commit ecb10b694b72ca5ea51b3c90a71ff2a11963425a. The only expected ACPI control method lid device's usage model is 1. Listen to the lid notification, 2. Evaluate _LID after being notified by BIOS, 3. Suspend the system (if users configure to do so) after seeing "close". It's not ensured that BIOS will notify OS after boot/resume, and it's not ensured that BIOS will always generate "open" event upon opening the lid. But there are 2 wrong usage models: 1. When the lid device is responsible for suspend/resume the system, userspace requires to see "open" event to be paired with "close" after the system is resumed, or it will suspend the system again. 2. When an external monitor connects to the laptop attached docks, userspace requires to see "close" event after the system is resumed so that it can determine whether the internal display should remain dark and the external display should be lit on. After we made default kernel behavior to be suitable for usage model 1, users of usage model 2 start to report regressions for such behavior change. Reversion of button.lid_init_state=method doesn't actually reverts to old default behavior as doing so can enter a regression loop, but facilitates users to work the reported regressions around with button.lid_init_state=method. Fixes: ecb10b694b72 (ACPI / button: Remove lid_init_state=method mode) Cc: 4.11+ # 4.11+ Link: https://bugzilla.kernel.org/show_bug.cgi?id=195455 Link: https://bugzilla.redhat.com/show_bug.cgi?id=1430259 Tested-by: Steffen Weber Tested-by: Julian Wiedmann Reported-by: Joachim Frieben Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- Documentation/acpi/acpi-lid.txt | 16 ++++++++++++---- drivers/acpi/button.c | 9 +++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Documentation/acpi/acpi-lid.txt b/Documentation/acpi/acpi-lid.txt index 22cb3091f297..effe7af3a5af 100644 --- a/Documentation/acpi/acpi-lid.txt +++ b/Documentation/acpi/acpi-lid.txt @@ -59,20 +59,28 @@ button driver uses the following 3 modes in order not to trigger issues. If the userspace hasn't been prepared to ignore the unreliable "opened" events and the unreliable initial state notification, Linux users can use the following kernel parameters to handle the possible issues: -A. button.lid_init_state=open: +A. button.lid_init_state=method: + When this option is specified, the ACPI button driver reports the + initial lid state using the returning value of the _LID control method + and whether the "opened"/"closed" events are paired fully relies on the + firmware implementation. + This option can be used to fix some platforms where the returning value + of the _LID control method is reliable but the initial lid state + notification is missing. + This option is the default behavior during the period the userspace + isn't ready to handle the buggy AML tables. +B. button.lid_init_state=open: When this option is specified, the ACPI button driver always reports the initial lid state as "opened" and whether the "opened"/"closed" events are paired fully relies on the firmware implementation. This may fix some platforms where the returning value of the _LID control method is not reliable and the initial lid state notification is missing. - This option is the default behavior during the period the userspace - isn't ready to handle the buggy AML tables. If the userspace has been prepared to ignore the unreliable "opened" events and the unreliable initial state notification, Linux users should always use the following kernel parameter: -B. button.lid_init_state=ignore: +C. button.lid_init_state=ignore: When this option is specified, the ACPI button driver never reports the initial lid state and there is a compensation mechanism implemented to ensure that the reliable "closed" notifications can always be delievered diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 668137e4a069..6d5a8c1d3132 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -57,6 +57,7 @@ #define ACPI_BUTTON_LID_INIT_IGNORE 0x00 #define ACPI_BUTTON_LID_INIT_OPEN 0x01 +#define ACPI_BUTTON_LID_INIT_METHOD 0x02 #define _COMPONENT ACPI_BUTTON_COMPONENT ACPI_MODULE_NAME("button"); @@ -376,6 +377,9 @@ static void acpi_lid_initialize_state(struct acpi_device *device) case ACPI_BUTTON_LID_INIT_OPEN: (void)acpi_lid_notify_state(device, 1); break; + case ACPI_BUTTON_LID_INIT_METHOD: + (void)acpi_lid_update_state(device); + break; case ACPI_BUTTON_LID_INIT_IGNORE: default: break; @@ -559,6 +563,9 @@ static int param_set_lid_init_state(const char *val, struct kernel_param *kp) if (!strncmp(val, "open", sizeof("open") - 1)) { lid_init_state = ACPI_BUTTON_LID_INIT_OPEN; pr_info("Notify initial lid state as open\n"); + } else if (!strncmp(val, "method", sizeof("method") - 1)) { + lid_init_state = ACPI_BUTTON_LID_INIT_METHOD; + pr_info("Notify initial lid state with _LID return value\n"); } else if (!strncmp(val, "ignore", sizeof("ignore") - 1)) { lid_init_state = ACPI_BUTTON_LID_INIT_IGNORE; pr_info("Do not notify initial lid state\n"); @@ -572,6 +579,8 @@ static int param_get_lid_init_state(char *buffer, struct kernel_param *kp) switch (lid_init_state) { case ACPI_BUTTON_LID_INIT_OPEN: return sprintf(buffer, "open"); + case ACPI_BUTTON_LID_INIT_METHOD: + return sprintf(buffer, "method"); case ACPI_BUTTON_LID_INIT_IGNORE: return sprintf(buffer, "ignore"); default: From 33fc30b470983e5b641a16cccc882f6777dd50ef Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 14 May 2017 02:06:03 +0200 Subject: [PATCH 012/254] cpufreq: intel_pstate: Document the current behavior and user interface Add a document describing the current behavior and user space interface of the intel_pstate driver in the RST format and drop the existing outdated intel_pstate.txt document. Also update admin-guide/pm/cpufreq.rst with proper RST references to the new intel_pstate.rst document. Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/cpufreq.rst | 19 +- Documentation/admin-guide/pm/index.rst | 1 + Documentation/admin-guide/pm/intel_pstate.rst | 755 ++++++++++++++++++ Documentation/cpu-freq/intel-pstate.txt | 281 ------- 4 files changed, 766 insertions(+), 290 deletions(-) create mode 100644 Documentation/admin-guide/pm/intel_pstate.rst delete mode 100644 Documentation/cpu-freq/intel-pstate.txt diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index 289c80f7760e..09aa2e949787 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -1,4 +1,5 @@ .. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy ` +.. |intel_pstate| replace:: :doc:`intel_pstate ` ======================= CPU Performance Scaling @@ -75,7 +76,7 @@ feedback registers, as that information is typically specific to the hardware interface it comes from and may not be easily represented in an abstract, platform-independent way. For this reason, ``CPUFreq`` allows scaling drivers to bypass the governor layer and implement their own performance scaling -algorithms. That is done by the ``intel_pstate`` scaling driver. +algorithms. That is done by the |intel_pstate| scaling driver. ``CPUFreq`` Policy Objects @@ -174,13 +175,13 @@ necessary to restart the scaling governor so that it can take the new online CPU into account. That is achieved by invoking the governor's ``->stop`` and ``->start()`` callbacks, in this order, for the entire policy. -As mentioned before, the ``intel_pstate`` scaling driver bypasses the scaling +As mentioned before, the |intel_pstate| scaling driver bypasses the scaling governor layer of ``CPUFreq`` and provides its own P-state selection algorithms. -Consequently, if ``intel_pstate`` is used, scaling governors are not attached to +Consequently, if |intel_pstate| is used, scaling governors are not attached to new policy objects. Instead, the driver's ``->setpolicy()`` callback is invoked to register per-CPU utilization update callbacks for each policy. These callbacks are invoked by the CPU scheduler in the same way as for scaling -governors, but in the ``intel_pstate`` case they both determine the P-state to +governors, but in the |intel_pstate| case they both determine the P-state to use and change the hardware configuration accordingly in one go from scheduler context. @@ -257,7 +258,7 @@ are the following: ``scaling_available_governors`` List of ``CPUFreq`` scaling governors present in the kernel that can - be attached to this policy or (if the ``intel_pstate`` scaling driver is + be attached to this policy or (if the |intel_pstate| scaling driver is in use) list of scaling algorithms provided by the driver that can be applied to this policy. @@ -274,7 +275,7 @@ are the following: the CPU is actually running at (due to hardware design and other limitations). - Some scaling drivers (e.g. ``intel_pstate``) attempt to provide + Some scaling drivers (e.g. |intel_pstate|) attempt to provide information more precisely reflecting the current CPU frequency through this attribute, but that still may not be the exact current CPU frequency as seen by the hardware at the moment. @@ -284,13 +285,13 @@ are the following: ``scaling_governor`` The scaling governor currently attached to this policy or (if the - ``intel_pstate`` scaling driver is in use) the scaling algorithm + |intel_pstate| scaling driver is in use) the scaling algorithm provided by the driver that is currently applied to this policy. This attribute is read-write and writing to it will cause a new scaling governor to be attached to this policy or a new scaling algorithm provided by the scaling driver to be applied to it (in the - ``intel_pstate`` case), as indicated by the string written to this + |intel_pstate| case), as indicated by the string written to this attribute (which must be one of the names listed by the ``scaling_available_governors`` attribute described above). @@ -619,7 +620,7 @@ This file is located under :file:`/sys/devices/system/cpu/cpufreq/` and controls the "boost" setting for the whole system. It is not present if the underlying scaling driver does not support the frequency boost mechanism (or supports it, but provides a driver-specific interface for controlling it, like -``intel_pstate``). +|intel_pstate|). If the value in this file is 1, the frequency boost mechanism is enabled. This means that either the hardware can be put into states in which it is able to diff --git a/Documentation/admin-guide/pm/index.rst b/Documentation/admin-guide/pm/index.rst index c80f087321fc..7f148f76f432 100644 --- a/Documentation/admin-guide/pm/index.rst +++ b/Documentation/admin-guide/pm/index.rst @@ -6,6 +6,7 @@ Power Management :maxdepth: 2 cpufreq + intel_pstate .. only:: subproject and html diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst new file mode 100644 index 000000000000..33d703989ea8 --- /dev/null +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -0,0 +1,755 @@ +=============================================== +``intel_pstate`` CPU Performance Scaling Driver +=============================================== + +:: + + Copyright (c) 2017 Intel Corp., Rafael J. Wysocki + + +General Information +=================== + +``intel_pstate`` is a part of the +:doc:`CPU performance scaling subsystem ` in the Linux kernel +(``CPUFreq``). It is a scaling driver for the Sandy Bridge and later +generations of Intel processors. Note, however, that some of those processors +may not be supported. [To understand ``intel_pstate`` it is necessary to know +how ``CPUFreq`` works in general, so this is the time to read :doc:`cpufreq` if +you have not done that yet.] + +For the processors supported by ``intel_pstate``, the P-state concept is broader +than just an operating frequency or an operating performance point (see the +`LinuxCon Europe 2015 presentation by Kristen Accardi `_ for more +information about that). For this reason, the representation of P-states used +by ``intel_pstate`` internally follows the hardware specification (for details +refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual +Volume 3: System Programming Guide `_). However, the ``CPUFreq`` core +uses frequencies for identifying operating performance points of CPUs and +frequencies are involved in the user space interface exposed by it, so +``intel_pstate`` maps its internal representation of P-states to frequencies too +(fortunately, that mapping is unambiguous). At the same time, it would not be +practical for ``intel_pstate`` to supply the ``CPUFreq`` core with a table of +available frequencies due to the possible size of it, so the driver does not do +that. Some functionality of the core is limited by that. + +Since the hardware P-state selection interface used by ``intel_pstate`` is +available at the logical CPU level, the driver always works with individual +CPUs. Consequently, if ``intel_pstate`` is in use, every ``CPUFreq`` policy +object corresponds to one logical CPU and ``CPUFreq`` policies are effectively +equivalent to CPUs. In particular, this means that they become "inactive" every +time the corresponding CPU is taken offline and need to be re-initialized when +it goes back online. + +``intel_pstate`` is not modular, so it cannot be unloaded, which means that the +only way to pass early-configuration-time parameters to it is via the kernel +command line. However, its configuration can be adjusted via ``sysfs`` to a +great extent. In some configurations it even is possible to unregister it via +``sysfs`` which allows another ``CPUFreq`` scaling driver to be loaded and +registered (see `below `_). + + +Operation Modes +=============== + +``intel_pstate`` can operate in three different modes: in the active mode with +or without hardware-managed P-states support and in the passive mode. Which of +them will be in effect depends on what kernel command line options are used and +on the capabilities of the processor. + +Active Mode +----------- + +This is the default operation mode of ``intel_pstate``. If it works in this +mode, the ``scaling_driver`` policy attribute in ``sysfs`` for all ``CPUFreq`` +policies contains the string "intel_pstate". + +In this mode the driver bypasses the scaling governors layer of ``CPUFreq`` and +provides its own scaling algorithms for P-state selection. Those algorithms +can be applied to ``CPUFreq`` policies in the same way as generic scaling +governors (that is, through the ``scaling_governor`` policy attribute in +``sysfs``). [Note that different P-state selection algorithms may be chosen for +different policies, but that is not recommended.] + +They are not generic scaling governors, but their names are the same as the +names of some of those governors. Moreover, confusingly enough, they generally +do not work in the same way as the generic governors they share the names with. +For example, the ``powersave`` P-state selection algorithm provided by +``intel_pstate`` is not a counterpart of the generic ``powersave`` governor +(roughly, it corresponds to the ``schedutil`` and ``ondemand`` governors). + +There are two P-state selection algorithms provided by ``intel_pstate`` in the +active mode: ``powersave`` and ``performance``. The way they both operate +depends on whether or not the hardware-managed P-states (HWP) feature has been +enabled in the processor and possibly on the processor model. + +Which of the P-state selection algorithms is used by default depends on the +:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option. +Namely, if that option is set, the ``performance`` algorithm will be used by +default, and the other one will be used by default if it is not set. + +Active Mode With HWP +~~~~~~~~~~~~~~~~~~~~ + +If the processor supports the HWP feature, it will be enabled during the +processor initialization and cannot be disabled after that. It is possible +to avoid enabling it by passing the ``intel_pstate=no_hwp`` argument to the +kernel in the command line. + +If the HWP feature has been enabled, ``intel_pstate`` relies on the processor to +select P-states by itself, but still it can give hints to the processor's +internal P-state selection logic. What those hints are depends on which P-state +selection algorithm has been applied to the given policy (or to the CPU it +corresponds to). + +Even though the P-state selection is carried out by the processor automatically, +``intel_pstate`` registers utilization update callbacks with the CPU scheduler +in this mode. However, they are not used for running a P-state selection +algorithm, but for periodic updates of the current CPU frequency information to +be made available from the ``scaling_cur_freq`` policy attribute in ``sysfs``. + +HWP + ``performance`` +..................... + +In this configuration ``intel_pstate`` will write 0 to the processor's +Energy-Performance Preference (EPP) knob (if supported) or its +Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's +internal P-state selection logic is expected to focus entirely on performance. + +This will override the EPP/EPB setting coming from the ``sysfs`` interface +(see `Energy vs Performance Hints`_ below). + +Also, in this configuration the range of P-states available to the processor's +internal P-state selection logic is always restricted to the upper boundary +(that is, the maximum P-state that the driver is allowed to use). + +HWP + ``powersave`` +................... + +In this configuration ``intel_pstate`` will set the processor's +Energy-Performance Preference (EPP) knob (if supported) or its +Energy-Performance Bias (EPB) knob (otherwise) to whatever value it was +previously set to via ``sysfs`` (or whatever default value it was +set to by the platform firmware). This usually causes the processor's +internal P-state selection logic to be less performance-focused. + +Active Mode Without HWP +~~~~~~~~~~~~~~~~~~~~~~~ + +This is the default operation mode for processors that do not support the HWP +feature. It also is used by default with the ``intel_pstate=no_hwp`` argument +in the kernel command line. However, in this mode ``intel_pstate`` may refuse +to work with the given processor if it does not recognize it. [Note that +``intel_pstate`` will never refuse to work with any processor with the HWP +feature enabled.] + +In this mode ``intel_pstate`` registers utilization update callbacks with the +CPU scheduler in order to run a P-state selection algorithm, either +``powersave`` or ``performance``, depending on the ``scaling_cur_freq`` policy +setting in ``sysfs``. The current CPU frequency information to be made +available from the ``scaling_cur_freq`` policy attribute in ``sysfs`` is +periodically updated by those utilization update callbacks too. + +``performance`` +............... + +Without HWP, this P-state selection algorithm is always the same regardless of +the processor model and platform configuration. + +It selects the maximum P-state it is allowed to use, subject to limits set via +``sysfs``, every time the P-state selection computations are carried out by the +driver's utilization update callback for the given CPU (that does not happen +more often than every 10 ms), but the hardware configuration will not be changed +if the new P-state is the same as the current one. + +This is the default P-state selection algorithm if the +:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option +is set. + +``powersave`` +............. + +Without HWP, this P-state selection algorithm generally depends on the +processor model and/or the system profile setting in the ACPI tables and there +are two variants of it. + +One of them is used with processors from the Atom line and (regardless of the +processor model) on platforms with the system profile in the ACPI tables set to +"mobile" (laptops mostly), "tablet", "appliance PC", "desktop", or +"workstation". It is also used with processors supporting the HWP feature if +that feature has not been enabled (that is, with the ``intel_pstate=no_hwp`` +argument in the kernel command line). It is similar to the algorithm +implemented by the generic ``schedutil`` scaling governor except that the +utilization metric used by it is based on numbers coming from feedback +registers of the CPU. It generally selects P-states proportional to the +current CPU utilization, so it is referred to as the "proportional" algorithm. + +The second variant of the ``powersave`` P-state selection algorithm, used in all +of the other cases (generally, on processors from the Core line, so it is +referred to as the "Core" algorithm), is based on the values read from the APERF +and MPERF feedback registers and the previously requested target P-state. +It does not really take CPU utilization into account explicitly, but as a rule +it causes the CPU P-state to ramp up very quickly in response to increased +utilization which is generally desirable in server environments. + +Regardless of the variant, this algorithm is run by the driver's utilization +update callback for the given CPU when it is invoked by the CPU scheduler, but +not more often than every 10 ms (that can be tweaked via ``debugfs`` in `this +particular case `_). Like in the ``performance`` +case, the hardware configuration is not touched if the new P-state turns out to +be the same as the current one. + +This is the default P-state selection algorithm if the +:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option +is not set. + +Passive Mode +------------ + +This mode is used if the ``intel_pstate=passive`` argument is passed to the +kernel in the command line (it implies the ``intel_pstate=no_hwp`` setting too). +Like in the active mode without HWP support, in this mode ``intel_pstate`` may +refuse to work with the given processor if it does not recognize it. + +If the driver works in this mode, the ``scaling_driver`` policy attribute in +``sysfs`` for all ``CPUFreq`` policies contains the string "intel_cpufreq". +Then, the driver behaves like a regular ``CPUFreq`` scaling driver. That is, +it is invoked by generic scaling governors when necessary to talk to the +hardware in order to change the P-state of a CPU (in particular, the +``schedutil`` governor can invoke it directly from scheduler context). + +While in this mode, ``intel_pstate`` can be used with all of the (generic) +scaling governors listed by the ``scaling_available_governors`` policy attribute +in ``sysfs`` (and the P-state selection algorithms described above are not +used). Then, it is responsible for the configuration of policy objects +corresponding to CPUs and provides the ``CPUFreq`` core (and the scaling +governors attached to the policy objects) with accurate information on the +maximum and minimum operating frequencies supported by the hardware (including +the so-called "turbo" frequency ranges). In other words, in the passive mode +the entire range of available P-states is exposed by ``intel_pstate`` to the +``CPUFreq`` core. However, in this mode the driver does not register +utilization update callbacks with the CPU scheduler and the ``scaling_cur_freq`` +information comes from the ``CPUFreq`` core (and is the last frequency selected +by the current scaling governor for the given policy). + + +.. _turbo: + +Turbo P-states Support +====================== + +In the majority of cases, the entire range of P-states available to +``intel_pstate`` can be divided into two sub-ranges that correspond to +different types of processor behavior, above and below a boundary that +will be referred to as the "turbo threshold" in what follows. + +The P-states above the turbo threshold are referred to as "turbo P-states" and +the whole sub-range of P-states they belong to is referred to as the "turbo +range". These names are related to the Turbo Boost technology allowing a +multicore processor to opportunistically increase the P-state of one or more +cores if there is enough power to do that and if that is not going to cause the +thermal envelope of the processor package to be exceeded. + +Specifically, if software sets the P-state of a CPU core within the turbo range +(that is, above the turbo threshold), the processor is permitted to take over +performance scaling control for that core and put it into turbo P-states of its +choice going forward. However, that permission is interpreted differently by +different processor generations. Namely, the Sandy Bridge generation of +processors will never use any P-states above the last one set by software for +the given core, even if it is within the turbo range, whereas all of the later +processor generations will take it as a license to use any P-states from the +turbo range, even above the one set by software. In other words, on those +processors setting any P-state from the turbo range will enable the processor +to put the given core into all turbo P-states up to and including the maximum +supported one as it sees fit. + +One important property of turbo P-states is that they are not sustainable. More +precisely, there is no guarantee that any CPUs will be able to stay in any of +those states indefinitely, because the power distribution within the processor +package may change over time or the thermal envelope it was designed for might +be exceeded if a turbo P-state was used for too long. + +In turn, the P-states below the turbo threshold generally are sustainable. In +fact, if one of them is set by software, the processor is not expected to change +it to a lower one unless in a thermal stress or a power limit violation +situation (a higher P-state may still be used if it is set for another CPU in +the same package at the same time, for example). + +Some processors allow multiple cores to be in turbo P-states at the same time, +but the maximum P-state that can be set for them generally depends on the number +of cores running concurrently. The maximum turbo P-state that can be set for 3 +cores at the same time usually is lower than the analogous maximum P-state for +2 cores, which in turn usually is lower than the maximum turbo P-state that can +be set for 1 core. The one-core maximum turbo P-state is thus the maximum +supported one overall. + +The maximum supported turbo P-state, the turbo threshold (the maximum supported +non-turbo P-state) and the minimum supported P-state are specific to the +processor model and can be determined by reading the processor's model-specific +registers (MSRs). Moreover, some processors support the Configurable TDP +(Thermal Design Power) feature and, when that feature is enabled, the turbo +threshold effectively becomes a configurable value that can be set by the +platform firmware. + +Unlike ``_PSS`` objects in the ACPI tables, ``intel_pstate`` always exposes +the entire range of available P-states, including the whole turbo range, to the +``CPUFreq`` core and (in the passive mode) to generic scaling governors. This +generally causes turbo P-states to be set more often when ``intel_pstate`` is +used relative to ACPI-based CPU performance scaling (see `below `_ +for more information). + +Moreover, since ``intel_pstate`` always knows what the real turbo threshold is +(even if the Configurable TDP feature is enabled in the processor), its +``no_turbo`` attribute in ``sysfs`` (described `below `_) should +work as expected in all cases (that is, if set to disable turbo P-states, it +always should prevent ``intel_pstate`` from using them). + + +Processor Support +================= + +To handle a given processor ``intel_pstate`` requires a number of different +pieces of information on it to be known, including: + + * The minimum supported P-state. + + * The maximum supported `non-turbo P-state `_. + + * Whether or not turbo P-states are supported at all. + + * The maximum supported `one-core turbo P-state `_ (if turbo P-states + are supported). + + * The scaling formula to translate the driver's internal representation + of P-states into frequencies and the other way around. + +Generally, ways to obtain that information are specific to the processor model +or family. Although it often is possible to obtain all of it from the processor +itself (using model-specific registers), there are cases in which hardware +manuals need to be consulted to get to it too. + +For this reason, there is a list of supported processors in ``intel_pstate`` and +the driver initialization will fail if the detected processor is not in that +list, unless it supports the `HWP feature `_. [The interface to +obtain all of the information listed above is the same for all of the processors +supporting the HWP feature, which is why they all are supported by +``intel_pstate``.] + + +User Space Interface in ``sysfs`` +================================= + +Global Attributes +----------------- + +``intel_pstate`` exposes several global attributes (files) in ``sysfs`` to +control its functionality at the system level. They are located in the +``/sys/devices/system/cpu/cpufreq/intel_pstate/`` directory and affect all +CPUs. + +Some of them are not present if the ``intel_pstate=per_cpu_perf_limits`` +argument is passed to the kernel in the command line. + +``max_perf_pct`` + Maximum P-state the driver is allowed to set in percent of the + maximum supported performance level (the highest supported `turbo + P-state `_). + + This attribute will not be exposed if the + ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel + command line. + +``min_perf_pct`` + Minimum P-state the driver is allowed to set in percent of the + maximum supported performance level (the highest supported `turbo + P-state `_). + + This attribute will not be exposed if the + ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel + command line. + +``num_pstates`` + Number of P-states supported by the processor (between 0 and 255 + inclusive) including both turbo and non-turbo P-states (see + `Turbo P-states Support`_). + + The value of this attribute is not affected by the ``no_turbo`` + setting described `below `_. + + This attribute is read-only. + +``turbo_pct`` + Ratio of the `turbo range `_ size to the size of the entire + range of supported P-states, in percent. + + This attribute is read-only. + +.. _no_turbo_attr: + +``no_turbo`` + If set (equal to 1), the driver is not allowed to set any turbo P-states + (see `Turbo P-states Support`_). If unset (equalt to 0, which is the + default), turbo P-states can be set by the driver. + [Note that ``intel_pstate`` does not support the general ``boost`` + attribute (supported by some other scaling drivers) which is replaced + by this one.] + + This attrubute does not affect the maximum supported frequency value + supplied to the ``CPUFreq`` core and exposed via the policy interface, + but it affects the maximum possible value of per-policy P-state limits + (see `Interpretation of Policy Attributes`_ below for details). + +.. _status_attr: + +``status`` + Operation mode of the driver: "active", "passive" or "off". + + "active" + The driver is functional and in the `active mode + `_. + + "passive" + The driver is functional and in the `passive mode + `_. + + "off" + The driver is not functional (it is not registered as a scaling + driver with the ``CPUFreq`` core). + + This attribute can be written to in order to change the driver's + operation mode or to unregister it. The string written to it must be + one of the possible values of it and, if successful, the write will + cause the driver to switch over to the operation mode represented by + that string - or to be unregistered in the "off" case. [Actually, + switching over from the active mode to the passive mode or the other + way around causes the driver to be unregistered and registered again + with a different set of callbacks, so all of its settings (the global + as well as the per-policy ones) are then reset to their default + values, possibly depending on the target operation mode.] + + That only is supported in some configurations, though (for example, if + the `HWP feature is enabled in the processor `_, + the operation mode of the driver cannot be changed), and if it is not + supported in the current configuration, writes to this attribute with + fail with an appropriate error. + +Interpretation of Policy Attributes +----------------------------------- + +The interpretation of some ``CPUFreq`` policy attributes described in +:doc:`cpufreq` is special with ``intel_pstate`` as the current scaling driver +and it generally depends on the driver's `operation mode `_. + +First of all, the values of the ``cpuinfo_max_freq``, ``cpuinfo_min_freq`` and +``scaling_cur_freq`` attributes are produced by applying a processor-specific +multiplier to the internal P-state representation used by ``intel_pstate``. +Also, the values of the ``scaling_max_freq`` and ``scaling_min_freq`` +attributes are capped by the frequency corresponding to the maximum P-state that +the driver is allowed to set. + +If the ``no_turbo`` `global attribute `_ is set, the driver is +not allowed to use turbo P-states, so the maximum value of ``scaling_max_freq`` +and ``scaling_min_freq`` is limited to the maximum non-turbo P-state frequency. +Accordingly, setting ``no_turbo`` causes ``scaling_max_freq`` and +``scaling_min_freq`` to go down to that value if they were above it before. +However, the old values of ``scaling_max_freq`` and ``scaling_min_freq`` will be +restored after unsetting ``no_turbo``, unless these attributes have been written +to after ``no_turbo`` was set. + +If ``no_turbo`` is not set, the maximum possible value of ``scaling_max_freq`` +and ``scaling_min_freq`` corresponds to the maximum supported turbo P-state, +which also is the value of ``cpuinfo_max_freq`` in either case. + +Next, the following policy attributes have special meaning if +``intel_pstate`` works in the `active mode `_: + +``scaling_available_governors`` + List of P-state selection algorithms provided by ``intel_pstate``. + +``scaling_governor`` + P-state selection algorithm provided by ``intel_pstate`` currently in + use with the given policy. + +``scaling_cur_freq`` + Frequency of the average P-state of the CPU represented by the given + policy for the time interval between the last two invocations of the + driver's utilization update callback by the CPU scheduler for that CPU. + +The meaning of these attributes in the `passive mode `_ is the +same as for other scaling drivers. + +Additionally, the value of the ``scaling_driver`` attribute for ``intel_pstate`` +depends on the operation mode of the driver. Namely, it is either +"intel_pstate" (in the `active mode `_) or "intel_cpufreq" (in the +`passive mode `_). + +Coordination of P-State Limits +------------------------------ + +``intel_pstate`` allows P-state limits to be set in two ways: with the help of +the ``max_perf_pct`` and ``min_perf_pct`` `global attributes +`_ or via the ``scaling_max_freq`` and ``scaling_min_freq`` +``CPUFreq`` policy attributes. The coordination between those limits is based +on the following rules, regardless of the current operation mode of the driver: + + 1. All CPUs are affected by the global limits (that is, none of them can be + requested to run faster than the global maximum and none of them can be + requested to run slower than the global minimum). + + 2. Each individual CPU is affected by its own per-policy limits (that is, it + cannot be requested to run faster than its own per-policy maximum and it + cannot be requested to run slower than its own per-policy minimum). + + 3. The global and per-policy limits can be set independently. + +If the `HWP feature is enabled in the processor `_, the +resulting effective values are written into its registers whenever the limits +change in order to request its internal P-state selection logic to always set +P-states within these limits. Otherwise, the limits are taken into account by +scaling governors (in the `passive mode `_) and by the driver +every time before setting a new P-state for a CPU. + +Additionally, if the ``intel_pstate=per_cpu_perf_limits`` command line argument +is passed to the kernel, ``max_perf_pct`` and ``min_perf_pct`` are not exposed +at all and the only way to set the limits is by using the policy attributes. + + +Energy vs Performance Hints +--------------------------- + +If ``intel_pstate`` works in the `active mode with the HWP feature enabled +`_ in the processor, additional attributes are present +in every ``CPUFreq`` policy directory in ``sysfs``. They are intended to allow +user space to help ``intel_pstate`` to adjust the processor's internal P-state +selection logic by focusing it on performance or on energy-efficiency, or +somewhere between the two extremes: + +``energy_performance_preference`` + Current value of the energy vs performance hint for the given policy + (or the CPU represented by it). + + The hint can be changed by writing to this attribute. + +``energy_performance_available_preferences`` + List of strings that can be written to the + ``energy_performance_preference`` attribute. + + They represent different energy vs performance hints and should be + self-explanatory, except that ``default`` represents whatever hint + value was set by the platform firmware. + +Strings written to the ``energy_performance_preference`` attribute are +internally translated to integer values written to the processor's +Energy-Performance Preference (EPP) knob (if supported) or its +Energy-Performance Bias (EPB) knob. + +[Note that tasks may by migrated from one CPU to another by the scheduler's +load-balancing algorithm and if different energy vs performance hints are +set for those CPUs, that may lead to undesirable outcomes. To avoid such +issues it is better to set the same energy vs performance hint for all CPUs +or to pin every task potentially sensitive to them to a specific CPU.] + +.. _acpi-cpufreq: + +``intel_pstate`` vs ``acpi-cpufreq`` +==================================== + +On the majority of systems supported by ``intel_pstate``, the ACPI tables +provided by the platform firmware contain ``_PSS`` objects returning information +that can be used for CPU performance scaling (refer to the `ACPI specification`_ +for details on the ``_PSS`` objects and the format of the information returned +by them). + +The information returned by the ACPI ``_PSS`` objects is used by the +``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate`` +the ``acpi-cpufreq`` driver uses the same hardware CPU performance scaling +interface, but the set of P-states it can use is limited by the ``_PSS`` +output. + +On those systems each ``_PSS`` object returns a list of P-states supported by +the corresponding CPU which basically is a subset of the P-states range that can +be used by ``intel_pstate`` on the same system, with one exception: the whole +`turbo range `_ is represented by one item in it (the topmost one). By +convention, the frequency returned by ``_PSS`` for that item is greater by 1 MHz +than the frequency of the highest non-turbo P-state listed by it, but the +corresponding P-state representation (following the hardware specification) +returned for it matches the maximum supported turbo P-state (or is the +special value 255 meaning essentially "go as high as you can get"). + +The list of P-states returned by ``_PSS`` is reflected by the table of +available frequencies supplied by ``acpi-cpufreq`` to the ``CPUFreq`` core and +scaling governors and the minimum and maximum supported frequencies reported by +it come from that list as well. In particular, given the special representation +of the turbo range described above, this means that the maximum supported +frequency reported by ``acpi-cpufreq`` is higher by 1 MHz than the frequency +of the highest supported non-turbo P-state listed by ``_PSS`` which, of course, +affects decisions made by the scaling governors, except for ``powersave`` and +``performance``. + +For example, if a given governor attempts to select a frequency proportional to +estimated CPU load and maps the load of 100% to the maximum supported frequency +(possibly multiplied by a constant), then it will tend to choose P-states below +the turbo threshold if ``acpi-cpufreq`` is used as the scaling driver, because +in that case the turbo range corresponds to a small fraction of the frequency +band it can use (1 MHz vs 1 GHz or more). In consequence, it will only go to +the turbo range for the highest loads and the other loads above 50% that might +benefit from running at turbo frequencies will be given non-turbo P-states +instead. + +One more issue related to that may appear on systems supporting the +`Configurable TDP feature `_ allowing the platform firmware to set the +turbo threshold. Namely, if that is not coordinated with the lists of P-states +returned by ``_PSS`` properly, there may be more than one item corresponding to +a turbo P-state in those lists and there may be a problem with avoiding the +turbo range (if desirable or necessary). Usually, to avoid using turbo +P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state listed +by ``_PSS``, but that is not sufficient when there are other turbo P-states in +the list returned by it. + +Apart from the above, ``acpi-cpufreq`` works like ``intel_pstate`` in the +`passive mode `_, except that the number of P-states it can set +is limited to the ones listed by the ACPI ``_PSS`` objects. + + +Kernel Command Line Options for ``intel_pstate`` +================================================ + +Several kernel command line options can be used to pass early-configuration-time +parameters to ``intel_pstate`` in order to enforce specific behavior of it. All +of them have to be prepended with the ``intel_pstate=`` prefix. + +``disable`` + Do not register ``intel_pstate`` as the scaling driver even if the + processor is supported by it. + +``passive`` + Register ``intel_pstate`` in the `passive mode `_ to + start with. + + This option implies the ``no_hwp`` one described below. + +``force`` + Register ``intel_pstate`` as the scaling driver instead of + ``acpi-cpufreq`` even if the latter is preferred on the given system. + + This may prevent some platform features (such as thermal controls and + power capping) that rely on the availability of ACPI P-states + information from functioning as expected, so it should be used with + caution. + + This option does not work with processors that are not supported by + ``intel_pstate`` and on platforms where the ``pcc-cpufreq`` scaling + driver is used instead of ``acpi-cpufreq``. + +``no_hwp`` + Do not enable the `hardware-managed P-states (HWP) feature + `_ even if it is supported by the processor. + +``hwp_only`` + Register ``intel_pstate`` as the scaling driver only if the + `hardware-managed P-states (HWP) feature `_ is + supported by the processor. + +``support_acpi_ppc`` + Take ACPI ``_PPC`` performance limits into account. + + If the preferred power management profile in the FADT (Fixed ACPI + Description Table) is set to "Enterprise Server" or "Performance + Server", the ACPI ``_PPC`` limits are taken into account by default + and this option has no effect. + +``per_cpu_perf_limits`` + Use per-logical-CPU P-State limits (see `Coordination of P-state + Limits`_ for details). + + +Diagnostics and Tuning +====================== + +Trace Events +------------ + +There are two static trace events that can be used for ``intel_pstate`` +diagnostics. One of them is the ``cpu_frequency`` trace event generally used +by ``CPUFreq``, and the other one is the ``pstate_sample`` trace event specific +to ``intel_pstate``. Both of them are triggered by ``intel_pstate`` only if +it works in the `active mode `_. + +The following sequence of shell commands can be used to enable them and see +their output (if the kernel is generally configured to support event tracing):: + + # cd /sys/kernel/debug/tracing/ + # echo 1 > events/power/pstate_sample/enable + # echo 1 > events/power/cpu_frequency/enable + # cat trace + gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107 scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 freq=2474476 + cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2 + +If ``intel_pstate`` works in the `passive mode `_, the +``cpu_frequency`` trace event will be triggered either by the ``schedutil`` +scaling governor (for the policies it is attached to), or by the ``CPUFreq`` +core (for the policies with other scaling governors). + +``ftrace`` +---------- + +The ``ftrace`` interface can be used for low-level diagnostics of +``intel_pstate``. For example, to check how often the function to set a +P-state is called, the ``ftrace`` filter can be set to to +:c:func:`intel_pstate_set_pstate`:: + + # cd /sys/kernel/debug/tracing/ + # cat available_filter_functions | grep -i pstate + intel_pstate_set_pstate + intel_pstate_cpu_init + ... + # echo intel_pstate_set_pstate > set_ftrace_filter + # echo function > current_tracer + # cat trace | head -15 + # tracer: function + # + # entries-in-buffer/entries-written: 80/80 #P:4 + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # ||| / delay + # TASK-PID CPU# |||| TIMESTAMP FUNCTION + # | | | |||| | | + Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func + gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func + gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func + -0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func + +Tuning Interface in ``debugfs`` +------------------------------- + +The ``powersave`` algorithm provided by ``intel_pstate`` for `the Core line of +processors in the active mode `_ is based on a `PID controller`_ +whose parameters were chosen to address a number of different use cases at the +same time. However, it still is possible to fine-tune it to a specific workload +and the ``debugfs`` interface under ``/sys/kernel/debug/pstate_snb/`` is +provided for this purpose. [Note that the ``pstate_snb`` directory will be +present only if the specific P-state selection algorithm matching the interface +in it actually is in use.] + +The following files present in that directory can be used to modify the PID +controller parameters at run time: + +| ``deadband`` +| ``d_gain_pct`` +| ``i_gain_pct`` +| ``p_gain_pct`` +| ``sample_rate_ms`` +| ``setpoint`` + +Note, however, that achieving desirable results this way generally requires +expert-level understanding of the power vs performance tradeoff, so extra care +is recommended when attempting to do that. + + +.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf +.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html +.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf +.. _PID controller: https://en.wikipedia.org/wiki/PID_controller diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt deleted file mode 100644 index 3fdcdfd968ba..000000000000 --- a/Documentation/cpu-freq/intel-pstate.txt +++ /dev/null @@ -1,281 +0,0 @@ -Intel P-State driver --------------------- - -This driver provides an interface to control the P-State selection for the -SandyBridge+ Intel processors. - -The following document explains P-States: -http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf -As stated in the document, P-State doesn’t exactly mean a frequency. However, for -the sake of the relationship with cpufreq, P-State and frequency are used -interchangeably. - -Understanding the cpufreq core governors and policies are important before -discussing more details about the Intel P-State driver. Based on what callbacks -a cpufreq driver provides to the cpufreq core, it can support two types of -drivers: -- with target_index() callback: In this mode, the drivers using cpufreq core -simply provide the minimum and maximum frequency limits and an additional -interface target_index() to set the current frequency. The cpufreq subsystem -has a number of scaling governors ("performance", "powersave", "ondemand", -etc.). Depending on which governor is in use, cpufreq core will call for -transitions to a specific frequency using target_index() callback. -- setpolicy() callback: In this mode, drivers do not provide target_index() -callback, so cpufreq core can't request a transition to a specific frequency. -The driver provides minimum and maximum frequency limits and callbacks to set a -policy. The policy in cpufreq sysfs is referred to as the "scaling governor". -The cpufreq core can request the driver to operate in any of the two policies: -"performance" and "powersave". The driver decides which frequency to use based -on the above policy selection considering minimum and maximum frequency limits. - -The Intel P-State driver falls under the latter category, which implements the -setpolicy() callback. This driver decides what P-State to use based on the -requested policy from the cpufreq core. If the processor is capable of -selecting its next P-State internally, then the driver will offload this -responsibility to the processor (aka HWP: Hardware P-States). If not, the -driver implements algorithms to select the next P-State. - -Since these policies are implemented in the driver, they are not same as the -cpufreq scaling governors implementation, even if they have the same name in -the cpufreq sysfs (scaling_governors). For example the "performance" policy is -similar to cpufreq’s "performance" governor, but "powersave" is completely -different than the cpufreq "powersave" governor. The strategy here is similar -to cpufreq "ondemand", where the requested P-State is related to the system load. - -Sysfs Interface - -In addition to the frequency-controlling interfaces provided by the cpufreq -core, the driver provides its own sysfs files to control the P-State selection. -These files have been added to /sys/devices/system/cpu/intel_pstate/. -Any changes made to these files are applicable to all CPUs (even in a -multi-package system, Refer to later section on placing "Per-CPU limits"). - - max_perf_pct: Limits the maximum P-State that will be requested by - the driver. It states it as a percentage of the available performance. The - available (P-State) performance may be reduced by the no_turbo - setting described below. - - min_perf_pct: Limits the minimum P-State that will be requested by - the driver. It states it as a percentage of the max (non-turbo) - performance level. - - no_turbo: Limits the driver to selecting P-State below the turbo - frequency range. - - turbo_pct: Displays the percentage of the total performance that - is supported by hardware that is in the turbo range. This number - is independent of whether turbo has been disabled or not. - - num_pstates: Displays the number of P-States that are supported - by hardware. This number is independent of whether turbo has - been disabled or not. - -For example, if a system has these parameters: - Max 1 core turbo ratio: 0x21 (Max 1 core ratio is the maximum P-State) - Max non turbo ratio: 0x17 - Minimum ratio : 0x08 (Here the ratio is called max efficiency ratio) - -Sysfs will show : - max_perf_pct:100, which corresponds to 1 core ratio - min_perf_pct:24, max_efficiency_ratio / max 1 Core ratio - no_turbo:0, turbo is not disabled - num_pstates:26 = (max 1 Core ratio - Max Efficiency Ratio + 1) - turbo_pct:39 = (max 1 core ratio - max non turbo ratio) / num_pstates - -Refer to "Intel® 64 and IA-32 Architectures Software Developer’s Manual -Volume 3: System Programming Guide" to understand ratios. - -There is one more sysfs attribute in /sys/devices/system/cpu/intel_pstate/ -that can be used for controlling the operation mode of the driver: - - status: Three settings are possible: - "off" - The driver is not in use at this time. - "active" - The driver works as a P-state governor (default). - "passive" - The driver works as a regular cpufreq one and collaborates - with the generic cpufreq governors (it sets P-states as - requested by those governors). - The current setting is returned by reads from this attribute. Writing one - of the above strings to it changes the operation mode as indicated by that - string, if possible. If HW-managed P-states (HWP) are enabled, it is not - possible to change the driver's operation mode and attempts to write to - this attribute will fail. - -cpufreq sysfs for Intel P-State - -Since this driver registers with cpufreq, cpufreq sysfs is also presented. -There are some important differences, which need to be considered. - -scaling_cur_freq: This displays the real frequency which was used during -the last sample period instead of what is requested. Some other cpufreq driver, -like acpi-cpufreq, displays what is requested (Some changes are on the -way to fix this for acpi-cpufreq driver). The same is true for frequencies -displayed at /proc/cpuinfo. - -scaling_governor: This displays current active policy. Since each CPU has a -cpufreq sysfs, it is possible to set a scaling governor to each CPU. But this -is not possible with Intel P-States, as there is one common policy for all -CPUs. Here, the last requested policy will be applicable to all CPUs. It is -suggested that one use the cpupower utility to change policy to all CPUs at the -same time. - -scaling_setspeed: This attribute can never be used with Intel P-State. - -scaling_max_freq/scaling_min_freq: This interface can be used similarly to -the max_perf_pct/min_perf_pct of Intel P-State sysfs. However since frequencies -are converted to nearest possible P-State, this is prone to rounding errors. -This method is not preferred to limit performance. - -affected_cpus: Not used -related_cpus: Not used - -For contemporary Intel processors, the frequency is controlled by the -processor itself and the P-State exposed to software is related to -performance levels. The idea that frequency can be set to a single -frequency is fictional for Intel Core processors. Even if the scaling -driver selects a single P-State, the actual frequency the processor -will run at is selected by the processor itself. - -Per-CPU limits - -The kernel command line option "intel_pstate=per_cpu_perf_limits" forces -the intel_pstate driver to use per-CPU performance limits. When it is set, -the sysfs control interface described above is subject to limitations. -- The following controls are not available for both read and write - /sys/devices/system/cpu/intel_pstate/max_perf_pct - /sys/devices/system/cpu/intel_pstate/min_perf_pct -- The following controls can be used to set performance limits, as far as the -architecture of the processor permits: - /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq - /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq - /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor -- User can still observe turbo percent and number of P-States from - /sys/devices/system/cpu/intel_pstate/turbo_pct - /sys/devices/system/cpu/intel_pstate/num_pstates -- User can read write system wide turbo status - /sys/devices/system/cpu/no_turbo - -Support of energy performance hints -It is possible to provide hints to the HWP algorithms in the processor -to be more performance centric to more energy centric. When the driver -is using HWP, two additional cpufreq sysfs attributes are presented for -each logical CPU. -These attributes are: - - energy_performance_available_preferences - - energy_performance_preference - -To get list of supported hints: -$ cat energy_performance_available_preferences - default performance balance_performance balance_power power - -The current preference can be read or changed via cpufreq sysfs -attribute "energy_performance_preference". Reading from this attribute -will display current effective setting. User can write any of the valid -preference string to this attribute. User can always restore to power-on -default by writing "default". - -Since threads can migrate to different CPUs, this is possible that the -new CPU may have different energy performance preference than the previous -one. To avoid such issues, either threads can be pinned to specific CPUs -or set the same energy performance preference value to all CPUs. - -Tuning Intel P-State driver - -When the performance can be tuned using PID (Proportional Integral -Derivative) controller, debugfs files are provided for adjusting performance. -They are presented under: -/sys/kernel/debug/pstate_snb/ - -The PID tunable parameters are: - deadband - d_gain_pct - i_gain_pct - p_gain_pct - sample_rate_ms - setpoint - -To adjust these parameters, some understanding of driver implementation is -necessary. There are some tweeks described here, but be very careful. Adjusting -them requires expert level understanding of power and performance relationship. -These limits are only useful when the "powersave" policy is active. - --To make the system more responsive to load changes, sample_rate_ms can -be adjusted (current default is 10ms). --To make the system use higher performance, even if the load is lower, setpoint -can be adjusted to a lower number. This will also lead to faster ramp up time -to reach the maximum P-State. -If there are no derivative and integral coefficients, The next P-State will be -equal to: - current P-State - ((setpoint - current cpu load) * p_gain_pct) - -For example, if the current PID parameters are (Which are defaults for the core -processors like SandyBridge): - deadband = 0 - d_gain_pct = 0 - i_gain_pct = 0 - p_gain_pct = 20 - sample_rate_ms = 10 - setpoint = 97 - -If the current P-State = 0x08 and current load = 100, this will result in the -next P-State = 0x08 - ((97 - 100) * 0.2) = 8.6 (rounded to 9). Here the P-State -goes up by only 1. If during next sample interval the current load doesn't -change and still 100, then P-State goes up by one again. This process will -continue as long as the load is more than the setpoint until the maximum P-State -is reached. - -For the same load at setpoint = 60, this will result in the next P-State -= 0x08 - ((60 - 100) * 0.2) = 16 -So by changing the setpoint from 97 to 60, there is an increase of the -next P-State from 9 to 16. So this will make processor execute at higher -P-State for the same CPU load. If the load continues to be more than the -setpoint during next sample intervals, then P-State will go up again till the -maximum P-State is reached. But the ramp up time to reach the maximum P-State -will be much faster when the setpoint is 60 compared to 97. - -Debugging Intel P-State driver - -Event tracing -To debug P-State transition, the Linux event tracing interface can be used. -There are two specific events, which can be enabled (Provided the kernel -configs related to event tracing are enabled). - -# cd /sys/kernel/debug/tracing/ -# echo 1 > events/power/pstate_sample/enable -# echo 1 > events/power/cpu_frequency/enable -# cat trace -gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107 - scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 - freq=2474476 -cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2 - - -Using ftrace - -If function level tracing is required, the Linux ftrace interface can be used. -For example if we want to check how often a function to set a P-State is -called, we can set ftrace filter to intel_pstate_set_pstate. - -# cd /sys/kernel/debug/tracing/ -# cat available_filter_functions | grep -i pstate -intel_pstate_set_pstate -intel_pstate_cpu_init -... - -# echo intel_pstate_set_pstate > set_ftrace_filter -# echo function > current_tracer -# cat trace | head -15 -# tracer: function -# -# entries-in-buffer/entries-written: 80/80 #P:4 -# -# _-----=> irqs-off -# / _----=> need-resched -# | / _---=> hardirq/softirq -# || / _--=> preempt-depth -# ||| / delay -# TASK-PID CPU# |||| TIMESTAMP FUNCTION -# | | | |||| | | - Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func - gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func - gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func - -0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func From 60d4553bdc1a0099adb544e12cafd7bbc7f1d484 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 14 May 2017 02:23:04 +0200 Subject: [PATCH 013/254] PM / wakeup: Fix up wakeup_source_report_event() Commit 8a537ece3d94 (PM / wakeup: Integrate mechanism to abort transitions in progress) modified wakeup_source_report_event() and wakeup_source_activate() to make it possible to call pm_system_wakeup() from the latter if so indicated by the caller of the former (via a new function argument added by that commit), but it overlooked the fact that in some situations wakeup_source_report_event() is called to signal a "hard" event (ie. such that should abort a system suspend in progress) after pm_stay_awake() has been called for the same wakeup source object, in which case the pm_system_wakeup() will not trigger. To work around this issue, modify wakeup_source_activate() and wakeup_source_report_event() again so that pm_system_wakeup() is called by the latter directly (if its last argument is true), in which case the additional argument does not need to be passed to wakeup_source_activate() any more, so drop it from there. Fixes: 8a537ece3d94 (PM / wakeup: Integrate mechanism to abort transitions in progress) Reported-by: David E. Box Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index f62082fdd670..9c36b27996fc 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -512,13 +512,12 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws) /** * wakup_source_activate - Mark given wakeup source as active. * @ws: Wakeup source to handle. - * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM * core of the event by incrementing the counter of of wakeup events being * processed. */ -static void wakeup_source_activate(struct wakeup_source *ws, bool hard) +static void wakeup_source_activate(struct wakeup_source *ws) { unsigned int cec; @@ -526,9 +525,6 @@ static void wakeup_source_activate(struct wakeup_source *ws, bool hard) "unregistered wakeup source\n")) return; - if (hard) - pm_system_wakeup(); - ws->active = true; ws->active_count++; ws->last_time = ktime_get(); @@ -554,7 +550,10 @@ static void wakeup_source_report_event(struct wakeup_source *ws, bool hard) ws->wakeup_count++; if (!ws->active) - wakeup_source_activate(ws, hard); + wakeup_source_activate(ws); + + if (hard) + pm_system_wakeup(); } /** From 967b08c25a091867b04261fa34addedc950256f1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 14 May 2017 02:23:12 +0200 Subject: [PATCH 014/254] RTC: rtc-cmos: Fix wakeup from suspend-to-idle Commit eed4d47efe95 (ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle) modified the core suspend-to-idle code to filter out spurious SCI interrupts received while suspended, which requires ACPI event source handlers to report wakeup events in a way that will trigger a wakeup from suspend to idle (or abort system suspends in progress, which is equivalent). That needs to be done in the rtc-cmos driver too, which was overlooked by the above commit, so do that now. Fixes: eed4d47efe95 (ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle) Reported-by: David E. Box Signed-off-by: Rafael J. Wysocki --- drivers/rtc/rtc-cmos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index f4a96dbdabf2..dabe47b9be72 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -1085,7 +1085,7 @@ static u32 rtc_handler(void *context) } spin_unlock_irqrestore(&rtc_lock, flags); - pm_wakeup_event(dev, 0); + pm_wakeup_hard_event(dev); acpi_clear_event(ACPI_EVENT_RTC); acpi_disable_event(ACPI_EVENT_RTC, 0); return ACPI_INTERRUPT_HANDLED; From 216c4e9db4c9d1d2a382b42880442dc632cd47d9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 May 2017 22:40:06 +0300 Subject: [PATCH 015/254] PowerCap: Fix an error code in powercap_register_zone() In the current code we accidentally return the successful result from idr_alloc() instead of a negative error pointer. The caller is looking for an error pointer and so it treats the returned value as a valid pointer. This one might be a bit serious because if it lets people get around the kernel's protection for remapping NULL. I'm not sure. Fixes: 75d2364ea0ca (PowerCap: Add class driver) Signed-off-by: Dan Carpenter Reviewed-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/powercap/powercap_sys.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c index 14bde0db8c24..5b10b50f8686 100644 --- a/drivers/powercap/powercap_sys.c +++ b/drivers/powercap/powercap_sys.c @@ -538,6 +538,7 @@ struct powercap_zone *powercap_register_zone( power_zone->id = result; idr_init(&power_zone->idr); + result = -ENOMEM; power_zone->name = kstrdup(name, GFP_KERNEL); if (!power_zone->name) goto err_name_alloc; From 0bae5fd3330be0517fba697e6b228601d421fade Mon Sep 17 00:00:00 2001 From: Pushkar Jambhlekar Date: Thu, 11 May 2017 10:31:24 +0530 Subject: [PATCH 016/254] PM / hibernate: Declare variables as static Fixing sparse warnings: 'symbol not declared. Should it be static?' Signed-off-by: Pushkar Jambhlekar Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d79a38de425a..a628cccafa4a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1422,7 +1422,7 @@ static unsigned int nr_meta_pages; * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. */ -unsigned int alloc_normal, alloc_highmem; +static unsigned int alloc_normal, alloc_highmem; /* * Memory bitmap used for marking saveable pages (during hibernation) or * hibernation image pages (during restore) From be0408d74d9c95de1baf6c2563b6e6d1dfb17b88 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 May 2017 14:12:29 +0200 Subject: [PATCH 017/254] cpufreq: dbx500: add a Kconfig symbol Moving the cooling code into the cpufreq driver caused a possible build failure when the cpu_thermal helper code is a loadable module or disabled: drivers/cpufreq/dbx500-cpufreq.o: In function `dbx500_cpufreq_ready': dbx500-cpufreq.c:(.text.dbx500_cpufreq_ready+0x4): undefined reference to `cpufreq_cooling_register' This adds the same dependency that we have in other cpufreq drivers, forcing the driver to be disabled when we can't possibly link it. Fixes: 19678ffb9fd6 (cpufreq: dbx500: Manage cooling device from cpufreq driver) Signed-off-by: Arnd Bergmann Acked-by: Viresh Kumar Reviewed-by: Linus Walleij Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig.arm | 9 +++++++++ drivers/cpufreq/Makefile | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 74ed7e9a7f27..2011fec2d6ad 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -71,6 +71,15 @@ config ARM_HIGHBANK_CPUFREQ If in doubt, say N. +config ARM_DB8500_CPUFREQ + tristate "ST-Ericsson DB8500 cpufreq" if COMPILE_TEST && !ARCH_U8500 + default ARCH_U8500 + depends on HAS_IOMEM + depends on !CPU_THERMAL || THERMAL + help + This adds the CPUFreq driver for ST-Ericsson Ux500 (DB8500) SoC + series. + config ARM_IMX6Q_CPUFREQ tristate "Freescale i.MX6 cpufreq support" depends on ARCH_MXC diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index b7e78f063c4f..ab3a42cd29ef 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_ARM_DT_BL_CPUFREQ) += arm_big_little_dt.o obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ) += brcmstb-avs-cpufreq.o obj-$(CONFIG_ARCH_DAVINCI) += davinci-cpufreq.o -obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o +obj-$(CONFIG_ARM_DB8500_CPUFREQ) += dbx500-cpufreq.o obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o obj-$(CONFIG_ARM_IMX6Q_CPUFREQ) += imx6q-cpufreq.o From f83914fdfcc3ecb62a5a83eeb609ff59a9c2052d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 12 May 2017 14:34:37 +0200 Subject: [PATCH 018/254] ALSA: usb-audio: fix Amanero Combo384 quirk on big-endian hosts Add missing endianness conversion when using the USB device-descriptor bcdDevice field when applying the Amanero Combo384 (endianness!) quirk. Fixes: 3eff682d765b ("ALSA: usb-audio: Support both DSD LE/BE Amanero firmware versions") Cc: Jussi Laako Signed-off-by: Johan Hovold Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 01eff6ce6401..d7b0b0a3a2db 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1364,7 +1364,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, /* Amanero Combo384 USB interface with native DSD support */ case USB_ID(0x16d0, 0x071a): if (fp->altsetting == 2) { - switch (chip->dev->descriptor.bcdDevice) { + switch (le16_to_cpu(chip->dev->descriptor.bcdDevice)) { case 0x199: return SNDRV_PCM_FMTBIT_DSD_U32_LE; case 0x19b: From a2b7cbdd2559aff06cebc28a7150f81c307a90d3 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 19 Apr 2017 11:39:20 -0700 Subject: [PATCH 019/254] netfilter: ctnetlink: Make some parameters integer to avoid enum mismatch Not all parameters passed to ctnetlink_parse_tuple() and ctnetlink_exp_dump_tuple() match the enum type in the signatures of these functions. Since this is intended change the argument type of to be an unsigned integer value. Signed-off-by: Matthias Kaehlcke Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dcf561b5c97a..fa752626029e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1007,9 +1007,8 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { static int ctnetlink_parse_tuple(const struct nlattr * const cda[], - struct nf_conntrack_tuple *tuple, - enum ctattr_type type, u_int8_t l3num, - struct nf_conntrack_zone *zone) + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -2447,7 +2446,7 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = { static int ctnetlink_exp_dump_tuple(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, - enum ctattr_expect type) + u32 type) { struct nlattr *nest_parms; From d110a3942aca78d14929bc648aeb83ee0b245a61 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 6 May 2017 20:28:02 +0800 Subject: [PATCH 020/254] netfilter: don't setup nat info for confirmed ct We cannot setup nat info if the ct has been confirmed already, else, different cpu may race to handle the same ct. In extreme situation, we may hit the "BUG_ON(nf_nat_initialized(ct, maniptype))" in the nf_nat_setup_info. Also running the following commands will easily hit NF_CT_ASSERT in nf_conntrack_alter_reply: # nft flush ruleset # ping -c 2 -W 1 1.1.1.111 & # nft add table t # nft add chain t c {type nat hook postrouting priority 0 \;} # nft add rule t c snat to 4.5.6.7 WARNING: CPU: 1 PID: 10065 at net/netfilter/nf_conntrack_core.c:1472 nf_conntrack_alter_reply+0x9a/0x1a0 [nf_conntrack] [...] Call Trace: nf_nat_setup_info+0xad/0x840 [nf_nat] ? deactivate_slab+0x65d/0x6c0 nft_nat_eval+0xcd/0x100 [nft_nat] nft_do_chain+0xff/0x5d0 [nf_tables] ? mark_held_locks+0x6f/0xa0 ? __local_bh_enable_ip+0x70/0xa0 ? trace_hardirqs_on_caller+0x11f/0x190 ? ipt_do_table+0x310/0x610 ? trace_hardirqs_on+0xd/0x10 ? __local_bh_enable_ip+0x70/0xa0 ? ipt_do_table+0x32b/0x610 ? __lock_acquire+0x2ac/0x1580 ? ipt_do_table+0x32b/0x610 nft_nat_do_chain+0x65/0x80 [nft_chain_nat_ipv4] nf_nat_ipv4_fn+0x1ae/0x240 [nf_nat_ipv4] nf_nat_ipv4_out+0x4a/0xf0 [nf_nat_ipv4] nft_nat_ipv4_out+0x15/0x20 [nft_chain_nat_ipv4] nf_hook_slow+0x2c/0xf0 ip_output+0x154/0x270 So for the confirmed ct, just ignore it and return NF_ACCEPT. Fixes: 9a08ecfe74d7 ("netfilter: don't attach a nat extension by default") Signed-off-by: Liping Zhang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index b48d6b5aae8a..ef0be325a0c6 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -409,6 +409,10 @@ nf_nat_setup_info(struct nf_conn *ct, { struct nf_conntrack_tuple curr_tuple, new_tuple; + /* Can't setup nat info for confirmed ct. */ + if (nf_ct_is_confirmed(ct)) + return NF_ACCEPT; + NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || maniptype == NF_NAT_MANIP_DST); BUG_ON(nf_nat_initialized(ct, maniptype)); From d91fc59cd77c719f33eda65c194ad8f95a055190 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 7 May 2017 22:01:55 +0800 Subject: [PATCH 021/254] netfilter: introduce nf_conntrack_helper_put helper function And convert module_put invocation to nf_conntrack_helper_put, this is prepared for the followup patch, which will add a refcnt for cthelper, so we can reject the deleting request when cthelper is in use. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 2 ++ net/netfilter/nf_conntrack_helper.c | 6 ++++++ net/netfilter/nft_ct.c | 4 ++-- net/netfilter/xt_CT.c | 6 +++--- net/openvswitch/conntrack.c | 4 ++-- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index e04fa7691e5d..c1c12411103a 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -79,6 +79,8 @@ struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name, struct nf_conntrack_helper *nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum); +void nf_conntrack_helper_put(struct nf_conntrack_helper *helper); + void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3a60efa7799b..e17006b6e434 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -181,6 +181,12 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) } EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); +void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) +{ + module_put(helper->me); +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); + struct nf_conn_help * nf_ct_helper_ext_add(struct nf_conn *ct, struct nf_conntrack_helper *helper, gfp_t gfp) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a34ceb38fc55..1678e9e75e8e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -826,9 +826,9 @@ static void nft_ct_helper_obj_destroy(struct nft_object *obj) struct nft_ct_helper_obj *priv = nft_obj_data(obj); if (priv->helper4) - module_put(priv->helper4->me); + nf_conntrack_helper_put(priv->helper4); if (priv->helper6) - module_put(priv->helper6->me); + nf_conntrack_helper_put(priv->helper6); } static void nft_ct_helper_obj_eval(struct nft_object *obj, diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index bb7ad82dcd56..623ef37de886 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); if (help == NULL) { - module_put(helper->me); + nf_conntrack_helper_put(helper); return -ENOMEM; } @@ -263,7 +263,7 @@ out: err4: help = nfct_help(ct); if (help) - module_put(help->helper->me); + nf_conntrack_helper_put(help->helper); err3: nf_ct_tmpl_free(ct); err2: @@ -346,7 +346,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, if (ct) { help = nfct_help(ct); if (help) - module_put(help->helper->me); + nf_conntrack_helper_put(help->helper); nf_ct_netns_put(par->net, par->family); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index bf602e33c40a..08679ebb3068 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1123,7 +1123,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); if (!help) { - module_put(helper->me); + nf_conntrack_helper_put(helper); return -ENOMEM; } @@ -1584,7 +1584,7 @@ void ovs_ct_free_action(const struct nlattr *a) static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) { if (ct_info->helper) - module_put(ct_info->helper->me); + nf_conntrack_helper_put(ct_info->helper); if (ct_info->ct) nf_ct_tmpl_free(ct_info->ct); } From 9338d7b4418e9996a7642867d8f6b482a6040ed6 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 7 May 2017 22:01:56 +0800 Subject: [PATCH 022/254] netfilter: nfnl_cthelper: reject del request if helper obj is in use We can still delete the ct helper even if it is in use, this will cause a use-after-free error. In more detail, I mean: # nfct helper add ssdp inet udp # iptables -t raw -A OUTPUT -p udp -j CT --helper ssdp # nfct helper delete ssdp //--> oops, succeed! BUG: unable to handle kernel paging request at 000026ca IP: 0x26ca [...] Call Trace: ? ipv4_helper+0x62/0x80 [nf_conntrack_ipv4] nf_hook_slow+0x21/0xb0 ip_output+0xe9/0x100 ? ip_fragment.constprop.54+0xc0/0xc0 ip_local_out+0x33/0x40 ip_send_skb+0x16/0x80 udp_send_skb+0x84/0x240 udp_sendmsg+0x35d/0xa50 So add reference count to fix this issue, if ct helper is used by others, reject the delete request. Apply this patch: # nfct helper delete ssdp nfct v1.4.3: netlink error: Device or resource busy Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 2 ++ net/netfilter/nf_conntrack_helper.c | 6 ++++++ net/netfilter/nfnetlink_cthelper.c | 17 +++++++++++------ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index c1c12411103a..c519bb5b5bb8 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -9,6 +9,7 @@ #ifndef _NF_CONNTRACK_HELPER_H #define _NF_CONNTRACK_HELPER_H +#include #include #include #include @@ -26,6 +27,7 @@ struct nf_conntrack_helper { struct hlist_node hnode; /* Internal use. */ char name[NF_CT_HELPER_NAME_LEN]; /* name of the module */ + refcount_t refcnt; struct module *me; /* pointer to self */ const struct nf_conntrack_expect_policy *expect_policy; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index e17006b6e434..7f6100ca63be 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -174,6 +174,10 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) #endif if (h != NULL && !try_module_get(h->me)) h = NULL; + if (h != NULL && !refcount_inc_not_zero(&h->refcnt)) { + module_put(h->me); + h = NULL; + } rcu_read_unlock(); @@ -183,6 +187,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) { + refcount_dec(&helper->refcnt); module_put(helper->me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); @@ -423,6 +428,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) } } } + refcount_set(&me->refcnt, 1); hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); nf_ct_helper_count++; out: diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 950bf6eadc65..be678a323598 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -686,6 +686,7 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, tuple_set = true; } + ret = -ENOENT; list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; j++; @@ -699,16 +700,20 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, tuple.dst.protonum != cur->tuple.dst.protonum)) continue; - found = true; - nf_conntrack_helper_unregister(cur); - kfree(cur->expect_policy); + if (refcount_dec_if_one(&cur->refcnt)) { + found = true; + nf_conntrack_helper_unregister(cur); + kfree(cur->expect_policy); - list_del(&nlcth->list); - kfree(nlcth); + list_del(&nlcth->list); + kfree(nlcth); + } else { + ret = -EBUSY; + } } /* Make sure we return success if we flush and there is no helpers */ - return (found || j == 0) ? 0 : -ENOENT; + return (found || j == 0) ? 0 : ret; } static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { From 324318f0248c31be8a08984146e7e4dd7cdd091d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 9 May 2017 16:17:37 -0400 Subject: [PATCH 023/254] netfilter: xtables: zero padding in data_to_user When looking up an iptables rule, the iptables binary compares the aligned match and target data (XT_ALIGN). In some cases this can exceed the actual data size to include padding bytes. Before commit f77bc5b23fb1 ("iptables: use match, target and data copy_to_user helpers") the malloc()ed bytes were overwritten by the kernel with kzalloced contents, zeroing the padding and making the comparison succeed. After this patch, the kernel copies and clears only data, leaving the padding bytes undefined. Extend the clear operation from data size to aligned data size to include the padding bytes, if any. Padding bytes can be observed in both match and target, and the bug triggered, by issuing a rule with match icmp and target ACCEPT: iptables -t mangle -A INPUT -i lo -p icmp --icmp-type 1 -j ACCEPT iptables -t mangle -D INPUT -i lo -p icmp --icmp-type 1 -j ACCEPT Fixes: f77bc5b23fb1 ("iptables: use match, target and data copy_to_user helpers") Reported-by: Paul Moore Reported-by: Richard Guy Briggs Signed-off-by: Willem de Bruijn Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/bridge/netfilter/ebtables.c | 9 ++++++--- net/netfilter/x_tables.c | 9 ++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index be378cf47fcc..b3044c2c62cb 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -294,7 +294,7 @@ int xt_match_to_user(const struct xt_entry_match *m, int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u); int xt_data_to_user(void __user *dst, const void *src, - int usersize, int size); + int usersize, int size, int aligned_size); void *xt_copy_counters_from_user(const void __user *user, unsigned int len, struct xt_counters_info *info, bool compat); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9ec0c9f908fa..9c6e619f452b 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1373,7 +1373,8 @@ static inline int ebt_obj_to_user(char __user *um, const char *_name, strlcpy(name, _name, sizeof(name)); if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) || put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) || - xt_data_to_user(um + entrysize, data, usersize, datasize)) + xt_data_to_user(um + entrysize, data, usersize, datasize, + XT_ALIGN(datasize))) return -EFAULT; return 0; @@ -1658,7 +1659,8 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, if (match->compat_to_user(cm->data, m->data)) return -EFAULT; } else { - if (xt_data_to_user(cm->data, m->data, match->usersize, msize)) + if (xt_data_to_user(cm->data, m->data, match->usersize, msize, + COMPAT_XT_ALIGN(msize))) return -EFAULT; } @@ -1687,7 +1689,8 @@ static int compat_target_to_user(struct ebt_entry_target *t, if (target->compat_to_user(cm->data, t->data)) return -EFAULT; } else { - if (xt_data_to_user(cm->data, t->data, target->usersize, tsize)) + if (xt_data_to_user(cm->data, t->data, target->usersize, tsize, + COMPAT_XT_ALIGN(tsize))) return -EFAULT; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8876b7da6884..d17769599c10 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -283,12 +283,13 @@ static int xt_obj_to_user(u16 __user *psize, u16 size, &U->u.user.revision, K->u.kernel.TYPE->revision) int xt_data_to_user(void __user *dst, const void *src, - int usersize, int size) + int usersize, int size, int aligned_size) { usersize = usersize ? : size; if (copy_to_user(dst, src, usersize)) return -EFAULT; - if (usersize != size && clear_user(dst + usersize, size - usersize)) + if (usersize != aligned_size && + clear_user(dst + usersize, aligned_size - usersize)) return -EFAULT; return 0; @@ -298,7 +299,9 @@ EXPORT_SYMBOL_GPL(xt_data_to_user); #define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ - C_SIZE ? : K->u.kernel.TYPE->TYPE##size) + C_SIZE ? : K->u.kernel.TYPE->TYPE##size, \ + C_SIZE ? COMPAT_XT_ALIGN(C_SIZE) : \ + XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u) From 87e94dbc210a720a34be5c1174faee5c84be963e Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Thu, 11 May 2017 18:56:38 +0200 Subject: [PATCH 024/254] netfilter: synproxy: fix conntrackd interaction This patch fixes the creation of connection tracking entry from netlink when synproxy is used. It was missing the addition of the synproxy extension. This was causing kernel crashes when a conntrack entry created by conntrackd was used after the switch of traffic from active node to the passive node. Signed-off-by: Eric Leblond Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index fa752626029e..9799a50bc604 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -45,6 +45,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NF_NAT_NEEDED #include #include @@ -1827,6 +1829,8 @@ ctnetlink_create_conntrack(struct net *net, nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); nf_ct_labels_ext_add(ct); + nfct_seqadj_ext_add(ct); + nfct_synproxy_ext_add(ct); /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; From fa803605eef39372e53d7813002d73a3fcf10c88 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 14 May 2017 21:35:22 +0800 Subject: [PATCH 025/254] netfilter: nf_tables: can't assume lock is acquired when dumping set elems When dumping the elements related to a specified set, we may invoke the nf_tables_dump_set with the NFNL_SUBSYS_NFTABLES lock not acquired. So we should use the proper rcu operation to avoid race condition, just like other nft dump operations. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 78 +++++++++++++++++++++++++---------- net/netfilter/nft_set_hash.c | 2 +- 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 559225029740..5f4a4d48b871 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3367,35 +3367,50 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, return nf_tables_fill_setelem(args->skb, set, elem); } +struct nft_set_dump_ctx { + const struct nft_set *set; + struct nft_ctx ctx; +}; + static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { + struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); - u8 genmask = nft_genmask_cur(net); + struct nft_af_info *afi; + struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; - struct nft_ctx ctx; - struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; + bool set_found = false; struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; - int event, err; + int event; - err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, - NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy, - NULL); - if (err < 0) - return err; + rcu_read_lock(); + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (afi != dump_ctx->ctx.afi) + continue; - err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, - (void *)nla, genmask); - if (err < 0) - return err; + list_for_each_entry_rcu(table, &afi->tables, list) { + if (table != dump_ctx->ctx.table) + continue; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); - if (IS_ERR(set)) - return PTR_ERR(set); + list_for_each_entry_rcu(set, &table->sets, list) { + if (set == dump_ctx->set) { + set_found = true; + break; + } + } + break; + } + break; + } + + if (!set_found) { + rcu_read_unlock(); + return -ENOENT; + } event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM); portid = NETLINK_CB(cb->skb).portid; @@ -3407,11 +3422,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx.afi->family; + nfmsg->nfgen_family = afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff); + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) + if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) goto nla_put_failure; @@ -3422,12 +3437,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.cb = cb; args.skb = skb; - args.iter.genmask = nft_genmask_cur(ctx.net); + args.iter.genmask = nft_genmask_cur(net); args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; - set->ops->walk(&ctx, set, &args.iter); + set->ops->walk(&dump_ctx->ctx, set, &args.iter); + rcu_read_unlock(); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); @@ -3441,9 +3457,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; nla_put_failure: + rcu_read_unlock(); return -ENOSPC; } +static int nf_tables_dump_set_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -3465,7 +3488,18 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_set, + .done = nf_tables_dump_set_done, }; + struct nft_set_dump_ctx *dump_ctx; + + dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); + if (!dump_ctx) + return -ENOMEM; + + dump_ctx->set = set; + dump_ctx->ctx = ctx; + + c.data = dump_ctx; return netlink_dump_start(nlsk, skb, nlh, &c); } return -EOPNOTSUPP; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 8ec086b6b56b..3d3a6df4ce70 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -222,7 +222,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; int err; - err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); + err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC); iter->err = err; if (err) return; From 71df14b0ce094be46d105b5a3ededd83b8e779a0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 May 2017 11:17:29 +0100 Subject: [PATCH 026/254] netfilter: nf_tables: missing sanitization in data from userspace Do not assume userspace always sends us NFT_DATA_VALUE for bitwise and cmp expressions. Although NFT_DATA_VERDICT does not make any sense, it is still possible to handcraft a netlink message using this incorrect data type. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 19 ++++++++++++++----- net/netfilter/nft_cmp.c | 12 ++++++++++-- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 877d9acd91ef..96bd4f325b0f 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -83,17 +83,26 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (d1.len != priv->len) - return -EINVAL; + if (d1.len != priv->len) { + err = -EINVAL; + goto err1; + } err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, tb[NFTA_BITWISE_XOR]); if (err < 0) - return err; - if (d2.len != priv->len) - return -EINVAL; + goto err1; + if (d2.len != priv->len) { + err = -EINVAL; + goto err2; + } return 0; +err2: + nft_data_uninit(&priv->xor, d2.type); +err1: + nft_data_uninit(&priv->mask, d1.type); + return err; } static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 2b96effeadc1..8c9d0fb19118 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -201,10 +201,18 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) if (err < 0) return ERR_PTR(err); + if (desc.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err1; + } + if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ) return &nft_cmp_fast_ops; - else - return &nft_cmp_ops; + + return &nft_cmp_ops; +err1: + nft_data_uninit(&data, desc.type); + return ERR_PTR(-EINVAL); } struct nft_expr_type nft_cmp_type __read_mostly = { From 591054469b3eef34bc097c30fae8ededddf8d796 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 May 2017 11:17:34 +0100 Subject: [PATCH 027/254] netfilter: nf_tables: revisit chain/object refcounting from elements Andreas reports that the following incremental update using our commit protocol doesn't work. # nft -f incremental-update.nft delete element ip filter client_to_any { 10.180.86.22 : goto CIn_1 } delete chain ip filter CIn_1 ... Error: Could not process rule: Device or resource busy The existing code is not well-integrated into the commit phase protocol, since element deletions do not result in refcount decrement from the preparation phase. This results in bogus EBUSY errors like the one above. Two new functions come with this patch: * nft_set_elem_activate() function is used from the abort path, to restore the set element refcounting on objects that occurred from the preparation phase. * nft_set_elem_deactivate() that is called from nft_del_setelem() to decrement set element refcounting on objects from the preparation phase in the commit protocol. The nft_data_uninit() has been renamed to nft_data_release() since this function does not uninitialize any data store in the data register, instead just releases the references to objects. Moreover, a new function nft_data_hold() has been introduced to be used from nft_set_elem_activate(). Reported-by: Andreas Schultz Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 82 +++++++++++++++++++++++++++---- net/netfilter/nft_bitwise.c | 4 +- net/netfilter/nft_cmp.c | 2 +- net/netfilter/nft_immediate.c | 5 +- net/netfilter/nft_range.c | 4 +- 6 files changed, 81 insertions(+), 18 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 028faec8fc27..8a8bab8d7b15 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -176,7 +176,7 @@ struct nft_data_desc { int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, unsigned int size, struct nft_data_desc *desc, const struct nlattr *nla); -void nft_data_uninit(const struct nft_data *data, enum nft_data_types type); +void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5f4a4d48b871..da314be0c048 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3627,9 +3627,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); - nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE); + nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) - nft_data_uninit(nft_set_ext_data(ext), set->dtype); + nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) @@ -3638,6 +3638,18 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +/* Only called from commit path, nft_set_elem_deactivate() already deals with + * the refcounting from the preparation phase. + */ +static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); + kfree(elem); +} + static int nft_setelem_parse_flags(const struct nft_set *set, const struct nlattr *attr, u32 *flags) { @@ -3849,9 +3861,9 @@ err4: kfree(elem.priv); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_uninit(&data, d2.type); + nft_data_release(&data, d2.type); err2: - nft_data_uninit(&elem.key.val, d1.type); + nft_data_release(&elem.key.val, d1.type); err1: return err; } @@ -3896,6 +3908,53 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return err; } +/** + * nft_data_hold - hold a nft_data item + * + * @data: struct nft_data to release + * @type: type of data + * + * Hold a nft_data item. NFT_DATA_VALUE types can be silently discarded, + * NFT_DATA_VERDICT bumps the reference to chains in case of NFT_JUMP and + * NFT_GOTO verdicts. This function must be called on active data objects + * from the second phase of the commit protocol. + */ +static void nft_data_hold(const struct nft_data *data, enum nft_data_types type) +{ + if (type == NFT_DATA_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + data->verdict.chain->use++; + break; + } + } +} + +static void nft_set_elem_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + nft_data_hold(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + (*nft_set_ext_obj(ext))->use++; +} + +static void nft_set_elem_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + nft_data_release(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + (*nft_set_ext_obj(ext))->use--; +} + static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -3961,6 +4020,8 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, kfree(elem.priv); elem.priv = priv; + nft_set_elem_deactivate(ctx->net, set, &elem); + nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -3970,7 +4031,7 @@ err4: err3: kfree(elem.priv); err2: - nft_data_uninit(&elem.key.val, desc.type); + nft_data_release(&elem.key.val, desc.type); err1: return err; } @@ -4777,8 +4838,8 @@ static void nf_tables_commit_release(struct nft_trans *trans) nft_set_destroy(nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: - nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv, true); + nf_tables_set_elem_destroy(nft_trans_elem_set(trans), + nft_trans_elem(trans).priv); break; case NFT_MSG_DELOBJ: nft_obj_destroy(nft_trans_obj(trans)); @@ -5013,6 +5074,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; + nft_set_elem_activate(net, te->set, &te->elem); te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; @@ -5498,7 +5560,7 @@ int nft_data_init(const struct nft_ctx *ctx, EXPORT_SYMBOL_GPL(nft_data_init); /** - * nft_data_uninit - release a nft_data item + * nft_data_release - release a nft_data item * * @data: struct nft_data to release * @type: type of data @@ -5506,7 +5568,7 @@ EXPORT_SYMBOL_GPL(nft_data_init); * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded, * all others need to be released by calling this function. */ -void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) +void nft_data_release(const struct nft_data *data, enum nft_data_types type) { if (type < NFT_DATA_VERDICT) return; @@ -5517,7 +5579,7 @@ void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) WARN_ON(1); } } -EXPORT_SYMBOL_GPL(nft_data_uninit); +EXPORT_SYMBOL_GPL(nft_data_release); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 96bd4f325b0f..fff8073e2a56 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -99,9 +99,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, return 0; err2: - nft_data_uninit(&priv->xor, d2.type); + nft_data_release(&priv->xor, d2.type); err1: - nft_data_uninit(&priv->mask, d1.type); + nft_data_release(&priv->mask, d1.type); return err; } diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 8c9d0fb19118..c2945eb3397c 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -211,7 +211,7 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return &nft_cmp_ops; err1: - nft_data_uninit(&data, desc.type); + nft_data_release(&data, desc.type); return ERR_PTR(-EINVAL); } diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 728baf88295a..4717d7796927 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -65,7 +65,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx, return 0; err1: - nft_data_uninit(&priv->data, desc.type); + nft_data_release(&priv->data, desc.type); return err; } @@ -73,7 +73,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); - return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); + + return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 9edc74eedc10..cedb96c3619f 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -102,9 +102,9 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr priv->len = desc_from.len; return 0; err2: - nft_data_uninit(&priv->data_to, desc_to.type); + nft_data_release(&priv->data_to, desc_to.type); err1: - nft_data_uninit(&priv->data_from, desc_from.type); + nft_data_release(&priv->data_from, desc_from.type); return err; } From fa16b69f1299004b60b625f181143500a246e5cb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 16 May 2017 09:11:33 +0200 Subject: [PATCH 028/254] ALSA: hda - No loopback on ALC299 codec ALC299 has no loopback mixer, but the driver still tries to add a beep control over the mixer NID which leads to the error at accessing it. This patch fixes it by properly declaring mixer_nid=0 for this codec. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195775 Fixes: 28f1f9b26cee ("ALSA: hda/realtek - Add new codec ID ALC299") Cc: stable@vger.kernel.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 58df440013c5..9c22ad694534 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6465,8 +6465,11 @@ static int patch_alc269(struct hda_codec *codec) break; case 0x10ec0225: case 0x10ec0295: + spec->codec_variant = ALC269_TYPE_ALC225; + break; case 0x10ec0299: spec->codec_variant = ALC269_TYPE_ALC225; + spec->gen.mixer_nid = 0; /* no loopback on ALC299 */ break; case 0x10ec0234: case 0x10ec0274: From c953d63548207a085abcb12a15fefc8a11ffdf0a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 16 May 2017 09:30:18 +0800 Subject: [PATCH 029/254] ebtables: arpreply: Add the standard target sanity check The info->target comes from userspace and it would be used directly. So we need to add the sanity check to make sure it is a valid standard target, although the ebtables tool has already checked it. Kernel needs to validate anything coming from userspace. If the target is set as an evil value, it would break the ebtables and cause a panic. Because the non-standard target is treated as one offset. Now add one helper function ebt_invalid_target, and we would replace the macro INVALID_TARGET later. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 5 +++++ net/bridge/netfilter/ebt_arpreply.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index a30efb437e6d..e0cbf17af780 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -125,4 +125,9 @@ extern unsigned int ebt_do_table(struct sk_buff *skb, /* True if the target is not a standard target */ #define INVALID_TARGET (info->target < -NUM_STANDARD_TARGETS || info->target >= 0) +static inline bool ebt_invalid_target(int target) +{ + return (target < -NUM_STANDARD_TARGETS || target >= 0); +} + #endif diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c index 5929309beaa1..db85230e49c3 100644 --- a/net/bridge/netfilter/ebt_arpreply.c +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -68,6 +68,9 @@ static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par) if (e->ethproto != htons(ETH_P_ARP) || e->invflags & EBT_IPROTO) return -EINVAL; + if (ebt_invalid_target(info->target)) + return -EINVAL; + return 0; } From 0daaecacb83bc6b656a56393ab77a31c28139bc7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 12 May 2017 10:44:08 -0700 Subject: [PATCH 030/254] xfs: fix indlen accounting error on partial delalloc conversion The delalloc -> real block conversion path uses an incorrect calculation in the case where the middle part of a delalloc extent is being converted. This is documented as a rare situation because XFS generally attempts to maximize contiguity by converting as much of a delalloc extent as possible. If this situation does occur, the indlen reservation for the two new delalloc extents left behind by the conversion of the middle range is calculated and compared with the original reservation. If more blocks are required, the delta is allocated from the global block pool. This delta value can be characterized as the difference between the new total requirement (temp + temp2) and the currently available reservation minus those blocks that have already been allocated (startblockval(PREV.br_startblock) - allocated). The problem is that the current code does not account for previously allocated blocks correctly. It subtracts the current allocation count from the (new - old) delta rather than the old indlen reservation. This means that more indlen blocks than have been allocated end up stashed in the remaining extents and free space accounting is broken as a result. Fix up the calculation to subtract the allocated block count from the original extent indlen and thus correctly allocate the reservation delta based on the difference between the new total requirement and the unused blocks from the original reservation. Also remove a bogus assert that contradicts the fact that the new indlen reservation can be larger than the original indlen reservation. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f02eb7673392..8adb91b05588 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2065,8 +2065,10 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(bma->ip, temp); temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + diff = (int)(temp + temp2 - + (startblockval(PREV.br_startblock) - + (bma->cur ? + bma->cur->bc_private.b.allocated : 0))); if (diff > 0) { error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false); @@ -2123,7 +2125,6 @@ xfs_bmap_add_extent_delay_real( temp = da_new; if (bma->cur) temp += bma->cur->bc_private.b.allocated; - ASSERT(temp <= da_old); if (temp < da_old) xfs_mod_fdblocks(bma->ip->i_mount, (int64_t)(da_old - temp), false); From 6eadbf4c8ba816c10d1c97bed9aa861d9fd17809 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:08 -0700 Subject: [PATCH 031/254] xfs: BMAPX shouldn't barf on inline-format directories When we're fulfilling a BMAPX request, jump out early if the data fork is in local format. This prevents us from hitting a debugging check in bmapi_read and barfing errors back to userspace. The on-disk extent count check later isn't sufficient for IF_DELALLOC mode because da extents are in memory and not on disk. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_bmap_util.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2b954308a1d6..2e8851ee6759 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -582,9 +582,13 @@ xfs_getbmap( } break; default: + /* Local format data forks report no extents. */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + bmv->bmv_entries = 0; + return 0; + } if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) return -EINVAL; if (xfs_get_extsz_hint(ip) || From 6e747506dde195d3d05fe2bb8ef78aceba28a5e3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:11 -0700 Subject: [PATCH 032/254] xfs: fix warnings about unused stack variables Reduce stack usage and get rid of compiler warnings by eliminating unused variables. Signed-off-by: Darrick J. Wong Reviewed-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_bmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8adb91b05588..a7048eafa8e6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1280,7 +1280,6 @@ xfs_bmap_read_extents( xfs_bmbt_rec_t *frp; xfs_fsblock_t nextbno; xfs_extnum_t num_recs; - xfs_extnum_t start; num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { @@ -1303,7 +1302,6 @@ xfs_bmap_read_extents( * Copy records into the extent records. */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; for (j = 0; j < num_recs; j++, i++, frp++) { xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); trp->l0 = be64_to_cpu(frp->l0); From 892d2a5f705723b2cb488bfb38bcbdcf83273184 Mon Sep 17 00:00:00 2001 From: Zorro Lang Date: Mon, 15 May 2017 08:40:02 -0700 Subject: [PATCH 033/254] xfs: bad assertion for delalloc an extent that start at i_size By run fsstress long enough time enough in RHEL-7, I find an assertion failure (harder to reproduce on linux-4.11, but problem is still there): XFS: Assertion failed: (iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c The assertion is in xfs_getbmap() funciton: if (map[i].br_startblock == DELAYSTARTBLOCK && --> map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); When map[i].br_startoff == XFS_B_TO_FSB(mp, XFS_ISIZE(ip)), the startoff is just at EOF. But we only need to make sure delalloc extents that are within EOF, not include EOF. Signed-off-by: Zorro Lang Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2e8851ee6759..9e3cc2146d5b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -716,7 +716,7 @@ xfs_getbmap( * extents. */ if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); if (map[i].br_startblock == HOLESTARTBLOCK && From ea9a46e1c49251331dbfda19ced7114337966178 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:10 -0700 Subject: [PATCH 034/254] xfs: only return detailed fsmap info if the caller has CAP_SYS_ADMIN There were a number of handwaving complaints that one could "possibly" use inode numbers and extent maps to fingerprint a filesystem hosting multiple containers and somehow use the information to guess at the contents of other containers and attack them. Despite the total lack of any demonstration that this is actually possible, it's easier to restrict access now and broaden it later, so use the rmapbt fsmap backends only if the caller has CAP_SYS_ADMIN. Unprivileged users will just have to make do with only getting the free space and static metadata placement information. Signed-off-by: Darrick J. Wong Reviewed-by: Carlos Maiolino --- fs/xfs/xfs_fsmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3683819887a5..814ed729881d 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -828,6 +828,7 @@ xfs_getfsmap( struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; struct xfs_getfsmap_info info = { NULL }; + bool use_rmap; int i; int error = 0; @@ -837,12 +838,14 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; + use_rmap = capable(CAP_SYS_ADMIN) && + xfs_sb_version_hasrmapbt(&mp->m_sb); head->fmh_entries = 0; /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; From 4fa8324461b824eaea9b6695464395710fe20c44 Mon Sep 17 00:00:00 2001 From: Derek Basehore Date: Thu, 11 May 2017 14:34:24 +0200 Subject: [PATCH 035/254] scsi: sd: Ignore sync cache failures when not supported Some external hard drives don't support the sync command even though the hard drive has write cache enabled. In this case, upon suspend request, sync cache failures are ignored if the error code in the sense header is ILLEGAL_REQUEST. There's not much we can do for these drives, so we shouldn't fail to suspend for this error case. The drive may stay powered if that's the setup for the port it's plugged into. Signed-off-by: Derek Basehore Signed-off-by: Thierry Escande Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index de9e2f2ef662..b6bb4e0ce0e3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1582,17 +1582,21 @@ out: return retval; } -static int sd_sync_cache(struct scsi_disk *sdkp) +static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { int retries, res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; - struct scsi_sense_hdr sshdr; + struct scsi_sense_hdr my_sshdr; if (!scsi_device_online(sdp)) return -ENODEV; + /* caller might not be interested in sense, but we need it */ + if (!sshdr) + sshdr = &my_sshdr; + for (retries = 3; retries > 0; --retries) { unsigned char cmd[10] = { 0 }; @@ -1601,7 +1605,7 @@ static int sd_sync_cache(struct scsi_disk *sdkp) * Leave the rest of the command zero to indicate * flush everything. */ - res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, &sshdr, + res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, sshdr, timeout, SD_MAX_RETRIES, 0, RQF_PM, NULL); if (res == 0) break; @@ -1611,11 +1615,12 @@ static int sd_sync_cache(struct scsi_disk *sdkp) sd_print_result(sdkp, "Synchronize Cache(10) failed", res); if (driver_byte(res) & DRIVER_SENSE) - sd_print_sense_hdr(sdkp, &sshdr); + sd_print_sense_hdr(sdkp, sshdr); + /* we need to evaluate the error return */ - if (scsi_sense_valid(&sshdr) && - (sshdr.asc == 0x3a || /* medium not present */ - sshdr.asc == 0x20)) /* invalid command */ + if (scsi_sense_valid(sshdr) && + (sshdr->asc == 0x3a || /* medium not present */ + sshdr->asc == 0x20)) /* invalid command */ /* this is no error here */ return 0; @@ -3459,7 +3464,7 @@ static void sd_shutdown(struct device *dev) if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - sd_sync_cache(sdkp); + sd_sync_cache(sdkp, NULL); } if (system_state != SYSTEM_RESTART && sdkp->device->manage_start_stop) { @@ -3471,6 +3476,7 @@ static void sd_shutdown(struct device *dev) static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) { struct scsi_disk *sdkp = dev_get_drvdata(dev); + struct scsi_sense_hdr sshdr; int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ @@ -3478,12 +3484,23 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - ret = sd_sync_cache(sdkp); + ret = sd_sync_cache(sdkp, &sshdr); + if (ret) { /* ignore OFFLINE device */ if (ret == -ENODEV) - ret = 0; - goto done; + return 0; + + if (!scsi_sense_valid(&sshdr) || + sshdr.sense_key != ILLEGAL_REQUEST) + return ret; + + /* + * sshdr.sense_key == ILLEGAL_REQUEST means this drive + * doesn't support sync. There's not much to do and + * suspend shouldn't fail. + */ + ret = 0; } } @@ -3495,7 +3512,6 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) ret = 0; } -done: return ret; } From dd6e1f71b785a6ac2511e2ddb86315f292873e59 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 11 May 2017 17:24:44 -0500 Subject: [PATCH 036/254] scsi: libfc: fix incorrect variable assignment Previous assignment was causing the use of the uninitialized variable _explan_ inside fc_seq_ls_rjt() function, which in this particular case is being called by fc_seq_els_rsp_send(). [mkp: fixed typo] Addresses-Coverity-ID: 1398125 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/libfc/fc_rport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index b44c3136eb51..520325867e2b 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -1422,7 +1422,7 @@ static void fc_rport_recv_rtv_req(struct fc_rport_priv *rdata, fp = fc_frame_alloc(lport, sizeof(*rtv)); if (!fp) { rjt_data.reason = ELS_RJT_UNAB; - rjt_data.reason = ELS_EXPL_INSUF_RES; + rjt_data.explan = ELS_EXPL_INSUF_RES; fc_seq_els_rsp_send(in_fp, ELS_LS_RJT, &rjt_data); goto drop; } From 845d9e8df2fa879e6494e786f290e1fd5560ac8c Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:38 -0700 Subject: [PATCH 037/254] scsi: lpfc: Fix used-RPI accounting problem. With 255 vports created a link trasition can casue a crash. When going through discovery after a link bounce the driver is using rpis before the cmd FCOE_POST_HDR_TEMPLATES completes. By doing that the next rpi bumps the rpi range out of the boundary. The fix it to increment the next_rpi only when the FCOE_POST_HDR_TEMPLATE succeeds. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 3 ++- drivers/scsi/lpfc/lpfc_init.c | 24 +++++------------------- drivers/scsi/lpfc/lpfc_sli.c | 8 ++++++++ drivers/scsi/lpfc/lpfc_sli4.h | 1 + 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 67827e397431..3f9f6d5f8c69 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -8667,7 +8667,8 @@ lpfc_cmpl_els_fdisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, lpfc_do_scr_ns_plogi(phba, vport); goto out; fdisc_failed: - if (vport->fc_vport->vport_state != FC_VPORT_NO_FABRIC_RSCS) + if (vport->fc_vport && + (vport->fc_vport->vport_state != FC_VPORT_NO_FABRIC_RSCS)) lpfc_vport_set_state(vport, FC_VPORT_FAILED); /* Cancel discovery timer */ lpfc_can_disctmo(vport); diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 4b1eb98c228d..b1b181a756dc 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -6525,7 +6525,6 @@ lpfc_sli4_create_rpi_hdr(struct lpfc_hba *phba) uint16_t rpi_limit, curr_rpi_range; struct lpfc_dmabuf *dmabuf; struct lpfc_rpi_hdr *rpi_hdr; - uint32_t rpi_count; /* * If the SLI4 port supports extents, posting the rpi header isn't @@ -6538,8 +6537,7 @@ lpfc_sli4_create_rpi_hdr(struct lpfc_hba *phba) return NULL; /* The limit on the logical index is just the max_rpi count. */ - rpi_limit = phba->sli4_hba.max_cfg_param.rpi_base + - phba->sli4_hba.max_cfg_param.max_rpi - 1; + rpi_limit = phba->sli4_hba.max_cfg_param.max_rpi; spin_lock_irq(&phba->hbalock); /* @@ -6550,18 +6548,10 @@ lpfc_sli4_create_rpi_hdr(struct lpfc_hba *phba) curr_rpi_range = phba->sli4_hba.next_rpi; spin_unlock_irq(&phba->hbalock); - /* - * The port has a limited number of rpis. The increment here - * is LPFC_RPI_HDR_COUNT - 1 to account for the starting value - * and to allow the full max_rpi range per port. - */ - if ((curr_rpi_range + (LPFC_RPI_HDR_COUNT - 1)) > rpi_limit) - rpi_count = rpi_limit - curr_rpi_range; - else - rpi_count = LPFC_RPI_HDR_COUNT; - - if (!rpi_count) + /* Reached full RPI range */ + if (curr_rpi_range == rpi_limit) return NULL; + /* * First allocate the protocol header region for the port. The * port expects a 4KB DMA-mapped memory region that is 4K aligned. @@ -6595,13 +6585,9 @@ lpfc_sli4_create_rpi_hdr(struct lpfc_hba *phba) /* The rpi_hdr stores the logical index only. */ rpi_hdr->start_rpi = curr_rpi_range; + rpi_hdr->next_rpi = phba->sli4_hba.next_rpi + LPFC_RPI_HDR_COUNT; list_add_tail(&rpi_hdr->list, &phba->sli4_hba.lpfc_rpi_hdr_list); - /* - * The next_rpi stores the next logical module-64 rpi value used - * to post physical rpis in subsequent rpi postings. - */ - phba->sli4_hba.next_rpi += rpi_count; spin_unlock_irq(&phba->hbalock); return rpi_hdr; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 2a4fc00dfa9b..e2d25ae5ba45 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -17137,6 +17137,14 @@ lpfc_sli4_post_rpi_hdr(struct lpfc_hba *phba, struct lpfc_rpi_hdr *rpi_page) "status x%x add_status x%x, mbx status x%x\n", shdr_status, shdr_add_status, rc); rc = -ENXIO; + } else { + /* + * The next_rpi stores the next logical module-64 rpi value used + * to post physical rpis in subsequent rpi postings. + */ + spin_lock_irq(&phba->hbalock); + phba->sli4_hba.next_rpi = rpi_page->next_rpi; + spin_unlock_irq(&phba->hbalock); } return rc; } diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index da46471337c8..915e8d5581bd 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -698,6 +698,7 @@ struct lpfc_rpi_hdr { struct lpfc_dmabuf *dmabuf; uint32_t page_count; uint32_t start_rpi; + uint16_t next_rpi; }; struct lpfc_rsrc_blks { From 0c9c6a75141810acade82add4f4708959a5d3a1d Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:39 -0700 Subject: [PATCH 038/254] scsi: lpfc: Fix system crash when port is reset. The driver panic when using the els_wq during port reset. Check for NULL els_wq before dereferencing. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 8 ++++++-- drivers/scsi/lpfc/lpfc_hbadisc.c | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 6d7840b096e6..62571fa9c6ad 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -1228,7 +1228,11 @@ lpfc_sli_read_hs(struct lpfc_hba *phba) static inline struct lpfc_sli_ring * lpfc_phba_elsring(struct lpfc_hba *phba) { - if (phba->sli_rev == LPFC_SLI_REV4) - return phba->sli4_hba.els_wq->pring; + if (phba->sli_rev == LPFC_SLI_REV4) { + if (phba->sli4_hba.els_wq) + return phba->sli4_hba.els_wq->pring; + else + return NULL; + } return &phba->sli.sli3_ring[LPFC_ELS_RING]; } diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 0482c5580331..dcc9b3858778 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -693,9 +693,9 @@ lpfc_work_done(struct lpfc_hba *phba) pring = lpfc_phba_elsring(phba); status = (ha_copy & (HA_RXMASK << (4*LPFC_ELS_RING))); status >>= (4*LPFC_ELS_RING); - if ((status & HA_RXMASK) || - (pring->flag & LPFC_DEFERRED_RING_EVENT) || - (phba->hba_flag & HBA_SP_QUEUE_EVT)) { + if (pring && (status & HA_RXMASK || + pring->flag & LPFC_DEFERRED_RING_EVENT || + phba->hba_flag & HBA_SP_QUEUE_EVT)) { if (pring->flag & LPFC_STOP_IOCB_EVENT) { pring->flag |= LPFC_DEFERRED_RING_EVENT; /* Set the lpfc data pending flag */ From 547077a44b3b49f56c0f05c0b46c8c617dea591d Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:40 -0700 Subject: [PATCH 039/254] scsi: lpfc: Adding additional stats counters for nvme. More debug messages added for nvme statistics. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_attr.c | 24 ++++++++++------- drivers/scsi/lpfc/lpfc_debugfs.c | 27 +++++++++++-------- drivers/scsi/lpfc/lpfc_nvmet.c | 46 ++++++++++++++++++++++++-------- drivers/scsi/lpfc/lpfc_nvmet.h | 12 +++++---- drivers/scsi/lpfc/lpfc_sli.c | 38 ++++++++++++++++++++++---- drivers/scsi/lpfc/lpfc_sli4.h | 2 +- 6 files changed, 106 insertions(+), 43 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 4830370bfab1..41ec7451689b 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -205,8 +205,9 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, atomic_read(&tgtp->xmt_ls_rsp_error)); len += snprintf(buf+len, PAGE_SIZE-len, - "FCP: Rcv %08x Drop %08x\n", + "FCP: Rcv %08x Release %08x Drop %08x\n", atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->xmt_fcp_release), atomic_read(&tgtp->rcv_fcp_cmd_drop)); if (atomic_read(&tgtp->rcv_fcp_cmd_in) != @@ -218,15 +219,12 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, } len += snprintf(buf+len, PAGE_SIZE-len, - "FCP Rsp: RD %08x rsp %08x WR %08x rsp %08x\n", + "FCP Rsp: RD %08x rsp %08x WR %08x rsp %08x " + "drop %08x\n", atomic_read(&tgtp->xmt_fcp_read), atomic_read(&tgtp->xmt_fcp_read_rsp), atomic_read(&tgtp->xmt_fcp_write), - atomic_read(&tgtp->xmt_fcp_rsp)); - - len += snprintf(buf+len, PAGE_SIZE-len, - "FCP Rsp: abort %08x drop %08x\n", - atomic_read(&tgtp->xmt_fcp_abort), + atomic_read(&tgtp->xmt_fcp_rsp), atomic_read(&tgtp->xmt_fcp_drop)); len += snprintf(buf+len, PAGE_SIZE-len, @@ -236,10 +234,16 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, atomic_read(&tgtp->xmt_fcp_rsp_drop)); len += snprintf(buf+len, PAGE_SIZE-len, - "ABORT: Xmt %08x Err %08x Cmpl %08x", + "ABORT: Xmt %08x Cmpl %08x\n", + atomic_read(&tgtp->xmt_fcp_abort), + atomic_read(&tgtp->xmt_fcp_abort_cmpl)); + + len += snprintf(buf + len, PAGE_SIZE - len, + "ABORT: Sol %08x Usol %08x Err %08x Cmpl %08x", + atomic_read(&tgtp->xmt_abort_sol), + atomic_read(&tgtp->xmt_abort_unsol), atomic_read(&tgtp->xmt_abort_rsp), - atomic_read(&tgtp->xmt_abort_rsp_error), - atomic_read(&tgtp->xmt_abort_cmpl)); + atomic_read(&tgtp->xmt_abort_rsp_error)); len += snprintf(buf+len, PAGE_SIZE-len, "\n"); return len; diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index fce549a91911..a41daedeb967 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -797,11 +797,6 @@ lpfc_debugfs_nvmestat_data(struct lpfc_vport *vport, char *buf, int size) atomic_read(&tgtp->xmt_fcp_write), atomic_read(&tgtp->xmt_fcp_rsp)); - len += snprintf(buf + len, size - len, - "FCP Rsp: abort %08x drop %08x\n", - atomic_read(&tgtp->xmt_fcp_abort), - atomic_read(&tgtp->xmt_fcp_drop)); - len += snprintf(buf + len, size - len, "FCP Rsp Cmpl: %08x err %08x drop %08x\n", atomic_read(&tgtp->xmt_fcp_rsp_cmpl), @@ -809,10 +804,16 @@ lpfc_debugfs_nvmestat_data(struct lpfc_vport *vport, char *buf, int size) atomic_read(&tgtp->xmt_fcp_rsp_drop)); len += snprintf(buf + len, size - len, - "ABORT: Xmt %08x Err %08x Cmpl %08x", + "ABORT: Xmt %08x Cmpl %08x\n", + atomic_read(&tgtp->xmt_fcp_abort), + atomic_read(&tgtp->xmt_fcp_abort_cmpl)); + + len += snprintf(buf + len, size - len, + "ABORT: Sol %08x Usol %08x Err %08x Cmpl %08x", + atomic_read(&tgtp->xmt_abort_sol), + atomic_read(&tgtp->xmt_abort_unsol), atomic_read(&tgtp->xmt_abort_rsp), - atomic_read(&tgtp->xmt_abort_rsp_error), - atomic_read(&tgtp->xmt_abort_cmpl)); + atomic_read(&tgtp->xmt_abort_rsp_error)); len += snprintf(buf + len, size - len, "\n"); @@ -1959,6 +1960,7 @@ lpfc_debugfs_nvmestat_write(struct file *file, const char __user *buf, atomic_set(&tgtp->rcv_ls_req_out, 0); atomic_set(&tgtp->rcv_ls_req_drop, 0); atomic_set(&tgtp->xmt_ls_abort, 0); + atomic_set(&tgtp->xmt_ls_abort_cmpl, 0); atomic_set(&tgtp->xmt_ls_rsp, 0); atomic_set(&tgtp->xmt_ls_drop, 0); atomic_set(&tgtp->xmt_ls_rsp_error, 0); @@ -1967,19 +1969,22 @@ lpfc_debugfs_nvmestat_write(struct file *file, const char __user *buf, atomic_set(&tgtp->rcv_fcp_cmd_in, 0); atomic_set(&tgtp->rcv_fcp_cmd_out, 0); atomic_set(&tgtp->rcv_fcp_cmd_drop, 0); - atomic_set(&tgtp->xmt_fcp_abort, 0); atomic_set(&tgtp->xmt_fcp_drop, 0); atomic_set(&tgtp->xmt_fcp_read_rsp, 0); atomic_set(&tgtp->xmt_fcp_read, 0); atomic_set(&tgtp->xmt_fcp_write, 0); atomic_set(&tgtp->xmt_fcp_rsp, 0); + atomic_set(&tgtp->xmt_fcp_release, 0); atomic_set(&tgtp->xmt_fcp_rsp_cmpl, 0); atomic_set(&tgtp->xmt_fcp_rsp_error, 0); atomic_set(&tgtp->xmt_fcp_rsp_drop, 0); + atomic_set(&tgtp->xmt_fcp_abort, 0); + atomic_set(&tgtp->xmt_fcp_abort_cmpl, 0); + atomic_set(&tgtp->xmt_abort_sol, 0); + atomic_set(&tgtp->xmt_abort_unsol, 0); atomic_set(&tgtp->xmt_abort_rsp, 0); atomic_set(&tgtp->xmt_abort_rsp_error, 0); - atomic_set(&tgtp->xmt_abort_cmpl, 0); } return nbytes; } @@ -3143,7 +3148,7 @@ __lpfc_idiag_print_rqpair(struct lpfc_queue *qp, struct lpfc_queue *datqp, "\t\t%s RQ info: ", rqtype); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "AssocCQID[%02d]: RQ-STAT[nopost:x%x nobuf:x%x " - "trunc:x%x rcv:x%llx]\n", + "posted:x%x rcv:x%llx]\n", qp->assoc_qid, qp->q_cnt_1, qp->q_cnt_2, qp->q_cnt_3, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 94434e621c33..bb12e2c9fbf4 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -502,6 +502,7 @@ lpfc_nvmet_xmt_ls_rsp(struct nvmet_fc_target_port *tgtport, "6150 LS Drop IO x%x: Prep\n", ctxp->oxid); lpfc_in_buf_free(phba, &nvmebuf->dbuf); + atomic_inc(&nvmep->xmt_ls_abort); lpfc_nvmet_unsol_ls_issue_abort(phba, ctxp, ctxp->sid, ctxp->oxid); return -ENOMEM; @@ -545,6 +546,7 @@ lpfc_nvmet_xmt_ls_rsp(struct nvmet_fc_target_port *tgtport, lpfc_nlp_put(nvmewqeq->context1); lpfc_in_buf_free(phba, &nvmebuf->dbuf); + atomic_inc(&nvmep->xmt_ls_abort); lpfc_nvmet_unsol_ls_issue_abort(phba, ctxp, ctxp->sid, ctxp->oxid); return -ENXIO; } @@ -692,6 +694,7 @@ static void lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *rsp) { + struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private; struct lpfc_nvmet_rcv_ctx *ctxp = container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req); struct lpfc_hba *phba = ctxp->phba; @@ -710,6 +713,8 @@ lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport, lpfc_nvmeio_data(phba, "NVMET FCP FREE: xri x%x ste %d\n", ctxp->oxid, ctxp->state, 0); + atomic_inc(&lpfc_nvmep->xmt_fcp_release); + if (aborting) return; @@ -796,6 +801,7 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) atomic_set(&tgtp->rcv_ls_req_out, 0); atomic_set(&tgtp->rcv_ls_req_drop, 0); atomic_set(&tgtp->xmt_ls_abort, 0); + atomic_set(&tgtp->xmt_ls_abort_cmpl, 0); atomic_set(&tgtp->xmt_ls_rsp, 0); atomic_set(&tgtp->xmt_ls_drop, 0); atomic_set(&tgtp->xmt_ls_rsp_error, 0); @@ -803,18 +809,21 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) atomic_set(&tgtp->rcv_fcp_cmd_in, 0); atomic_set(&tgtp->rcv_fcp_cmd_out, 0); atomic_set(&tgtp->rcv_fcp_cmd_drop, 0); - atomic_set(&tgtp->xmt_fcp_abort, 0); atomic_set(&tgtp->xmt_fcp_drop, 0); atomic_set(&tgtp->xmt_fcp_read_rsp, 0); atomic_set(&tgtp->xmt_fcp_read, 0); atomic_set(&tgtp->xmt_fcp_write, 0); atomic_set(&tgtp->xmt_fcp_rsp, 0); + atomic_set(&tgtp->xmt_fcp_release, 0); atomic_set(&tgtp->xmt_fcp_rsp_cmpl, 0); atomic_set(&tgtp->xmt_fcp_rsp_error, 0); atomic_set(&tgtp->xmt_fcp_rsp_drop, 0); + atomic_set(&tgtp->xmt_fcp_abort, 0); + atomic_set(&tgtp->xmt_fcp_abort_cmpl, 0); + atomic_set(&tgtp->xmt_abort_unsol, 0); + atomic_set(&tgtp->xmt_abort_sol, 0); atomic_set(&tgtp->xmt_abort_rsp, 0); atomic_set(&tgtp->xmt_abort_rsp_error, 0); - atomic_set(&tgtp->xmt_abort_cmpl, 0); } return error; } @@ -1011,6 +1020,7 @@ lpfc_nvmet_unsol_ls_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, oxid = 0; size = 0; sid = 0; + ctxp = NULL; goto dropit; } @@ -1117,6 +1127,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, oxid = 0; size = 0; sid = 0; + ctxp = NULL; goto dropit; } @@ -1193,8 +1204,11 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, atomic_inc(&tgtp->rcv_fcp_cmd_drop); lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - "6159 FCP Drop IO x%x: err x%x\n", - ctxp->oxid, rc); + "6159 FCP Drop IO x%x: err x%x: x%x x%x x%x\n", + ctxp->oxid, rc, + atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->rcv_fcp_cmd_out), + atomic_read(&tgtp->xmt_fcp_release)); dropit: lpfc_nvmeio_data(phba, "NVMET FCP DROP: xri x%x sz %d from %06x\n", oxid, size, sid); @@ -1206,7 +1220,7 @@ dropit: if (nvmebuf) { nvmebuf->iocbq->hba_wqidx = 0; /* We assume a rcv'ed cmd ALWAYs fits into 1 buffer */ - lpfc_nvmet_rq_post(phba, NULL, &nvmebuf->hbuf); + lpfc_nvmet_rq_post(phba, ctxp, &nvmebuf->hbuf); } #endif } @@ -1812,7 +1826,8 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, result = wcqe->parameter; tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; - atomic_inc(&tgtp->xmt_abort_cmpl); + if (ctxp->flag & LPFC_NVMET_ABORT_OP) + atomic_inc(&tgtp->xmt_fcp_abort_cmpl); ctxp->state = LPFC_NVMET_STE_DONE; @@ -1827,6 +1842,7 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, } ctxp->flag &= ~LPFC_NVMET_ABORT_OP; spin_unlock_irqrestore(&ctxp->ctxlock, flags); + atomic_inc(&tgtp->xmt_abort_rsp); lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS, "6165 ABORT cmpl: xri x%x flg x%x (%d) " @@ -1877,7 +1893,8 @@ lpfc_nvmet_unsol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, result = wcqe->parameter; tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; - atomic_inc(&tgtp->xmt_abort_cmpl); + if (ctxp->flag & LPFC_NVMET_ABORT_OP) + atomic_inc(&tgtp->xmt_fcp_abort_cmpl); if (!ctxp) { /* if context is clear, related io alrady complete */ @@ -1907,6 +1924,7 @@ lpfc_nvmet_unsol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, } ctxp->flag &= ~LPFC_NVMET_ABORT_OP; spin_unlock_irqrestore(&ctxp->ctxlock, flags); + atomic_inc(&tgtp->xmt_abort_rsp); lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, "6316 ABTS cmpl xri x%x flg x%x (%x) " @@ -1953,7 +1971,7 @@ lpfc_nvmet_xmt_ls_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, result = wcqe->parameter; tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; - atomic_inc(&tgtp->xmt_abort_cmpl); + atomic_inc(&tgtp->xmt_ls_abort_cmpl); lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, "6083 Abort cmpl: ctx %p WCQE: %08x %08x %08x %08x\n", @@ -2104,6 +2122,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, /* Issue ABTS for this WQE based on iotag */ ctxp->abort_wqeq = lpfc_sli_get_iocbq(phba); if (!ctxp->abort_wqeq) { + atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS, "6161 ABORT failed: No wqeqs: " "xri: x%x\n", ctxp->oxid); @@ -2128,6 +2147,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, /* driver queued commands are in process of being flushed */ if (phba->hba_flag & HBA_NVME_IOQ_FLUSH) { spin_unlock_irqrestore(&phba->hbalock, flags); + atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_ERR, LOG_NVME, "6163 Driver in reset cleanup - flushing " "NVME Req now. hba_flag x%x oxid x%x\n", @@ -2140,6 +2160,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, /* Outstanding abort is in progress */ if (abts_wqeq->iocb_flag & LPFC_DRIVER_ABORTED) { spin_unlock_irqrestore(&phba->hbalock, flags); + atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_ERR, LOG_NVME, "6164 Outstanding NVME I/O Abort Request " "still pending on oxid x%x\n", @@ -2190,9 +2211,12 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, abts_wqeq->context2 = ctxp; rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, abts_wqeq); spin_unlock_irqrestore(&phba->hbalock, flags); - if (rc == WQE_SUCCESS) + if (rc == WQE_SUCCESS) { + atomic_inc(&tgtp->xmt_abort_sol); return 0; + } + atomic_inc(&tgtp->xmt_abort_rsp_error); ctxp->flag &= ~LPFC_NVMET_ABORT_OP; lpfc_sli_release_iocbq(phba, abts_wqeq); lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS, @@ -2231,11 +2255,11 @@ lpfc_nvmet_unsol_fcp_issue_abort(struct lpfc_hba *phba, rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, abts_wqeq); spin_unlock_irqrestore(&phba->hbalock, flags); if (rc == WQE_SUCCESS) { - atomic_inc(&tgtp->xmt_abort_rsp); return 0; } aerr: + atomic_inc(&tgtp->xmt_abort_rsp_error); ctxp->flag &= ~LPFC_NVMET_ABORT_OP; atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS, @@ -2279,7 +2303,7 @@ lpfc_nvmet_unsol_ls_issue_abort(struct lpfc_hba *phba, rc = lpfc_sli4_issue_wqe(phba, LPFC_ELS_RING, abts_wqeq); spin_unlock_irqrestore(&phba->hbalock, flags); if (rc == WQE_SUCCESS) { - atomic_inc(&tgtp->xmt_abort_rsp); + atomic_inc(&tgtp->xmt_abort_unsol); return 0; } diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h index 128759fe6650..837210a3e7c8 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.h +++ b/drivers/scsi/lpfc/lpfc_nvmet.h @@ -34,6 +34,7 @@ struct lpfc_nvmet_tgtport { atomic_t rcv_ls_req_out; atomic_t rcv_ls_req_drop; atomic_t xmt_ls_abort; + atomic_t xmt_ls_abort_cmpl; /* Stats counters - lpfc_nvmet_xmt_ls_rsp */ atomic_t xmt_ls_rsp; @@ -47,9 +48,9 @@ struct lpfc_nvmet_tgtport { atomic_t rcv_fcp_cmd_in; atomic_t rcv_fcp_cmd_out; atomic_t rcv_fcp_cmd_drop; + atomic_t xmt_fcp_release; /* Stats counters - lpfc_nvmet_xmt_fcp_op */ - atomic_t xmt_fcp_abort; atomic_t xmt_fcp_drop; atomic_t xmt_fcp_read_rsp; atomic_t xmt_fcp_read; @@ -62,12 +63,13 @@ struct lpfc_nvmet_tgtport { atomic_t xmt_fcp_rsp_drop; - /* Stats counters - lpfc_nvmet_unsol_issue_abort */ + /* Stats counters - lpfc_nvmet_xmt_fcp_abort */ + atomic_t xmt_fcp_abort; + atomic_t xmt_fcp_abort_cmpl; + atomic_t xmt_abort_sol; + atomic_t xmt_abort_unsol; atomic_t xmt_abort_rsp; atomic_t xmt_abort_rsp_error; - - /* Stats counters - lpfc_nvmet_xmt_abort_cmp */ - atomic_t xmt_abort_cmpl; }; struct lpfc_nvmet_rcv_ctx { diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index e2d25ae5ba45..333c5094b97d 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -512,6 +512,7 @@ lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq, } else { return -EINVAL; } + hq->RQ_buf_posted += hq->entry_repost; writel(doorbell.word0, hq->db_regaddr); } return put_index; @@ -12788,6 +12789,7 @@ lpfc_sli4_sp_handle_rcqe(struct lpfc_hba *phba, struct lpfc_rcqe *rcqe) struct fc_frame_header *fc_hdr; struct lpfc_queue *hrq = phba->sli4_hba.hdr_rq; struct lpfc_queue *drq = phba->sli4_hba.dat_rq; + struct lpfc_nvmet_tgtport *tgtp; struct hbq_dmabuf *dma_buf; uint32_t status, rq_id; unsigned long iflags; @@ -12808,7 +12810,6 @@ lpfc_sli4_sp_handle_rcqe(struct lpfc_hba *phba, struct lpfc_rcqe *rcqe) case FC_STATUS_RQ_BUF_LEN_EXCEEDED: lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "2537 Receive Frame Truncated!!\n"); - hrq->RQ_buf_trunc++; case FC_STATUS_RQ_SUCCESS: lpfc_sli4_rq_release(hrq, drq); spin_lock_irqsave(&phba->hbalock, iflags); @@ -12819,6 +12820,7 @@ lpfc_sli4_sp_handle_rcqe(struct lpfc_hba *phba, struct lpfc_rcqe *rcqe) goto out; } hrq->RQ_rcv_buf++; + hrq->RQ_buf_posted--; memcpy(&dma_buf->cq_event.cqe.rcqe_cmpl, rcqe, sizeof(*rcqe)); /* If a NVME LS event (type 0x28), treat it as Fast path */ @@ -12832,8 +12834,21 @@ lpfc_sli4_sp_handle_rcqe(struct lpfc_hba *phba, struct lpfc_rcqe *rcqe) spin_unlock_irqrestore(&phba->hbalock, iflags); workposted = true; break; - case FC_STATUS_INSUFF_BUF_NEED_BUF: case FC_STATUS_INSUFF_BUF_FRM_DISC: + if (phba->nvmet_support) { + tgtp = phba->targetport->private; + lpfc_printf_log(phba, KERN_ERR, LOG_SLI | LOG_NVME, + "6402 RQE Error x%x, posted %d err_cnt " + "%d: %x %x %x\n", + status, hrq->RQ_buf_posted, + hrq->RQ_no_posted_buf, + atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->rcv_fcp_cmd_out), + atomic_read(&tgtp->xmt_fcp_release)); + } + /* fallthrough */ + + case FC_STATUS_INSUFF_BUF_NEED_BUF: hrq->RQ_no_posted_buf++; /* Post more buffers if possible */ spin_lock_irqsave(&phba->hbalock, iflags); @@ -13135,6 +13150,7 @@ lpfc_sli4_nvmet_handle_rcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, struct lpfc_queue *drq; struct rqb_dmabuf *dma_buf; struct fc_frame_header *fc_hdr; + struct lpfc_nvmet_tgtport *tgtp; uint32_t status, rq_id; unsigned long iflags; uint32_t fctl, idx; @@ -13165,8 +13181,6 @@ lpfc_sli4_nvmet_handle_rcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, case FC_STATUS_RQ_BUF_LEN_EXCEEDED: lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "6126 Receive Frame Truncated!!\n"); - hrq->RQ_buf_trunc++; - break; case FC_STATUS_RQ_SUCCESS: lpfc_sli4_rq_release(hrq, drq); spin_lock_irqsave(&phba->hbalock, iflags); @@ -13178,6 +13192,7 @@ lpfc_sli4_nvmet_handle_rcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, } spin_unlock_irqrestore(&phba->hbalock, iflags); hrq->RQ_rcv_buf++; + hrq->RQ_buf_posted--; fc_hdr = (struct fc_frame_header *)dma_buf->hbuf.virt; /* Just some basic sanity checks on FCP Command frame */ @@ -13200,8 +13215,21 @@ lpfc_sli4_nvmet_handle_rcqe(struct lpfc_hba *phba, struct lpfc_queue *cq, drop: lpfc_in_buf_free(phba, &dma_buf->dbuf); break; - case FC_STATUS_INSUFF_BUF_NEED_BUF: case FC_STATUS_INSUFF_BUF_FRM_DISC: + if (phba->nvmet_support) { + tgtp = phba->targetport->private; + lpfc_printf_log(phba, KERN_ERR, LOG_SLI | LOG_NVME, + "6401 RQE Error x%x, posted %d err_cnt " + "%d: %x %x %x\n", + status, hrq->RQ_buf_posted, + hrq->RQ_no_posted_buf, + atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->rcv_fcp_cmd_out), + atomic_read(&tgtp->xmt_fcp_release)); + } + /* fallthrough */ + + case FC_STATUS_INSUFF_BUF_NEED_BUF: hrq->RQ_no_posted_buf++; /* Post more buffers if possible */ spin_lock_irqsave(&phba->hbalock, iflags); diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 915e8d5581bd..7a8cbeb6a745 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -195,7 +195,7 @@ struct lpfc_queue { /* defines for RQ stats */ #define RQ_no_posted_buf q_cnt_1 #define RQ_no_buf_found q_cnt_2 -#define RQ_buf_trunc q_cnt_3 +#define RQ_buf_posted q_cnt_3 #define RQ_rcv_buf q_cnt_4 uint64_t isr_timestamp; From 61f3d4bf4f8f062cf6be143c9b7adbc3a017ea6e Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:41 -0700 Subject: [PATCH 040/254] scsi: lpfc: Fix nvmet RQ resource needs for large block writes. Large block writes to the nvme target were failing because the default number of RQs posted was insufficient. Expand the NVMET RQs to 2048 RQEs and ensure a minimum of 512 RQEs are posted, no matter how many MRQs are configured. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_attr.c | 6 ++--- drivers/scsi/lpfc/lpfc_init.c | 23 ++++++++++------ drivers/scsi/lpfc/lpfc_nvmet.c | 2 +- drivers/scsi/lpfc/lpfc_nvmet.h | 1 + drivers/scsi/lpfc/lpfc_sli.c | 49 +++++++--------------------------- drivers/scsi/lpfc/lpfc_sli4.h | 2 +- 6 files changed, 31 insertions(+), 52 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 41ec7451689b..129d6cd7635b 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -60,9 +60,9 @@ #define LPFC_MIN_DEVLOSS_TMO 1 #define LPFC_MAX_DEVLOSS_TMO 255 -#define LPFC_DEF_MRQ_POST 256 -#define LPFC_MIN_MRQ_POST 32 -#define LPFC_MAX_MRQ_POST 512 +#define LPFC_DEF_MRQ_POST 512 +#define LPFC_MIN_MRQ_POST 512 +#define LPFC_MAX_MRQ_POST 2048 /* * Write key size should be multiple of 4. If write key is changed diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index b1b181a756dc..5f62e3a1dff6 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -3390,6 +3390,11 @@ lpfc_sli4_nvmet_sgl_update(struct lpfc_hba *phba) */ els_xri_cnt = lpfc_sli4_get_els_iocb_cnt(phba); nvmet_xri_cnt = phba->cfg_nvmet_mrq * phba->cfg_nvmet_mrq_post; + + /* Ensure we at least meet the minimun for the system */ + if (nvmet_xri_cnt < LPFC_NVMET_RQE_DEF_COUNT) + nvmet_xri_cnt = LPFC_NVMET_RQE_DEF_COUNT; + tot_cnt = phba->sli4_hba.max_cfg_param.max_xri - els_xri_cnt; if (nvmet_xri_cnt > tot_cnt) { phba->cfg_nvmet_mrq_post = tot_cnt / phba->cfg_nvmet_mrq; @@ -8158,7 +8163,7 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba) /* Create NVMET Receive Queue for header */ qdesc = lpfc_sli4_queue_alloc(phba, phba->sli4_hba.rq_esize, - phba->sli4_hba.rq_ecount); + LPFC_NVMET_RQE_DEF_COUNT); if (!qdesc) { lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "3146 Failed allocate " @@ -8180,7 +8185,7 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba) /* Create NVMET Receive Queue for data */ qdesc = lpfc_sli4_queue_alloc(phba, phba->sli4_hba.rq_esize, - phba->sli4_hba.rq_ecount); + LPFC_NVMET_RQE_DEF_COUNT); if (!qdesc) { lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "3156 Failed allocate " @@ -8770,9 +8775,6 @@ lpfc_sli4_queue_setup(struct lpfc_hba *phba) goto out_destroy; } - lpfc_rq_adjust_repost(phba, phba->sli4_hba.hdr_rq, LPFC_ELS_HBQ); - lpfc_rq_adjust_repost(phba, phba->sli4_hba.dat_rq, LPFC_ELS_HBQ); - rc = lpfc_rq_create(phba, phba->sli4_hba.hdr_rq, phba->sli4_hba.dat_rq, phba->sli4_hba.els_cq, LPFC_USOL); if (rc) { @@ -11096,7 +11098,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) struct lpfc_hba *phba; struct lpfc_vport *vport = NULL; struct Scsi_Host *shost = NULL; - int error, cnt; + int error, cnt, num; uint32_t cfg_mode, intr_mode; /* Allocate memory for HBA structure */ @@ -11131,8 +11133,13 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) } cnt = phba->cfg_iocb_cnt * 1024; - if (phba->nvmet_support) - cnt += phba->cfg_nvmet_mrq_post * phba->cfg_nvmet_mrq; + if (phba->nvmet_support) { + /* Ensure we at least meet the minimun for the system */ + num = (phba->cfg_nvmet_mrq_post * phba->cfg_nvmet_mrq); + if (num < LPFC_NVMET_RQE_DEF_COUNT) + num = LPFC_NVMET_RQE_DEF_COUNT; + cnt += num; + } /* Initialize and populate the iocb list per host */ lpfc_printf_log(phba, KERN_INFO, LOG_INIT, diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index bb12e2c9fbf4..dfa7296499cf 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -614,9 +614,9 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport, lpfc_nvmeio_data(phba, "NVMET FCP CMND: xri x%x op x%x len x%x\n", ctxp->oxid, rsp->op, rsp->rsplen); + ctxp->flag |= LPFC_NVMET_IO_INP; rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, nvmewqeq); if (rc == WQE_SUCCESS) { - ctxp->flag |= LPFC_NVMET_IO_INP; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS if (!phba->ktime_on) return 0; diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h index 837210a3e7c8..55f2a859dc70 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.h +++ b/drivers/scsi/lpfc/lpfc_nvmet.h @@ -22,6 +22,7 @@ ********************************************************************/ #define LPFC_NVMET_DEFAULT_SEGS (64 + 1) /* 256K IOs */ +#define LPFC_NVMET_RQE_DEF_COUNT 512 #define LPFC_NVMET_SUCCESS_LEN 12 /* Used for NVME Target */ diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 333c5094b97d..f344abce4949 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -479,22 +479,23 @@ lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq, if (unlikely(!hq) || unlikely(!dq)) return -ENOMEM; put_index = hq->host_index; - temp_hrqe = hq->qe[hq->host_index].rqe; + temp_hrqe = hq->qe[put_index].rqe; temp_drqe = dq->qe[dq->host_index].rqe; if (hq->type != LPFC_HRQ || dq->type != LPFC_DRQ) return -EINVAL; - if (hq->host_index != dq->host_index) + if (put_index != dq->host_index) return -EINVAL; /* If the host has not yet processed the next entry then we are done */ - if (((hq->host_index + 1) % hq->entry_count) == hq->hba_index) + if (((put_index + 1) % hq->entry_count) == hq->hba_index) return -EBUSY; lpfc_sli_pcimem_bcopy(hrqe, temp_hrqe, hq->entry_size); lpfc_sli_pcimem_bcopy(drqe, temp_drqe, dq->entry_size); /* Update the host index to point to the next slot */ - hq->host_index = ((hq->host_index + 1) % hq->entry_count); + hq->host_index = ((put_index + 1) % hq->entry_count); dq->host_index = ((dq->host_index + 1) % dq->entry_count); + hq->RQ_buf_posted++; /* Ring The Header Receive Queue Doorbell */ if (!(hq->host_index % hq->entry_repost)) { @@ -512,7 +513,6 @@ lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq, } else { return -EINVAL; } - hq->RQ_buf_posted += hq->entry_repost; writel(doorbell.word0, hq->db_regaddr); } return put_index; @@ -6905,14 +6905,9 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) INIT_LIST_HEAD(&rqbp->rqb_buffer_list); rqbp->rqb_alloc_buffer = lpfc_sli4_nvmet_alloc; rqbp->rqb_free_buffer = lpfc_sli4_nvmet_free; - rqbp->entry_count = 256; + rqbp->entry_count = LPFC_NVMET_RQE_DEF_COUNT; rqbp->buffer_count = 0; - /* Divide by 4 and round down to multiple of 16 */ - rc = (phba->cfg_nvmet_mrq_post >> 2) & 0xfff8; - phba->sli4_hba.nvmet_mrq_hdr[i]->entry_repost = rc; - phba->sli4_hba.nvmet_mrq_data[i]->entry_repost = rc; - lpfc_post_rq_buffer( phba, phba->sli4_hba.nvmet_mrq_hdr[i], phba->sli4_hba.nvmet_mrq_data[i], @@ -14892,34 +14887,6 @@ out: return status; } -/** - * lpfc_rq_adjust_repost - Adjust entry_repost for an RQ - * @phba: HBA structure that indicates port to create a queue on. - * @rq: The queue structure to use for the receive queue. - * @qno: The associated HBQ number - * - * - * For SLI4 we need to adjust the RQ repost value based on - * the number of buffers that are initially posted to the RQ. - */ -void -lpfc_rq_adjust_repost(struct lpfc_hba *phba, struct lpfc_queue *rq, int qno) -{ - uint32_t cnt; - - /* sanity check on queue memory */ - if (!rq) - return; - cnt = lpfc_hbq_defs[qno]->entry_count; - - /* Recalc repost for RQs based on buffers initially posted */ - cnt = (cnt >> 3); - if (cnt < LPFC_QUEUE_MIN_REPOST) - cnt = LPFC_QUEUE_MIN_REPOST; - - rq->entry_repost = cnt; -} - /** * lpfc_rq_create - Create a Receive Queue on the HBA * @phba: HBA structure that indicates port to create a queue on. @@ -15105,6 +15072,7 @@ lpfc_rq_create(struct lpfc_hba *phba, struct lpfc_queue *hrq, hrq->subtype = subtype; hrq->host_index = 0; hrq->hba_index = 0; + hrq->entry_repost = LPFC_RQ_REPOST; /* now create the data queue */ lpfc_sli4_config(phba, mbox, LPFC_MBOX_SUBSYSTEM_FCOE, @@ -15186,6 +15154,7 @@ lpfc_rq_create(struct lpfc_hba *phba, struct lpfc_queue *hrq, drq->subtype = subtype; drq->host_index = 0; drq->hba_index = 0; + drq->entry_repost = LPFC_RQ_REPOST; /* link the header and data RQs onto the parent cq child list */ list_add_tail(&hrq->list, &cq->child_list); @@ -15343,6 +15312,7 @@ lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, hrq->subtype = subtype; hrq->host_index = 0; hrq->hba_index = 0; + hrq->entry_repost = LPFC_RQ_REPOST; drq->db_format = LPFC_DB_RING_FORMAT; drq->db_regaddr = phba->sli4_hba.RQDBregaddr; @@ -15351,6 +15321,7 @@ lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, drq->subtype = subtype; drq->host_index = 0; drq->hba_index = 0; + drq->entry_repost = LPFC_RQ_REPOST; list_add_tail(&hrq->list, &cq->child_list); list_add_tail(&drq->list, &cq->child_list); diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 7a8cbeb6a745..422bde85c9f1 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -156,6 +156,7 @@ struct lpfc_queue { uint32_t entry_size; /* Size of each queue entry. */ uint32_t entry_repost; /* Count of entries before doorbell is rung */ #define LPFC_QUEUE_MIN_REPOST 8 +#define LPFC_RQ_REPOST 64 uint32_t queue_id; /* Queue ID assigned by the hardware */ uint32_t assoc_qid; /* Queue ID associated with, for CQ/WQ/MQ */ uint32_t page_count; /* Number of pages allocated for this queue */ @@ -763,7 +764,6 @@ int lpfc_rq_create(struct lpfc_hba *, struct lpfc_queue *, int lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, struct lpfc_queue **drqp, struct lpfc_queue **cqp, uint32_t subtype); -void lpfc_rq_adjust_repost(struct lpfc_hba *, struct lpfc_queue *, int); int lpfc_eq_destroy(struct lpfc_hba *, struct lpfc_queue *); int lpfc_cq_destroy(struct lpfc_hba *, struct lpfc_queue *); int lpfc_mq_destroy(struct lpfc_hba *, struct lpfc_queue *); From 3120046a970aee08a0787fb6792590f1e0047f62 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:42 -0700 Subject: [PATCH 041/254] scsi: lpfc: Fix NVMEI driver not decrementing counter causing bad rport state. During driver boot, a latency in the NVMET driver side causes the incoming NVMEI PRLI to get rejected by the NVMET driver. When this happens, the NVMEI driver runs out of PRLI retries. Bouncing the link does not fix the situation. If the NVMEI driver decides, on PRLI completion failures, to retry the PRLI, always decrement the fc4_prli_sent counter. This allows the PRLI completion to resolve to UNMAPPED when NVMET rejects the PRLI. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 3f9f6d5f8c69..3085895464d9 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -2077,16 +2077,19 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, if (irsp->ulpStatus) { /* Check for retry */ + ndlp->fc4_prli_sent--; if (lpfc_els_retry(phba, cmdiocb, rspiocb)) { /* ELS command is being retried */ - ndlp->fc4_prli_sent--; goto out; } + /* PRLI failed */ lpfc_printf_vlog(vport, KERN_ERR, LOG_ELS, - "2754 PRLI failure DID:%06X Status:x%x/x%x\n", + "2754 PRLI failure DID:%06X Status:x%x/x%x, " + "data: x%x\n", ndlp->nlp_DID, irsp->ulpStatus, - irsp->un.ulpWord[4]); + irsp->un.ulpWord[4], ndlp->fc4_prli_sent); + /* Do not call DSM for lpfc_els_abort'ed ELS cmds */ if (lpfc_error_lost_link(irsp)) goto out; From 7869da183a7cfc8a2189f6eddd3bc558be40d5e3 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:43 -0700 Subject: [PATCH 042/254] scsi: lpfc: Fix NMI watchdog assertions when running nvmet IOPS tests After running IOPS test for 30 second we get kernel:NMI watchdog: Watchdog detected hard LOCKUP on cpu 0 The driver is speend too much time in its ISR. In ISR EQ and CQ processing routines, if we hit the entry_repost numbers of EQE/CQEs just break out of the routine as opposed to hitting the doorbell with NOARM and continue processing. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_debugfs.c | 30 +++++++++++++++--------------- drivers/scsi/lpfc/lpfc_sli.c | 16 ++++++---------- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index a41daedeb967..7284533f4df2 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -3075,11 +3075,11 @@ __lpfc_idiag_print_wq(struct lpfc_queue *qp, char *wqtype, qp->assoc_qid, qp->q_cnt_1, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "\t\tWQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]", + "\t\tWQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]", qp->queue_id, qp->entry_count, qp->entry_size, qp->host_index, - qp->hba_index); + qp->hba_index, qp->entry_repost); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n"); return len; @@ -3126,11 +3126,11 @@ __lpfc_idiag_print_cq(struct lpfc_queue *qp, char *cqtype, qp->assoc_qid, qp->q_cnt_1, qp->q_cnt_2, qp->q_cnt_3, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "\tCQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]", + "\tCQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]", qp->queue_id, qp->entry_count, qp->entry_size, qp->host_index, - qp->hba_index); + qp->hba_index, qp->entry_repost); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n"); @@ -3152,16 +3152,16 @@ __lpfc_idiag_print_rqpair(struct lpfc_queue *qp, struct lpfc_queue *datqp, qp->assoc_qid, qp->q_cnt_1, qp->q_cnt_2, qp->q_cnt_3, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "\t\tHQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]\n", + "\t\tHQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]\n", qp->queue_id, qp->entry_count, qp->entry_size, - qp->host_index, qp->hba_index); + qp->host_index, qp->hba_index, qp->entry_repost); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "\t\tDQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]\n", + "\t\tDQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]\n", datqp->queue_id, datqp->entry_count, datqp->entry_size, datqp->host_index, - datqp->hba_index); + datqp->hba_index, datqp->entry_repost); return len; } @@ -3247,10 +3247,10 @@ __lpfc_idiag_print_eq(struct lpfc_queue *qp, char *eqtype, eqtype, qp->q_cnt_1, qp->q_cnt_2, qp->q_cnt_3, (unsigned long long)qp->q_cnt_4); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, - "EQID[%02d], QE-CNT[%04d], QE-SIZE[%04d], " - "HOST-IDX[%04d], PORT-IDX[%04d]", + "EQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " + "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]", qp->queue_id, qp->entry_count, qp->entry_size, - qp->host_index, qp->hba_index); + qp->host_index, qp->hba_index, qp->entry_repost); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n"); return len; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index f344abce4949..cc45e9191062 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -12961,7 +12961,7 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe, while ((cqe = lpfc_sli4_cq_get(cq))) { workposted |= lpfc_sli4_sp_handle_mcqe(phba, cqe); if (!(++ecount % cq->entry_repost)) - lpfc_sli4_cq_release(cq, LPFC_QUEUE_NOARM); + break; cq->CQ_mbox++; } break; @@ -12975,7 +12975,7 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe, workposted |= lpfc_sli4_sp_handle_cqe(phba, cq, cqe); if (!(++ecount % cq->entry_repost)) - lpfc_sli4_cq_release(cq, LPFC_QUEUE_NOARM); + break; } /* Track the max number of CQEs processed in 1 EQ */ @@ -13227,10 +13227,6 @@ drop: case FC_STATUS_INSUFF_BUF_NEED_BUF: hrq->RQ_no_posted_buf++; /* Post more buffers if possible */ - spin_lock_irqsave(&phba->hbalock, iflags); - phba->hba_flag |= HBA_POST_RECEIVE_BUFFER; - spin_unlock_irqrestore(&phba->hbalock, iflags); - workposted = true; break; } out: @@ -13384,7 +13380,7 @@ process_cq: while ((cqe = lpfc_sli4_cq_get(cq))) { workposted |= lpfc_sli4_fp_handle_cqe(phba, cq, cqe); if (!(++ecount % cq->entry_repost)) - lpfc_sli4_cq_release(cq, LPFC_QUEUE_NOARM); + break; } /* Track the max number of CQEs processed in 1 EQ */ @@ -13475,7 +13471,7 @@ lpfc_sli4_fof_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe) while ((cqe = lpfc_sli4_cq_get(cq))) { workposted |= lpfc_sli4_fp_handle_cqe(phba, cq, cqe); if (!(++ecount % cq->entry_repost)) - lpfc_sli4_cq_release(cq, LPFC_QUEUE_NOARM); + break; } /* Track the max number of CQEs processed in 1 EQ */ @@ -13557,7 +13553,7 @@ lpfc_sli4_fof_intr_handler(int irq, void *dev_id) while ((eqe = lpfc_sli4_eq_get(eq))) { lpfc_sli4_fof_handle_eqe(phba, eqe); if (!(++ecount % eq->entry_repost)) - lpfc_sli4_eq_release(eq, LPFC_QUEUE_NOARM); + break; eq->EQ_processed++; } @@ -13674,7 +13670,7 @@ lpfc_sli4_hba_intr_handler(int irq, void *dev_id) lpfc_sli4_hba_handle_eqe(phba, eqe, hba_eqidx); if (!(++ecount % fpeq->entry_repost)) - lpfc_sli4_eq_release(fpeq, LPFC_QUEUE_NOARM); + break; fpeq->EQ_processed++; } From 3c603be9798758dde794daa622e0f7017dbff3a7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:44 -0700 Subject: [PATCH 043/254] scsi: lpfc: Separate NVMET data buffer pool fir ELS/CT. Using 2048 byte buffer and onle 128 bytes is needed. Create nee LFPC_NVMET_DATA_BUF_SIZE define to use for NVMET RQ/MRQs. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 1 + drivers/scsi/lpfc/lpfc_crtn.h | 1 + drivers/scsi/lpfc/lpfc_hw4.h | 1 + drivers/scsi/lpfc/lpfc_init.c | 7 ++++++- drivers/scsi/lpfc/lpfc_mem.c | 33 ++++++++++++++++++++++++++------- drivers/scsi/lpfc/lpfc_sli.c | 19 +++++++++++++++---- 6 files changed, 50 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 62571fa9c6ad..c4b38491da8e 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -943,6 +943,7 @@ struct lpfc_hba { struct pci_pool *lpfc_mbuf_pool; struct pci_pool *lpfc_hrb_pool; /* header receive buffer pool */ struct pci_pool *lpfc_drb_pool; /* data receive buffer pool */ + struct pci_pool *lpfc_nvmet_drb_pool; /* data receive buffer pool */ struct pci_pool *lpfc_hbq_pool; /* SLI3 hbq buffer pool */ struct pci_pool *txrdy_payload_pool; struct lpfc_dma_pool lpfc_mbuf_safety_pool; diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index 1c55408ac718..fb7fc48a1324 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -271,6 +271,7 @@ int lpfc_sli4_fcf_rr_next_proc(struct lpfc_vport *, uint16_t); void lpfc_sli4_clear_fcf_rr_bmask(struct lpfc_hba *); int lpfc_mem_alloc(struct lpfc_hba *, int align); +int lpfc_nvmet_mem_alloc(struct lpfc_hba *phba); int lpfc_mem_alloc_active_rrq_pool_s4(struct lpfc_hba *); void lpfc_mem_free(struct lpfc_hba *); void lpfc_mem_free_all(struct lpfc_hba *); diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index 1d12f2be36bc..df97c6b7433b 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -1356,6 +1356,7 @@ struct lpfc_mbx_wq_destroy { #define LPFC_HDR_BUF_SIZE 128 #define LPFC_DATA_BUF_SIZE 2048 +#define LPFC_NVMET_DATA_BUF_SIZE 128 struct rq_context { uint32_t word0; #define lpfc_rq_context_rqe_count_SHIFT 16 /* Version 0 Only */ diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 5f62e3a1dff6..26b6a843d32d 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -5956,16 +5956,21 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) for (i = 0; i < lpfc_enable_nvmet_cnt; i++) { if (wwn == lpfc_enable_nvmet[i]) { #if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) + if (lpfc_nvmet_mem_alloc(phba)) + break; + + phba->nvmet_support = 1; /* a match */ + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "6017 NVME Target %016llx\n", wwn); - phba->nvmet_support = 1; /* a match */ #else lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "6021 Can't enable NVME Target." " NVME_TARGET_FC infrastructure" " is not in kernel\n"); #endif + break; } } } diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c index 5986c7957199..91060afc9721 100644 --- a/drivers/scsi/lpfc/lpfc_mem.c +++ b/drivers/scsi/lpfc/lpfc_mem.c @@ -214,6 +214,21 @@ fail_free_drb_pool: return -ENOMEM; } +int +lpfc_nvmet_mem_alloc(struct lpfc_hba *phba) +{ + phba->lpfc_nvmet_drb_pool = + pci_pool_create("lpfc_nvmet_drb_pool", + phba->pcidev, LPFC_NVMET_DATA_BUF_SIZE, + SGL_ALIGN_SZ, 0); + if (!phba->lpfc_nvmet_drb_pool) { + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "6024 Can't enable NVME Target - no memory\n"); + return -ENOMEM; + } + return 0; +} + /** * lpfc_mem_free - Frees memory allocated by lpfc_mem_alloc * @phba: HBA to free memory for @@ -232,6 +247,9 @@ lpfc_mem_free(struct lpfc_hba *phba) /* Free HBQ pools */ lpfc_sli_hbqbuf_free_all(phba); + if (phba->lpfc_nvmet_drb_pool) + pci_pool_destroy(phba->lpfc_nvmet_drb_pool); + phba->lpfc_nvmet_drb_pool = NULL; if (phba->lpfc_drb_pool) pci_pool_destroy(phba->lpfc_drb_pool); phba->lpfc_drb_pool = NULL; @@ -624,20 +642,20 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) kfree(dma_buf); return NULL; } - dma_buf->dbuf.virt = pci_pool_alloc(phba->lpfc_drb_pool, GFP_KERNEL, - &dma_buf->dbuf.phys); + dma_buf->dbuf.virt = pci_pool_alloc(phba->lpfc_nvmet_drb_pool, + GFP_KERNEL, &dma_buf->dbuf.phys); if (!dma_buf->dbuf.virt) { pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, dma_buf->hbuf.phys); kfree(dma_buf); return NULL; } - dma_buf->total_size = LPFC_DATA_BUF_SIZE; + dma_buf->total_size = LPFC_NVMET_DATA_BUF_SIZE; dma_buf->context = kzalloc(sizeof(struct lpfc_nvmet_rcv_ctx), GFP_KERNEL); if (!dma_buf->context) { - pci_pool_free(phba->lpfc_drb_pool, dma_buf->dbuf.virt, + pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, dma_buf->dbuf.phys); pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, dma_buf->hbuf.phys); @@ -648,7 +666,7 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) dma_buf->iocbq = lpfc_sli_get_iocbq(phba); if (!dma_buf->iocbq) { kfree(dma_buf->context); - pci_pool_free(phba->lpfc_drb_pool, dma_buf->dbuf.virt, + pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, dma_buf->dbuf.phys); pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, dma_buf->hbuf.phys); @@ -678,7 +696,7 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) if (!dma_buf->sglq) { lpfc_sli_release_iocbq(phba, dma_buf->iocbq); kfree(dma_buf->context); - pci_pool_free(phba->lpfc_drb_pool, dma_buf->dbuf.virt, + pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, dma_buf->dbuf.phys); pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, dma_buf->hbuf.phys); @@ -718,7 +736,8 @@ lpfc_sli4_nvmet_free(struct lpfc_hba *phba, struct rqb_dmabuf *dmab) lpfc_sli_release_iocbq(phba, dmab->iocbq); kfree(dmab->context); pci_pool_free(phba->lpfc_hrb_pool, dmab->hbuf.virt, dmab->hbuf.phys); - pci_pool_free(phba->lpfc_drb_pool, dmab->dbuf.virt, dmab->dbuf.phys); + pci_pool_free(phba->lpfc_nvmet_drb_pool, + dmab->dbuf.virt, dmab->dbuf.phys); kfree(dmab); } diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index cc45e9191062..49d5c4700054 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -15079,7 +15079,12 @@ lpfc_rq_create(struct lpfc_hba *phba, struct lpfc_queue *hrq, if (phba->sli4_hba.pc_sli4_params.rqv == LPFC_Q_CREATE_VERSION_1) { bf_set(lpfc_rq_context_rqe_count_1, &rq_create->u.request.context, hrq->entry_count); - rq_create->u.request.context.buffer_size = LPFC_DATA_BUF_SIZE; + if (subtype == LPFC_NVMET) + rq_create->u.request.context.buffer_size = + LPFC_NVMET_DATA_BUF_SIZE; + else + rq_create->u.request.context.buffer_size = + LPFC_DATA_BUF_SIZE; bf_set(lpfc_rq_context_rqe_size, &rq_create->u.request.context, LPFC_RQE_SIZE_8); bf_set(lpfc_rq_context_page_size, &rq_create->u.request.context, @@ -15116,8 +15121,14 @@ lpfc_rq_create(struct lpfc_hba *phba, struct lpfc_queue *hrq, LPFC_RQ_RING_SIZE_4096); break; } - bf_set(lpfc_rq_context_buf_size, &rq_create->u.request.context, - LPFC_DATA_BUF_SIZE); + if (subtype == LPFC_NVMET) + bf_set(lpfc_rq_context_buf_size, + &rq_create->u.request.context, + LPFC_NVMET_DATA_BUF_SIZE); + else + bf_set(lpfc_rq_context_buf_size, + &rq_create->u.request.context, + LPFC_DATA_BUF_SIZE); } bf_set(lpfc_rq_context_cq_id, &rq_create->u.request.context, cq->queue_id); @@ -15263,7 +15274,7 @@ lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, cq->queue_id); bf_set(lpfc_rq_context_data_size, &rq_create->u.request.context, - LPFC_DATA_BUF_SIZE); + LPFC_NVMET_DATA_BUF_SIZE); bf_set(lpfc_rq_context_hdr_size, &rq_create->u.request.context, LPFC_HDR_BUF_SIZE); From 6c621a2229b084da0d926967f84b059a10c26ede Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:45 -0700 Subject: [PATCH 044/254] scsi: lpfc: Separate NVMET RQ buffer posting from IO resources SGL/iocbq/context Currently IO resources are mapped 1 to 1 with RQ buffers posted Added logic to separate RQE buffers from IO op resources (sgl/iocbq/context). During initialization, the driver will determine how many SGLs it will allocate for NVMET (based on what the firmware reports) and associate a NVMET IOCBq and NVMET context structure with each one. Now that hdr/data buffers are immediately reposted back to the RQ, 512 RQEs for each MRQ is sufficient. Also, since NVMET data buffers are now 128 bytes, lpfc_nvmet_mrq_post is not necessary anymore as we will always post the max (512) buffers per NVMET MRQ. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 11 +- drivers/scsi/lpfc/lpfc_attr.c | 11 -- drivers/scsi/lpfc/lpfc_crtn.h | 8 +- drivers/scsi/lpfc/lpfc_init.c | 92 ++----------- drivers/scsi/lpfc/lpfc_mem.c | 73 +--------- drivers/scsi/lpfc/lpfc_nvmet.c | 244 +++++++++++++++++++++++++-------- drivers/scsi/lpfc/lpfc_nvmet.h | 1 + drivers/scsi/lpfc/lpfc_sli.c | 78 ++++++++++- drivers/scsi/lpfc/lpfc_sli4.h | 4 +- 9 files changed, 290 insertions(+), 232 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index c4b38491da8e..72641b1d3ab8 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -141,6 +141,13 @@ struct lpfc_dmabuf { uint32_t buffer_tag; /* used for tagged queue ring */ }; +struct lpfc_nvmet_ctxbuf { + struct list_head list; + struct lpfc_nvmet_rcv_ctx *context; + struct lpfc_iocbq *iocbq; + struct lpfc_sglq *sglq; +}; + struct lpfc_dma_pool { struct lpfc_dmabuf *elements; uint32_t max_count; @@ -163,9 +170,6 @@ struct rqb_dmabuf { struct lpfc_dmabuf dbuf; uint16_t total_size; uint16_t bytes_recv; - void *context; - struct lpfc_iocbq *iocbq; - struct lpfc_sglq *sglq; struct lpfc_queue *hrq; /* ptr to associated Header RQ */ struct lpfc_queue *drq; /* ptr to associated Data RQ */ }; @@ -777,7 +781,6 @@ struct lpfc_hba { uint32_t cfg_nvme_oas; uint32_t cfg_nvme_io_channel; uint32_t cfg_nvmet_mrq; - uint32_t cfg_nvmet_mrq_post; uint32_t cfg_enable_nvmet; uint32_t cfg_nvme_enable_fb; uint32_t cfg_nvmet_fb_size; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 129d6cd7635b..65264582915a 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -3315,14 +3315,6 @@ LPFC_ATTR_R(nvmet_mrq, 1, 1, 16, "Specify number of RQ pairs for processing NVMET cmds"); -/* - * lpfc_nvmet_mrq_post: Specify number buffers to post on every MRQ - * - */ -LPFC_ATTR_R(nvmet_mrq_post, LPFC_DEF_MRQ_POST, - LPFC_MIN_MRQ_POST, LPFC_MAX_MRQ_POST, - "Specify number of buffers to post on every MRQ"); - /* * lpfc_enable_fc4_type: Defines what FC4 types are supported. * Supported Values: 1 - register just FCP @@ -5158,7 +5150,6 @@ struct device_attribute *lpfc_hba_attrs[] = { &dev_attr_lpfc_suppress_rsp, &dev_attr_lpfc_nvme_io_channel, &dev_attr_lpfc_nvmet_mrq, - &dev_attr_lpfc_nvmet_mrq_post, &dev_attr_lpfc_nvme_enable_fb, &dev_attr_lpfc_nvmet_fb_size, &dev_attr_lpfc_enable_bg, @@ -6198,7 +6189,6 @@ lpfc_get_cfgparam(struct lpfc_hba *phba) lpfc_enable_fc4_type_init(phba, lpfc_enable_fc4_type); lpfc_nvmet_mrq_init(phba, lpfc_nvmet_mrq); - lpfc_nvmet_mrq_post_init(phba, lpfc_nvmet_mrq_post); /* Initialize first burst. Target vs Initiator are different. */ lpfc_nvme_enable_fb_init(phba, lpfc_nvme_enable_fb); @@ -6295,7 +6285,6 @@ lpfc_nvme_mod_param_dep(struct lpfc_hba *phba) /* Not NVME Target mode. Turn off Target parameters. */ phba->nvmet_support = 0; phba->cfg_nvmet_mrq = 0; - phba->cfg_nvmet_mrq_post = 0; phba->cfg_nvmet_fb_size = 0; } diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index fb7fc48a1324..cc95abd130b4 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -75,6 +75,8 @@ void lpfc_init_vpi_cmpl(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_cancel_all_vport_retry_delay_timer(struct lpfc_hba *); void lpfc_retry_pport_discovery(struct lpfc_hba *); void lpfc_release_rpi(struct lpfc_hba *, struct lpfc_vport *, uint16_t); +int lpfc_init_iocb_list(struct lpfc_hba *phba, int cnt); +void lpfc_free_iocb_list(struct lpfc_hba *phba); void lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_mbx_cmpl_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *); @@ -246,16 +248,14 @@ struct hbq_dmabuf *lpfc_sli4_rb_alloc(struct lpfc_hba *); void lpfc_sli4_rb_free(struct lpfc_hba *, struct hbq_dmabuf *); struct rqb_dmabuf *lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba); void lpfc_sli4_nvmet_free(struct lpfc_hba *phba, struct rqb_dmabuf *dmab); -void lpfc_nvmet_rq_post(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp, - struct lpfc_dmabuf *mp); +void lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, + struct lpfc_nvmet_ctxbuf *ctxp); int lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport, struct fc_frame_header *fc_hdr); void lpfc_sli4_build_dflt_fcf_record(struct lpfc_hba *, struct fcf_record *, uint16_t); int lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq, struct lpfc_rqe *hrqe, struct lpfc_rqe *drqe); -int lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hq, - struct lpfc_queue *dq, int count); int lpfc_free_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hq); void lpfc_unregister_fcf(struct lpfc_hba *); void lpfc_unregister_fcf_rescan(struct lpfc_hba *); diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 26b6a843d32d..86b0b26dfeea 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -1099,7 +1099,7 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba) list_for_each_entry_safe(ctxp, ctxp_next, &nvmet_aborts, list) { ctxp->flag &= ~(LPFC_NVMET_XBUSY | LPFC_NVMET_ABORT_OP); - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); } } @@ -3381,7 +3381,7 @@ lpfc_sli4_nvmet_sgl_update(struct lpfc_hba *phba) { struct lpfc_sglq *sglq_entry = NULL, *sglq_entry_next = NULL; uint16_t i, lxri, xri_cnt, els_xri_cnt; - uint16_t nvmet_xri_cnt, tot_cnt; + uint16_t nvmet_xri_cnt; LIST_HEAD(nvmet_sgl_list); int rc; @@ -3389,20 +3389,9 @@ lpfc_sli4_nvmet_sgl_update(struct lpfc_hba *phba) * update on pci function's nvmet xri-sgl list */ els_xri_cnt = lpfc_sli4_get_els_iocb_cnt(phba); - nvmet_xri_cnt = phba->cfg_nvmet_mrq * phba->cfg_nvmet_mrq_post; - /* Ensure we at least meet the minimun for the system */ - if (nvmet_xri_cnt < LPFC_NVMET_RQE_DEF_COUNT) - nvmet_xri_cnt = LPFC_NVMET_RQE_DEF_COUNT; - - tot_cnt = phba->sli4_hba.max_cfg_param.max_xri - els_xri_cnt; - if (nvmet_xri_cnt > tot_cnt) { - phba->cfg_nvmet_mrq_post = tot_cnt / phba->cfg_nvmet_mrq; - nvmet_xri_cnt = phba->cfg_nvmet_mrq * phba->cfg_nvmet_mrq_post; - lpfc_printf_log(phba, KERN_INFO, LOG_SLI, - "6301 NVMET post-sgl count changed to %d\n", - phba->cfg_nvmet_mrq_post); - } + /* For NVMET, ALL remaining XRIs are dedicated for IO processing */ + nvmet_xri_cnt = phba->sli4_hba.max_cfg_param.max_xri - els_xri_cnt; if (nvmet_xri_cnt > phba->sli4_hba.nvmet_xri_cnt) { /* els xri-sgl expanded */ @@ -5835,6 +5824,8 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) spin_lock_init(&phba->sli4_hba.abts_nvme_buf_list_lock); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvme_buf_list); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list); + INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_ctx_list); + /* Fast-path XRI aborted CQ Event work queue list */ INIT_LIST_HEAD(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue); } @@ -6279,7 +6270,7 @@ lpfc_unset_driver_resource_phase2(struct lpfc_hba *phba) * * This routine is invoked to free the driver's IOCB list and memory. **/ -static void +void lpfc_free_iocb_list(struct lpfc_hba *phba) { struct lpfc_iocbq *iocbq_entry = NULL, *iocbq_next = NULL; @@ -6307,7 +6298,7 @@ lpfc_free_iocb_list(struct lpfc_hba *phba) * 0 - successful * other values - error **/ -static int +int lpfc_init_iocb_list(struct lpfc_hba *phba, int iocb_count) { struct lpfc_iocbq *iocbq_entry = NULL; @@ -8321,46 +8312,6 @@ lpfc_sli4_queue_destroy(struct lpfc_hba *phba) INIT_LIST_HEAD(&phba->sli4_hba.lpfc_wq_list); } -int -lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, - struct lpfc_queue *drq, int count) -{ - int rc, i; - struct lpfc_rqe hrqe; - struct lpfc_rqe drqe; - struct lpfc_rqb *rqbp; - struct rqb_dmabuf *rqb_buffer; - LIST_HEAD(rqb_buf_list); - - rqbp = hrq->rqbp; - for (i = 0; i < count; i++) { - rqb_buffer = (rqbp->rqb_alloc_buffer)(phba); - if (!rqb_buffer) - break; - rqb_buffer->hrq = hrq; - rqb_buffer->drq = drq; - list_add_tail(&rqb_buffer->hbuf.list, &rqb_buf_list); - } - while (!list_empty(&rqb_buf_list)) { - list_remove_head(&rqb_buf_list, rqb_buffer, struct rqb_dmabuf, - hbuf.list); - - hrqe.address_lo = putPaddrLow(rqb_buffer->hbuf.phys); - hrqe.address_hi = putPaddrHigh(rqb_buffer->hbuf.phys); - drqe.address_lo = putPaddrLow(rqb_buffer->dbuf.phys); - drqe.address_hi = putPaddrHigh(rqb_buffer->dbuf.phys); - rc = lpfc_sli4_rq_put(hrq, drq, &hrqe, &drqe); - if (rc < 0) { - (rqbp->rqb_free_buffer)(phba, rqb_buffer); - } else { - list_add_tail(&rqb_buffer->hbuf.list, - &rqbp->rqb_buffer_list); - rqbp->buffer_count++; - } - } - return 1; -} - int lpfc_free_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *rq) { @@ -11103,7 +11054,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) struct lpfc_hba *phba; struct lpfc_vport *vport = NULL; struct Scsi_Host *shost = NULL; - int error, cnt, num; + int error; uint32_t cfg_mode, intr_mode; /* Allocate memory for HBA structure */ @@ -11137,27 +11088,6 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) goto out_unset_pci_mem_s4; } - cnt = phba->cfg_iocb_cnt * 1024; - if (phba->nvmet_support) { - /* Ensure we at least meet the minimun for the system */ - num = (phba->cfg_nvmet_mrq_post * phba->cfg_nvmet_mrq); - if (num < LPFC_NVMET_RQE_DEF_COUNT) - num = LPFC_NVMET_RQE_DEF_COUNT; - cnt += num; - } - - /* Initialize and populate the iocb list per host */ - lpfc_printf_log(phba, KERN_INFO, LOG_INIT, - "2821 initialize iocb list %d total %d\n", - phba->cfg_iocb_cnt, cnt); - error = lpfc_init_iocb_list(phba, cnt); - - if (error) { - lpfc_printf_log(phba, KERN_ERR, LOG_INIT, - "1413 Failed to initialize iocb list.\n"); - goto out_unset_driver_resource_s4; - } - INIT_LIST_HEAD(&phba->active_rrq_list); INIT_LIST_HEAD(&phba->fcf.fcf_pri_list); @@ -11166,7 +11096,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) if (error) { lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "1414 Failed to set up driver resource.\n"); - goto out_free_iocb_list; + goto out_unset_driver_resource_s4; } /* Get the default values for Model Name and Description */ @@ -11266,8 +11196,6 @@ out_destroy_shost: lpfc_destroy_shost(phba); out_unset_driver_resource: lpfc_unset_driver_resource_phase2(phba); -out_free_iocb_list: - lpfc_free_iocb_list(phba); out_unset_driver_resource_s4: lpfc_sli4_driver_resource_unset(phba); out_unset_pci_mem_s4: diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c index 91060afc9721..fcc05a1517c2 100644 --- a/drivers/scsi/lpfc/lpfc_mem.c +++ b/drivers/scsi/lpfc/lpfc_mem.c @@ -629,8 +629,6 @@ struct rqb_dmabuf * lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) { struct rqb_dmabuf *dma_buf; - struct lpfc_iocbq *nvmewqe; - union lpfc_wqe128 *wqe; dma_buf = kzalloc(sizeof(struct rqb_dmabuf), GFP_KERNEL); if (!dma_buf) @@ -651,60 +649,6 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) return NULL; } dma_buf->total_size = LPFC_NVMET_DATA_BUF_SIZE; - - dma_buf->context = kzalloc(sizeof(struct lpfc_nvmet_rcv_ctx), - GFP_KERNEL); - if (!dma_buf->context) { - pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, - dma_buf->dbuf.phys); - pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, - dma_buf->hbuf.phys); - kfree(dma_buf); - return NULL; - } - - dma_buf->iocbq = lpfc_sli_get_iocbq(phba); - if (!dma_buf->iocbq) { - kfree(dma_buf->context); - pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, - dma_buf->dbuf.phys); - pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, - dma_buf->hbuf.phys); - kfree(dma_buf); - lpfc_printf_log(phba, KERN_ERR, LOG_NVME, - "2621 Ran out of nvmet iocb/WQEs\n"); - return NULL; - } - dma_buf->iocbq->iocb_flag = LPFC_IO_NVMET; - nvmewqe = dma_buf->iocbq; - wqe = (union lpfc_wqe128 *)&nvmewqe->wqe; - /* Initialize WQE */ - memset(wqe, 0, sizeof(union lpfc_wqe)); - /* Word 7 */ - bf_set(wqe_ct, &wqe->generic.wqe_com, SLI4_CT_RPI); - bf_set(wqe_class, &wqe->generic.wqe_com, CLASS3); - bf_set(wqe_pu, &wqe->generic.wqe_com, 1); - /* Word 10 */ - bf_set(wqe_nvme, &wqe->fcp_tsend.wqe_com, 1); - bf_set(wqe_ebde_cnt, &wqe->generic.wqe_com, 0); - bf_set(wqe_qosd, &wqe->generic.wqe_com, 0); - - dma_buf->iocbq->context1 = NULL; - spin_lock(&phba->sli4_hba.sgl_list_lock); - dma_buf->sglq = __lpfc_sli_get_nvmet_sglq(phba, dma_buf->iocbq); - spin_unlock(&phba->sli4_hba.sgl_list_lock); - if (!dma_buf->sglq) { - lpfc_sli_release_iocbq(phba, dma_buf->iocbq); - kfree(dma_buf->context); - pci_pool_free(phba->lpfc_nvmet_drb_pool, dma_buf->dbuf.virt, - dma_buf->dbuf.phys); - pci_pool_free(phba->lpfc_hrb_pool, dma_buf->hbuf.virt, - dma_buf->hbuf.phys); - kfree(dma_buf); - lpfc_printf_log(phba, KERN_ERR, LOG_NVME, - "6132 Ran out of nvmet XRIs\n"); - return NULL; - } return dma_buf; } @@ -723,18 +667,6 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) void lpfc_sli4_nvmet_free(struct lpfc_hba *phba, struct rqb_dmabuf *dmab) { - unsigned long flags; - - __lpfc_clear_active_sglq(phba, dmab->sglq->sli4_lxritag); - dmab->sglq->state = SGL_FREED; - dmab->sglq->ndlp = NULL; - - spin_lock_irqsave(&phba->sli4_hba.sgl_list_lock, flags); - list_add_tail(&dmab->sglq->list, &phba->sli4_hba.lpfc_nvmet_sgl_list); - spin_unlock_irqrestore(&phba->sli4_hba.sgl_list_lock, flags); - - lpfc_sli_release_iocbq(phba, dmab->iocbq); - kfree(dmab->context); pci_pool_free(phba->lpfc_hrb_pool, dmab->hbuf.virt, dmab->hbuf.phys); pci_pool_free(phba->lpfc_nvmet_drb_pool, dmab->dbuf.virt, dmab->dbuf.phys); @@ -822,6 +754,11 @@ lpfc_rq_buf_free(struct lpfc_hba *phba, struct lpfc_dmabuf *mp) rc = lpfc_sli4_rq_put(rqb_entry->hrq, rqb_entry->drq, &hrqe, &drqe); if (rc < 0) { (rqbp->rqb_free_buffer)(phba, rqb_entry); + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "6409 Cannot post to RQ %d: %x %x\n", + rqb_entry->hrq->queue_id, + rqb_entry->hrq->host_index, + rqb_entry->hrq->hba_index); } else { list_add_tail(&rqb_entry->hbuf.list, &rqbp->rqb_buffer_list); rqbp->buffer_count++; diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index dfa7296499cf..fcc77ae0c71c 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -142,7 +142,7 @@ out: } /** - * lpfc_nvmet_rq_post - Repost a NVMET RQ DMA buffer and clean up context + * lpfc_nvmet_ctxbuf_post - Repost a NVMET RQ DMA buffer and clean up context * @phba: HBA buffer is associated with * @ctxp: context to clean up * @mp: Buffer to free @@ -155,24 +155,24 @@ out: * Returns: None **/ void -lpfc_nvmet_rq_post(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp, - struct lpfc_dmabuf *mp) +lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) { - if (ctxp) { - if (ctxp->flag) - lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, - "6314 rq_post ctx xri x%x flag x%x\n", - ctxp->oxid, ctxp->flag); + struct lpfc_nvmet_rcv_ctx *ctxp = ctx_buf->context; + unsigned long iflag; - if (ctxp->txrdy) { - pci_pool_free(phba->txrdy_payload_pool, ctxp->txrdy, - ctxp->txrdy_phys); - ctxp->txrdy = NULL; - ctxp->txrdy_phys = 0; - } - ctxp->state = LPFC_NVMET_STE_FREE; + if (ctxp->txrdy) { + pci_pool_free(phba->txrdy_payload_pool, ctxp->txrdy, + ctxp->txrdy_phys); + ctxp->txrdy = NULL; + ctxp->txrdy_phys = 0; } - lpfc_rq_buf_free(phba, mp); + ctxp->state = LPFC_NVMET_STE_FREE; + + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_lock, iflag); + list_add_tail(&ctx_buf->list, + &phba->sli4_hba.lpfc_nvmet_ctx_list); + phba->sli4_hba.nvmet_ctx_cnt++; + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_lock, iflag); } #ifdef CONFIG_SCSI_LPFC_DEBUG_FS @@ -718,7 +718,7 @@ lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport, if (aborting) return; - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); } static struct nvmet_fc_target_template lpfc_tgttemplate = { @@ -739,17 +739,128 @@ static struct nvmet_fc_target_template lpfc_tgttemplate = { .target_priv_sz = sizeof(struct lpfc_nvmet_tgtport), }; +void +lpfc_nvmet_cleanup_io_context(struct lpfc_hba *phba) +{ + struct lpfc_nvmet_ctxbuf *ctx_buf, *next_ctx_buf; + unsigned long flags; + + list_for_each_entry_safe( + ctx_buf, next_ctx_buf, + &phba->sli4_hba.lpfc_nvmet_ctx_list, list) { + spin_lock_irqsave( + &phba->sli4_hba.abts_nvme_buf_list_lock, flags); + list_del_init(&ctx_buf->list); + spin_unlock_irqrestore( + &phba->sli4_hba.abts_nvme_buf_list_lock, flags); + __lpfc_clear_active_sglq(phba, + ctx_buf->sglq->sli4_lxritag); + ctx_buf->sglq->state = SGL_FREED; + ctx_buf->sglq->ndlp = NULL; + + spin_lock_irqsave(&phba->sli4_hba.sgl_list_lock, flags); + list_add_tail(&ctx_buf->sglq->list, + &phba->sli4_hba.lpfc_nvmet_sgl_list); + spin_unlock_irqrestore(&phba->sli4_hba.sgl_list_lock, + flags); + + lpfc_sli_release_iocbq(phba, ctx_buf->iocbq); + kfree(ctx_buf->context); + } +} + +int +lpfc_nvmet_setup_io_context(struct lpfc_hba *phba) +{ + struct lpfc_nvmet_ctxbuf *ctx_buf; + struct lpfc_iocbq *nvmewqe; + union lpfc_wqe128 *wqe; + int i; + + lpfc_printf_log(phba, KERN_INFO, LOG_NVME, + "6403 Allocate NVMET resources for %d XRIs\n", + phba->sli4_hba.nvmet_xri_cnt); + + /* For all nvmet xris, allocate resources needed to process a + * received command on a per xri basis. + */ + for (i = 0; i < phba->sli4_hba.nvmet_xri_cnt; i++) { + ctx_buf = kzalloc(sizeof(*ctx_buf), GFP_KERNEL); + if (!ctx_buf) { + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6404 Ran out of memory for NVMET\n"); + return -ENOMEM; + } + + ctx_buf->context = kzalloc(sizeof(*ctx_buf->context), + GFP_KERNEL); + if (!ctx_buf->context) { + kfree(ctx_buf); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6405 Ran out of NVMET " + "context memory\n"); + return -ENOMEM; + } + ctx_buf->context->ctxbuf = ctx_buf; + + ctx_buf->iocbq = lpfc_sli_get_iocbq(phba); + if (!ctx_buf->iocbq) { + kfree(ctx_buf->context); + kfree(ctx_buf); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6406 Ran out of NVMET iocb/WQEs\n"); + return -ENOMEM; + } + ctx_buf->iocbq->iocb_flag = LPFC_IO_NVMET; + nvmewqe = ctx_buf->iocbq; + wqe = (union lpfc_wqe128 *)&nvmewqe->wqe; + /* Initialize WQE */ + memset(wqe, 0, sizeof(union lpfc_wqe)); + /* Word 7 */ + bf_set(wqe_ct, &wqe->generic.wqe_com, SLI4_CT_RPI); + bf_set(wqe_class, &wqe->generic.wqe_com, CLASS3); + bf_set(wqe_pu, &wqe->generic.wqe_com, 1); + /* Word 10 */ + bf_set(wqe_nvme, &wqe->fcp_tsend.wqe_com, 1); + bf_set(wqe_ebde_cnt, &wqe->generic.wqe_com, 0); + bf_set(wqe_qosd, &wqe->generic.wqe_com, 0); + + ctx_buf->iocbq->context1 = NULL; + spin_lock(&phba->sli4_hba.sgl_list_lock); + ctx_buf->sglq = __lpfc_sli_get_nvmet_sglq(phba, ctx_buf->iocbq); + spin_unlock(&phba->sli4_hba.sgl_list_lock); + if (!ctx_buf->sglq) { + lpfc_sli_release_iocbq(phba, ctx_buf->iocbq); + kfree(ctx_buf->context); + kfree(ctx_buf); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6407 Ran out of NVMET XRIs\n"); + return -ENOMEM; + } + spin_lock(&phba->sli4_hba.nvmet_io_lock); + list_add_tail(&ctx_buf->list, + &phba->sli4_hba.lpfc_nvmet_ctx_list); + spin_unlock(&phba->sli4_hba.nvmet_io_lock); + } + phba->sli4_hba.nvmet_ctx_cnt = phba->sli4_hba.nvmet_xri_cnt; + return 0; +} + int lpfc_nvmet_create_targetport(struct lpfc_hba *phba) { struct lpfc_vport *vport = phba->pport; struct lpfc_nvmet_tgtport *tgtp; struct nvmet_fc_port_info pinfo; - int error = 0; + int error; if (phba->targetport) return 0; + error = lpfc_nvmet_setup_io_context(phba); + if (error) + return error; + memset(&pinfo, 0, sizeof(struct nvmet_fc_port_info)); pinfo.node_name = wwn_to_u64(vport->fc_nodename.u.wwn); pinfo.port_name = wwn_to_u64(vport->fc_portname.u.wwn); @@ -778,13 +889,16 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) &phba->pcidev->dev, &phba->targetport); #else - error = -ENOMEM; + error = -ENOENT; #endif if (error) { lpfc_printf_log(phba, KERN_ERR, LOG_NVME_DISC, "6025 Cannot register NVME targetport " "x%x\n", error); phba->targetport = NULL; + + lpfc_nvmet_cleanup_io_context(phba); + } else { tgtp = (struct lpfc_nvmet_tgtport *) phba->targetport->private; @@ -874,7 +988,7 @@ lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, list_for_each_entry_safe(ctxp, next_ctxp, &phba->sli4_hba.lpfc_abts_nvmet_ctx_list, list) { - if (ctxp->rqb_buffer->sglq->sli4_xritag != xri) + if (ctxp->ctxbuf->sglq->sli4_xritag != xri) continue; /* Check if we already received a free context call @@ -895,7 +1009,7 @@ lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, (ndlp->nlp_state == NLP_STE_UNMAPPED_NODE || ndlp->nlp_state == NLP_STE_MAPPED_NODE)) { lpfc_set_rrq_active(phba, ndlp, - ctxp->rqb_buffer->sglq->sli4_lxritag, + ctxp->ctxbuf->sglq->sli4_lxritag, rxid, 1); lpfc_sli4_abts_err_handler(phba, ndlp, axri); } @@ -904,8 +1018,8 @@ lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, "6318 XB aborted %x flg x%x (%x)\n", ctxp->oxid, ctxp->flag, released); if (released) - lpfc_nvmet_rq_post(phba, ctxp, - &ctxp->rqb_buffer->hbuf); + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); + if (rrq_empty) lpfc_worker_wake_up(phba); return; @@ -933,7 +1047,7 @@ lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport, list_for_each_entry_safe(ctxp, next_ctxp, &phba->sli4_hba.lpfc_abts_nvmet_ctx_list, list) { - if (ctxp->rqb_buffer->sglq->sli4_xritag != xri) + if (ctxp->ctxbuf->sglq->sli4_xritag != xri) continue; spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock); @@ -985,6 +1099,7 @@ lpfc_nvmet_destroy_targetport(struct lpfc_hba *phba) init_completion(&tgtp->tport_unreg_done); nvmet_fc_unregister_targetport(phba->targetport); wait_for_completion_timeout(&tgtp->tport_unreg_done, 5); + lpfc_nvmet_cleanup_io_context(phba); } phba->targetport = NULL; #endif @@ -1115,15 +1230,18 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp; struct lpfc_nvmet_tgtport *tgtp; struct fc_frame_header *fc_hdr; + struct lpfc_nvmet_ctxbuf *ctx_buf; uint32_t *payload; uint32_t size, oxid, sid, rc; + unsigned long iflag; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS uint32_t id; #endif + ctx_buf = NULL; if (!nvmebuf || !phba->targetport) { lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - "6157 FCP Drop IO\n"); + "6157 NVMET FCP Drop IO\n"); oxid = 0; size = 0; sid = 0; @@ -1131,6 +1249,23 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, goto dropit; } + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_lock, iflag); + if (phba->sli4_hba.nvmet_ctx_cnt) { + list_remove_head(&phba->sli4_hba.lpfc_nvmet_ctx_list, + ctx_buf, struct lpfc_nvmet_ctxbuf, list); + phba->sli4_hba.nvmet_ctx_cnt--; + } + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_lock, iflag); + + if (!ctx_buf) { + lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, + "6408 No NVMET ctx Drop IO\n"); + oxid = 0; + size = 0; + sid = 0; + ctxp = NULL; + goto dropit; + } tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; payload = (uint32_t *)(nvmebuf->dbuf.virt); @@ -1139,16 +1274,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, oxid = be16_to_cpu(fc_hdr->fh_ox_id); sid = sli4_sid_from_fc_hdr(fc_hdr); - ctxp = (struct lpfc_nvmet_rcv_ctx *)nvmebuf->context; - if (ctxp == NULL) { - atomic_inc(&tgtp->rcv_fcp_cmd_drop); - lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - "6158 FCP Drop IO x%x: Alloc\n", - oxid); - lpfc_nvmet_rq_post(phba, NULL, &nvmebuf->hbuf); - /* Cannot send ABTS without context */ - return; - } + ctxp = (struct lpfc_nvmet_rcv_ctx *)ctx_buf->context; memset(ctxp, 0, sizeof(ctxp->ctx)); ctxp->wqeq = NULL; ctxp->txrdy = NULL; @@ -1158,9 +1284,9 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, ctxp->oxid = oxid; ctxp->sid = sid; ctxp->state = LPFC_NVMET_STE_RCV; - ctxp->rqb_buffer = nvmebuf; ctxp->entry_cnt = 1; ctxp->flag = 0; + ctxp->ctxbuf = ctx_buf; spin_lock_init(&ctxp->ctxlock); #ifdef CONFIG_SCSI_LPFC_DEBUG_FS @@ -1192,6 +1318,9 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, * The calling sequence should be: * nvmet_fc_rcv_fcp_req -> lpfc_nvmet_xmt_fcp_op/cmp -> req->done * lpfc_nvmet_xmt_fcp_op_cmp should free the allocated ctxp. + * When we return from nvmet_fc_rcv_fcp_req, all relevant info in + * the NVME command / FC header is stored, so we are free to repost + * the buffer. */ rc = nvmet_fc_rcv_fcp_req(phba->targetport, &ctxp->ctx.fcp_req, payload, size); @@ -1199,6 +1328,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, /* Process FCP command */ if (rc == 0) { atomic_inc(&tgtp->rcv_fcp_cmd_out); + lpfc_rq_buf_free(phba, &nvmebuf->hbuf); /* repost */ return; } @@ -1213,15 +1343,17 @@ dropit: lpfc_nvmeio_data(phba, "NVMET FCP DROP: xri x%x sz %d from %06x\n", oxid, size, sid); if (oxid) { + lpfc_nvmet_defer_release(phba, ctxp); lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, sid, oxid); + lpfc_rq_buf_free(phba, &nvmebuf->hbuf); /* repost */ return; } - if (nvmebuf) { - nvmebuf->iocbq->hba_wqidx = 0; - /* We assume a rcv'ed cmd ALWAYs fits into 1 buffer */ - lpfc_nvmet_rq_post(phba, ctxp, &nvmebuf->hbuf); - } + if (ctx_buf) + lpfc_nvmet_ctxbuf_post(phba, ctx_buf); + + if (nvmebuf) + lpfc_rq_buf_free(phba, &nvmebuf->hbuf); /* repost */ #endif } @@ -1273,7 +1405,7 @@ lpfc_nvmet_unsol_fcp_event(struct lpfc_hba *phba, uint64_t isr_timestamp) { if (phba->nvmet_support == 0) { - lpfc_nvmet_rq_post(phba, NULL, &nvmebuf->hbuf); + lpfc_rq_buf_free(phba, &nvmebuf->hbuf); return; } lpfc_nvmet_unsol_fcp_buffer(phba, pring, nvmebuf, @@ -1474,7 +1606,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba, nvmewqe = ctxp->wqeq; if (nvmewqe == NULL) { /* Allocate buffer for command wqe */ - nvmewqe = ctxp->rqb_buffer->iocbq; + nvmewqe = ctxp->ctxbuf->iocbq; if (nvmewqe == NULL) { lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, "6110 lpfc_nvmet_prep_fcp_wqe: No " @@ -1501,7 +1633,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba, return NULL; } - sgl = (struct sli4_sge *)ctxp->rqb_buffer->sglq->sgl; + sgl = (struct sli4_sge *)ctxp->ctxbuf->sglq->sgl; switch (rsp->op) { case NVMET_FCOP_READDATA: case NVMET_FCOP_READDATA_RSP: @@ -1851,15 +1983,16 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, wcqe->word0, wcqe->total_data_placed, result, wcqe->word3); + cmdwqe->context2 = NULL; + cmdwqe->context3 = NULL; /* * if transport has released ctx, then can reuse it. Otherwise, * will be recycled by transport release call. */ if (released) - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); - cmdwqe->context2 = NULL; - cmdwqe->context3 = NULL; + /* This is the iocbq for the abort, not the command */ lpfc_sli_release_iocbq(phba, cmdwqe); /* Since iaab/iaar are NOT set, there is no work left. @@ -1932,15 +2065,15 @@ lpfc_nvmet_unsol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, ctxp->oxid, ctxp->flag, released, wcqe->word0, wcqe->total_data_placed, result, wcqe->word3); + + cmdwqe->context2 = NULL; + cmdwqe->context3 = NULL; /* * if transport has released ctx, then can reuse it. Otherwise, * will be recycled by transport release call. */ if (released) - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); - - cmdwqe->context2 = NULL; - cmdwqe->context3 = NULL; + lpfc_nvmet_ctxbuf_post(phba, ctxp->ctxbuf); /* Since iaab/iaar are NOT set, there is no work left. * For LPFC_NVMET_XBUSY, lpfc_sli4_nvmet_xri_aborted @@ -2002,10 +2135,6 @@ lpfc_nvmet_unsol_issue_abort(struct lpfc_hba *phba, sid, xri, ctxp->wqeq->sli4_xritag); tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; - if (!ctxp->wqeq) { - ctxp->wqeq = ctxp->rqb_buffer->iocbq; - ctxp->wqeq->hba_wqidx = 0; - } ndlp = lpfc_findnode_did(phba->pport, sid); if (!ndlp || !NLP_CHK_NODE_ACT(ndlp) || @@ -2101,7 +2230,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; if (!ctxp->wqeq) { - ctxp->wqeq = ctxp->rqb_buffer->iocbq; + ctxp->wqeq = ctxp->ctxbuf->iocbq; ctxp->wqeq->hba_wqidx = 0; } @@ -2239,7 +2368,7 @@ lpfc_nvmet_unsol_fcp_issue_abort(struct lpfc_hba *phba, tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; if (!ctxp->wqeq) { - ctxp->wqeq = ctxp->rqb_buffer->iocbq; + ctxp->wqeq = ctxp->ctxbuf->iocbq; ctxp->wqeq->hba_wqidx = 0; } @@ -2294,6 +2423,7 @@ lpfc_nvmet_unsol_ls_issue_abort(struct lpfc_hba *phba, } abts_wqeq = ctxp->wqeq; wqe_abts = &abts_wqeq->wqe; + lpfc_nvmet_unsol_issue_abort(phba, ctxp, sid, xri); spin_lock_irqsave(&phba->hbalock, flags); diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h index 55f2a859dc70..6eb2f5d8d4ed 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.h +++ b/drivers/scsi/lpfc/lpfc_nvmet.h @@ -106,6 +106,7 @@ struct lpfc_nvmet_rcv_ctx { #define LPFC_NVMET_CTX_RLS 0x8 /* ctx free requested */ #define LPFC_NVMET_ABTS_RCV 0x10 /* ABTS received on exchange */ struct rqb_dmabuf *rqb_buffer; + struct lpfc_nvmet_ctxbuf *ctxbuf; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS uint64_t ts_isr_cmd; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 49d5c4700054..d68ee3ee299a 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -6513,6 +6513,49 @@ lpfc_set_host_data(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox) (phba->hba_flag & HBA_FCOE_MODE) ? "FCoE" : "FC"); } +static int +lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, + struct lpfc_queue *drq, int count) +{ + int rc, i; + struct lpfc_rqe hrqe; + struct lpfc_rqe drqe; + struct lpfc_rqb *rqbp; + struct rqb_dmabuf *rqb_buffer; + LIST_HEAD(rqb_buf_list); + + rqbp = hrq->rqbp; + for (i = 0; i < count; i++) { + /* IF RQ is already full, don't bother */ + if (rqbp->buffer_count + i >= rqbp->entry_count - 1) + break; + rqb_buffer = rqbp->rqb_alloc_buffer(phba); + if (!rqb_buffer) + break; + rqb_buffer->hrq = hrq; + rqb_buffer->drq = drq; + list_add_tail(&rqb_buffer->hbuf.list, &rqb_buf_list); + } + while (!list_empty(&rqb_buf_list)) { + list_remove_head(&rqb_buf_list, rqb_buffer, struct rqb_dmabuf, + hbuf.list); + + hrqe.address_lo = putPaddrLow(rqb_buffer->hbuf.phys); + hrqe.address_hi = putPaddrHigh(rqb_buffer->hbuf.phys); + drqe.address_lo = putPaddrLow(rqb_buffer->dbuf.phys); + drqe.address_hi = putPaddrHigh(rqb_buffer->dbuf.phys); + rc = lpfc_sli4_rq_put(hrq, drq, &hrqe, &drqe); + if (rc < 0) { + rqbp->rqb_free_buffer(phba, rqb_buffer); + } else { + list_add_tail(&rqb_buffer->hbuf.list, + &rqbp->rqb_buffer_list); + rqbp->buffer_count++; + } + } + return 1; +} + /** * lpfc_sli4_hba_setup - SLI4 device initialization PCI function * @phba: Pointer to HBA context object. @@ -6525,7 +6568,7 @@ lpfc_set_host_data(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox) int lpfc_sli4_hba_setup(struct lpfc_hba *phba) { - int rc, i; + int rc, i, cnt; LPFC_MBOXQ_t *mboxq; struct lpfc_mqe *mqe; uint8_t *vpd; @@ -6876,6 +6919,21 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) goto out_destroy_queue; } phba->sli4_hba.nvmet_xri_cnt = rc; + + cnt = phba->cfg_iocb_cnt * 1024; + /* We need 1 iocbq for every SGL, for IO processing */ + cnt += phba->sli4_hba.nvmet_xri_cnt; + /* Initialize and populate the iocb list per host */ + lpfc_printf_log(phba, KERN_INFO, LOG_INIT, + "2821 initialize iocb list %d total %d\n", + phba->cfg_iocb_cnt, cnt); + rc = lpfc_init_iocb_list(phba, cnt); + if (rc) { + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "1413 Failed to init iocb list.\n"); + goto out_destroy_queue; + } + lpfc_nvmet_create_targetport(phba); } else { /* update host scsi xri-sgl sizes and mappings */ @@ -6895,10 +6953,21 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) "and mapping: %d\n", rc); goto out_destroy_queue; } + + cnt = phba->cfg_iocb_cnt * 1024; + /* Initialize and populate the iocb list per host */ + lpfc_printf_log(phba, KERN_INFO, LOG_INIT, + "2820 initialize iocb list %d total %d\n", + phba->cfg_iocb_cnt, cnt); + rc = lpfc_init_iocb_list(phba, cnt); + if (rc) { + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "6301 Failed to init iocb list.\n"); + goto out_destroy_queue; + } } if (phba->nvmet_support && phba->cfg_nvmet_mrq) { - /* Post initial buffers to all RQs created */ for (i = 0; i < phba->cfg_nvmet_mrq; i++) { rqbp = phba->sli4_hba.nvmet_mrq_hdr[i]->rqbp; @@ -6911,7 +6980,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) lpfc_post_rq_buffer( phba, phba->sli4_hba.nvmet_mrq_hdr[i], phba->sli4_hba.nvmet_mrq_data[i], - phba->cfg_nvmet_mrq_post); + LPFC_NVMET_RQE_DEF_COUNT); } } @@ -7078,6 +7147,7 @@ out_unset_queue: /* Unset all the queues set up in this routine when error out */ lpfc_sli4_queue_unset(phba); out_destroy_queue: + lpfc_free_iocb_list(phba); lpfc_sli4_queue_destroy(phba); out_stop_timers: lpfc_stop_hba_timers(phba); @@ -18731,7 +18801,7 @@ lpfc_sli4_issue_wqe(struct lpfc_hba *phba, uint32_t ring_number, spin_lock_irqsave(&pring->ring_lock, iflags); ctxp = pwqe->context2; - sglq = ctxp->rqb_buffer->sglq; + sglq = ctxp->ctxbuf->sglq; if (pwqe->sli4_xritag == NO_XRI) { pwqe->sli4_lxritag = sglq->sli4_lxritag; pwqe->sli4_xritag = sglq->sli4_xritag; diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 422bde85c9f1..19e2f190ea2e 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -618,10 +618,12 @@ struct lpfc_sli4_hba { uint16_t scsi_xri_start; uint16_t els_xri_cnt; uint16_t nvmet_xri_cnt; + uint16_t nvmet_ctx_cnt; struct list_head lpfc_els_sgl_list; struct list_head lpfc_abts_els_sgl_list; struct list_head lpfc_nvmet_sgl_list; struct list_head lpfc_abts_nvmet_ctx_list; + struct list_head lpfc_nvmet_ctx_list; struct list_head lpfc_abts_scsi_buf_list; struct list_head lpfc_abts_nvme_buf_list; struct lpfc_sglq **lpfc_sglq_active_list; @@ -662,8 +664,6 @@ struct lpfc_sli4_hba { uint16_t num_online_cpu; uint16_t num_present_cpu; uint16_t curr_disp_cpu; - - uint16_t nvmet_mrq_post_idx; }; enum lpfc_sge_type { From a8cf5dfeb4d84248c0ad12386ae0cb36ee21589a Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:46 -0700 Subject: [PATCH 045/254] scsi: lpfc: Added recovery logic for running out of NVMET IO context resources Previous logic would just drop the IO. Added logic to queue the IO to wait for an IO context resource from an IO thats already in progress. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 1 + drivers/scsi/lpfc/lpfc_attr.c | 6 ++ drivers/scsi/lpfc/lpfc_crtn.h | 2 + drivers/scsi/lpfc/lpfc_debugfs.c | 6 ++ drivers/scsi/lpfc/lpfc_init.c | 2 + drivers/scsi/lpfc/lpfc_nvmet.c | 138 ++++++++++++++++++++++++++----- drivers/scsi/lpfc/lpfc_sli.c | 7 +- drivers/scsi/lpfc/lpfc_sli4.h | 6 +- 8 files changed, 144 insertions(+), 24 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 72641b1d3ab8..c47bde6205c9 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -170,6 +170,7 @@ struct rqb_dmabuf { struct lpfc_dmabuf dbuf; uint16_t total_size; uint16_t bytes_recv; + uint16_t idx; struct lpfc_queue *hrq; /* ptr to associated Header RQ */ struct lpfc_queue *drq; /* ptr to associated Data RQ */ }; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 65264582915a..bb2d9e238225 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -245,6 +245,12 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, atomic_read(&tgtp->xmt_abort_rsp), atomic_read(&tgtp->xmt_abort_rsp_error)); + len += snprintf(buf + len, PAGE_SIZE - len, + "IO_CTX: %08x outstanding %08x total %x", + phba->sli4_hba.nvmet_ctx_cnt, + phba->sli4_hba.nvmet_io_wait_cnt, + phba->sli4_hba.nvmet_io_wait_total); + len += snprintf(buf+len, PAGE_SIZE-len, "\n"); return len; } diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index cc95abd130b4..8912767e7bc8 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -77,6 +77,8 @@ void lpfc_retry_pport_discovery(struct lpfc_hba *); void lpfc_release_rpi(struct lpfc_hba *, struct lpfc_vport *, uint16_t); int lpfc_init_iocb_list(struct lpfc_hba *phba, int cnt); void lpfc_free_iocb_list(struct lpfc_hba *phba); +int lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, + struct lpfc_queue *drq, int count, int idx); void lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_mbx_cmpl_reg_login(struct lpfc_hba *, LPFC_MBOXQ_t *); diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 7284533f4df2..c7d1c9d37a64 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -842,6 +842,12 @@ lpfc_debugfs_nvmestat_data(struct lpfc_vport *vport, char *buf, int size) } spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock); } + + len += snprintf(buf + len, size - len, + "IO_CTX: %08x outstanding %08x total %08x\n", + phba->sli4_hba.nvmet_ctx_cnt, + phba->sli4_hba.nvmet_io_wait_cnt, + phba->sli4_hba.nvmet_io_wait_total); } else { if (!(phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)) return len; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 86b0b26dfeea..9f6c7e71814b 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -5825,6 +5825,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvme_buf_list); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_ctx_list); + INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_io_wait_list); /* Fast-path XRI aborted CQ Event work queue list */ INIT_LIST_HEAD(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue); @@ -5833,6 +5834,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) /* This abort list used by worker thread */ spin_lock_init(&phba->sli4_hba.sgl_list_lock); spin_lock_init(&phba->sli4_hba.nvmet_io_lock); + spin_lock_init(&phba->sli4_hba.nvmet_io_wait_lock); /* * Initialize driver internal slow-path work queues diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index fcc77ae0c71c..312f54278bd4 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -158,6 +158,12 @@ void lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) { struct lpfc_nvmet_rcv_ctx *ctxp = ctx_buf->context; + struct lpfc_nvmet_tgtport *tgtp; + struct fc_frame_header *fc_hdr; + struct rqb_dmabuf *nvmebuf; + struct lpfc_dmabuf *hbufp; + uint32_t *payload; + uint32_t size, oxid, sid, rc; unsigned long iflag; if (ctxp->txrdy) { @@ -168,6 +174,87 @@ lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) } ctxp->state = LPFC_NVMET_STE_FREE; + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_wait_lock, iflag); + if (phba->sli4_hba.nvmet_io_wait_cnt) { + hbufp = &nvmebuf->hbuf; + list_remove_head(&phba->sli4_hba.lpfc_nvmet_io_wait_list, + nvmebuf, struct rqb_dmabuf, + hbuf.list); + phba->sli4_hba.nvmet_io_wait_cnt--; + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_wait_lock, + iflag); + + fc_hdr = (struct fc_frame_header *)(nvmebuf->hbuf.virt); + oxid = be16_to_cpu(fc_hdr->fh_ox_id); + tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; + payload = (uint32_t *)(nvmebuf->dbuf.virt); + size = nvmebuf->bytes_recv; + sid = sli4_sid_from_fc_hdr(fc_hdr); + + ctxp = (struct lpfc_nvmet_rcv_ctx *)ctx_buf->context; + memset(ctxp, 0, sizeof(ctxp->ctx)); + ctxp->wqeq = NULL; + ctxp->txrdy = NULL; + ctxp->offset = 0; + ctxp->phba = phba; + ctxp->size = size; + ctxp->oxid = oxid; + ctxp->sid = sid; + ctxp->state = LPFC_NVMET_STE_RCV; + ctxp->entry_cnt = 1; + ctxp->flag = 0; + ctxp->ctxbuf = ctx_buf; + spin_lock_init(&ctxp->ctxlock); + +#ifdef CONFIG_SCSI_LPFC_DEBUG_FS + if (phba->ktime_on) { + ctxp->ts_cmd_nvme = ktime_get_ns(); + ctxp->ts_isr_cmd = ctxp->ts_cmd_nvme; + ctxp->ts_nvme_data = 0; + ctxp->ts_data_wqput = 0; + ctxp->ts_isr_data = 0; + ctxp->ts_data_nvme = 0; + ctxp->ts_nvme_status = 0; + ctxp->ts_status_wqput = 0; + ctxp->ts_isr_status = 0; + ctxp->ts_status_nvme = 0; + } +#endif + atomic_inc(&tgtp->rcv_fcp_cmd_in); + /* + * The calling sequence should be: + * nvmet_fc_rcv_fcp_req->lpfc_nvmet_xmt_fcp_op/cmp- req->done + * lpfc_nvmet_xmt_fcp_op_cmp should free the allocated ctxp. + * When we return from nvmet_fc_rcv_fcp_req, all relevant info + * the NVME command / FC header is stored. + * A buffer has already been reposted for this IO, so just free + * the nvmebuf. + */ + rc = nvmet_fc_rcv_fcp_req(phba->targetport, &ctxp->ctx.fcp_req, + payload, size); + + /* Process FCP command */ + if (rc == 0) { + atomic_inc(&tgtp->rcv_fcp_cmd_out); + nvmebuf->hrq->rqbp->rqb_free_buffer(phba, nvmebuf); + return; + } + + atomic_inc(&tgtp->rcv_fcp_cmd_drop); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, + "2582 FCP Drop IO x%x: err x%x: x%x x%x x%x\n", + ctxp->oxid, rc, + atomic_read(&tgtp->rcv_fcp_cmd_in), + atomic_read(&tgtp->rcv_fcp_cmd_out), + atomic_read(&tgtp->xmt_fcp_release)); + + lpfc_nvmet_defer_release(phba, ctxp); + lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, sid, oxid); + nvmebuf->hrq->rqbp->rqb_free_buffer(phba, nvmebuf); + return; + } + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_wait_lock, iflag); + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_lock, iflag); list_add_tail(&ctx_buf->list, &phba->sli4_hba.lpfc_nvmet_ctx_list); @@ -1232,7 +1319,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr; struct lpfc_nvmet_ctxbuf *ctx_buf; uint32_t *payload; - uint32_t size, oxid, sid, rc; + uint32_t size, oxid, sid, rc, qno; unsigned long iflag; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS uint32_t id; @@ -1257,21 +1344,41 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, } spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_lock, iflag); + fc_hdr = (struct fc_frame_header *)(nvmebuf->hbuf.virt); + oxid = be16_to_cpu(fc_hdr->fh_ox_id); + size = nvmebuf->bytes_recv; + +#ifdef CONFIG_SCSI_LPFC_DEBUG_FS + if (phba->cpucheck_on & LPFC_CHECK_NVMET_RCV) { + id = smp_processor_id(); + if (id < LPFC_CHECK_CPU_CNT) + phba->cpucheck_rcv_io[id]++; + } +#endif + + lpfc_nvmeio_data(phba, "NVMET FCP RCV: xri x%x sz %d CPU %02x\n", + oxid, size, smp_processor_id()); + if (!ctx_buf) { - lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - "6408 No NVMET ctx Drop IO\n"); - oxid = 0; - size = 0; - sid = 0; - ctxp = NULL; - goto dropit; + /* Queue this NVME IO to process later */ + spin_lock_irqsave(&phba->sli4_hba.nvmet_io_wait_lock, iflag); + list_add_tail(&nvmebuf->hbuf.list, + &phba->sli4_hba.lpfc_nvmet_io_wait_list); + phba->sli4_hba.nvmet_io_wait_cnt++; + phba->sli4_hba.nvmet_io_wait_total++; + spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_wait_lock, + iflag); + + /* Post a brand new DMA buffer to RQ */ + qno = nvmebuf->idx; + lpfc_post_rq_buffer( + phba, phba->sli4_hba.nvmet_mrq_hdr[qno], + phba->sli4_hba.nvmet_mrq_data[qno], 1, qno); + return; } tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; payload = (uint32_t *)(nvmebuf->dbuf.virt); - fc_hdr = (struct fc_frame_header *)(nvmebuf->hbuf.virt); - size = nvmebuf->bytes_recv; - oxid = be16_to_cpu(fc_hdr->fh_ox_id); sid = sli4_sid_from_fc_hdr(fc_hdr); ctxp = (struct lpfc_nvmet_rcv_ctx *)ctx_buf->context; @@ -1302,17 +1409,8 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, ctxp->ts_isr_status = 0; ctxp->ts_status_nvme = 0; } - - if (phba->cpucheck_on & LPFC_CHECK_NVMET_RCV) { - id = smp_processor_id(); - if (id < LPFC_CHECK_CPU_CNT) - phba->cpucheck_rcv_io[id]++; - } #endif - lpfc_nvmeio_data(phba, "NVMET FCP RCV: xri x%x sz %d CPU %02x\n", - oxid, size, smp_processor_id()); - atomic_inc(&tgtp->rcv_fcp_cmd_in); /* * The calling sequence should be: diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index d68ee3ee299a..3fb4e715bfa2 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -6513,9 +6513,9 @@ lpfc_set_host_data(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox) (phba->hba_flag & HBA_FCOE_MODE) ? "FCoE" : "FC"); } -static int +int lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, - struct lpfc_queue *drq, int count) + struct lpfc_queue *drq, int count, int idx) { int rc, i; struct lpfc_rqe hrqe; @@ -6534,6 +6534,7 @@ lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq, break; rqb_buffer->hrq = hrq; rqb_buffer->drq = drq; + rqb_buffer->idx = idx; list_add_tail(&rqb_buffer->hbuf.list, &rqb_buf_list); } while (!list_empty(&rqb_buf_list)) { @@ -6980,7 +6981,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) lpfc_post_rq_buffer( phba, phba->sli4_hba.nvmet_mrq_hdr[i], phba->sli4_hba.nvmet_mrq_data[i], - LPFC_NVMET_RQE_DEF_COUNT); + LPFC_NVMET_RQE_DEF_COUNT, i); } } diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 19e2f190ea2e..c1c9a9125266 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -619,13 +619,16 @@ struct lpfc_sli4_hba { uint16_t els_xri_cnt; uint16_t nvmet_xri_cnt; uint16_t nvmet_ctx_cnt; + uint16_t nvmet_io_wait_cnt; + uint16_t nvmet_io_wait_total; struct list_head lpfc_els_sgl_list; struct list_head lpfc_abts_els_sgl_list; struct list_head lpfc_nvmet_sgl_list; struct list_head lpfc_abts_nvmet_ctx_list; - struct list_head lpfc_nvmet_ctx_list; struct list_head lpfc_abts_scsi_buf_list; struct list_head lpfc_abts_nvme_buf_list; + struct list_head lpfc_nvmet_ctx_list; + struct list_head lpfc_nvmet_io_wait_list; struct lpfc_sglq **lpfc_sglq_active_list; struct list_head lpfc_rpi_hdr_list; unsigned long *rpi_bmask; @@ -657,6 +660,7 @@ struct lpfc_sli4_hba { spinlock_t abts_scsi_buf_list_lock; /* list of aborted SCSI IOs */ spinlock_t sgl_list_lock; /* list of aborted els IOs */ spinlock_t nvmet_io_lock; + spinlock_t nvmet_io_wait_lock; /* IOs waiting for ctx resources */ uint32_t physical_port; /* CPU to vector mapping information */ From 82820f0cf19aa62e2608c2909bd44e7a68268ff5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:47 -0700 Subject: [PATCH 046/254] scsi: lpfc: Fix NVME I+T not registering NVME as a supported FC4 type When the driver send the RPA command, it does not send supported FC4 Type NVME to the management server. Encode NVME (type x28) in the AttribEntry in the RPA command. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_ct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index c7962dae4dab..f2cd19c6c2df 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -2092,6 +2092,7 @@ lpfc_fdmi_port_attr_fc4type(struct lpfc_vport *vport, ae->un.AttrTypes[3] = 0x02; /* Type 1 - ELS */ ae->un.AttrTypes[2] = 0x01; /* Type 8 - FCP */ + ae->un.AttrTypes[6] = 0x01; /* Type 40 - NVME */ ae->un.AttrTypes[7] = 0x01; /* Type 32 - CT */ size = FOURBYTES + 32; ad->AttrLen = cpu_to_be16(size); From 667a7662529bf0afb1d84a32ceb0da0a875a3b6c Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:48 -0700 Subject: [PATCH 047/254] scsi: lpfc: Fix debugfs root inode "lpfc" not getting deleted on driver unload. When unloading and reloading the driver, the driver fails to recreate the lpfc root inode in the debugfs tree. The driver is incorrectly removing the lpfc root inode in lpfc_debugfs_terminate in the first driver instance that unloads and then sets the lpfc_debugfs_root global parameter to NULL. When the final driver instance unloads, the debugfs calls quietly ignore the remove on a NULL pointer. The bug is that the debugfs_remove call returns void so the driver doesn't know to correctly set the global parameter to NULL. Base the debugfs_remove of the lpfc_debugfs_root parameter on lpfc_debugfs_hba_count because this parameter tracks the fnX instance tracked per driver instance. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_debugfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index c7d1c9d37a64..4bcb92c844ca 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -5866,8 +5866,10 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport) atomic_dec(&lpfc_debugfs_hba_count); } - debugfs_remove(lpfc_debugfs_root); /* lpfc */ - lpfc_debugfs_root = NULL; + if (atomic_read(&lpfc_debugfs_hba_count) == 0) { + debugfs_remove(lpfc_debugfs_root); /* lpfc */ + lpfc_debugfs_root = NULL; + } } #endif return; From 64eb4dcb140a7c5547f6e965fb471b1b75c01108 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:49 -0700 Subject: [PATCH 048/254] scsi: lpfc: Cleanup entry_repost settings on SLI4 queues Too many work items being processed in IRQ context take a lot of CPU time and cause problems. With a recent change, we get out of the ISR after hitting entry_repost work items on a queue. However, the actual values for entry repost are still high. EQ is 128 and CQ is 128, this could translate into processing 128 * 128 (16384) work items under IRQ context. Set entry_repost in the actual queue creation routine now. Limit EQ repost to 8 and CQ repost to 64 to further limit the amount of time spent in the IRQ. Fix fof IRQ routines as well. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_sli.c | 19 ++++++++----------- drivers/scsi/lpfc/lpfc_sli4.h | 6 ++++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 3fb4e715bfa2..903c06ff828a 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -13922,17 +13922,10 @@ lpfc_sli4_queue_alloc(struct lpfc_hba *phba, uint32_t entry_size, } queue->entry_size = entry_size; queue->entry_count = entry_count; - - /* - * entry_repost is calculated based on the number of entries in the - * queue. This works out except for RQs. If buffers are NOT initially - * posted for every RQE, entry_repost should be adjusted accordingly. - */ - queue->entry_repost = (entry_count >> 3); - if (queue->entry_repost < LPFC_QUEUE_MIN_REPOST) - queue->entry_repost = LPFC_QUEUE_MIN_REPOST; queue->phba = phba; + /* entry_repost will be set during q creation */ + return queue; out_fail: lpfc_sli4_queue_free(queue); @@ -14163,6 +14156,7 @@ lpfc_eq_create(struct lpfc_hba *phba, struct lpfc_queue *eq, uint32_t imax) status = -ENXIO; eq->host_index = 0; eq->hba_index = 0; + eq->entry_repost = LPFC_EQ_REPOST; mempool_free(mbox, phba->mbox_mem_pool); return status; @@ -14236,9 +14230,9 @@ lpfc_cq_create(struct lpfc_hba *phba, struct lpfc_queue *cq, default: lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "0361 Unsupported CQ count: " - "entry cnt %d sz %d pg cnt %d repost %d\n", + "entry cnt %d sz %d pg cnt %d\n", cq->entry_count, cq->entry_size, - cq->page_count, cq->entry_repost); + cq->page_count); if (cq->entry_count < 256) { status = -EINVAL; goto out; @@ -14291,6 +14285,7 @@ lpfc_cq_create(struct lpfc_hba *phba, struct lpfc_queue *cq, cq->assoc_qid = eq->queue_id; cq->host_index = 0; cq->hba_index = 0; + cq->entry_repost = LPFC_CQ_REPOST; out: mempool_free(mbox, phba->mbox_mem_pool); @@ -14482,6 +14477,7 @@ lpfc_cq_create_set(struct lpfc_hba *phba, struct lpfc_queue **cqp, cq->assoc_qid = eq->queue_id; cq->host_index = 0; cq->hba_index = 0; + cq->entry_repost = LPFC_CQ_REPOST; rc = 0; list_for_each_entry(dmabuf, &cq->page_list, list) { @@ -14730,6 +14726,7 @@ lpfc_mq_create(struct lpfc_hba *phba, struct lpfc_queue *mq, mq->subtype = subtype; mq->host_index = 0; mq->hba_index = 0; + mq->entry_repost = LPFC_MQ_REPOST; /* link the mq onto the parent cq child list */ list_add_tail(&mq->list, &cq->child_list); diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index c1c9a9125266..cf863db27700 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -24,7 +24,6 @@ #define LPFC_XRI_EXCH_BUSY_WAIT_TMO 10000 #define LPFC_XRI_EXCH_BUSY_WAIT_T1 10 #define LPFC_XRI_EXCH_BUSY_WAIT_T2 30000 -#define LPFC_RELEASE_NOTIFICATION_INTERVAL 32 #define LPFC_RPI_LOW_WATER_MARK 10 #define LPFC_UNREG_FCF 1 @@ -155,8 +154,11 @@ struct lpfc_queue { uint32_t entry_count; /* Number of entries to support on the queue */ uint32_t entry_size; /* Size of each queue entry. */ uint32_t entry_repost; /* Count of entries before doorbell is rung */ -#define LPFC_QUEUE_MIN_REPOST 8 +#define LPFC_EQ_REPOST 8 +#define LPFC_MQ_REPOST 8 +#define LPFC_CQ_REPOST 64 #define LPFC_RQ_REPOST 64 +#define LPFC_RELEASE_NOTIFICATION_INTERVAL 32 /* For WQs */ uint32_t queue_id; /* Queue ID assigned by the hardware */ uint32_t assoc_qid; /* Queue ID associated with, for CQ/WQ/MQ */ uint32_t page_count; /* Number of pages allocated for this queue */ From dc53a61852279f25909d99dad4638b4aee0b2d82 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:50 -0700 Subject: [PATCH 049/254] scsi: lpfc: Fix NVMEI's handling of NVMET's PRLI response attributes Code review of NVMEI's FC_PORT_ROLE_NVME_DISCOVERY looked wrong. Discussions with storage architecture team clarified NVMEI's audit of the PRLI response port roles. Following up discussion with code review showed a few minor corrections were required - especially in anticipation of NVME auto discovery. During PRLI, NVMEI should sent prli_init - which it it does. NVMET should send prli_tgt and prli_disc - which it does. When NVMEI receives a PRLI Response now, it audits the incoming target bits and stores the attributes in the corresponding NDLP. Later, when NVMEI registers the NVME rport, it uses the stored ndlp attributes to set the rport port_roles correctly. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_disc.h | 1 + drivers/scsi/lpfc/lpfc_nportdisc.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_disc.h b/drivers/scsi/lpfc/lpfc_disc.h index 9d5a379f4b15..094c97b9e5f7 100644 --- a/drivers/scsi/lpfc/lpfc_disc.h +++ b/drivers/scsi/lpfc/lpfc_disc.h @@ -90,6 +90,7 @@ struct lpfc_nodelist { #define NLP_FCP_INITIATOR 0x10 /* entry is an FCP Initiator */ #define NLP_NVME_TARGET 0x20 /* entry is a NVME Target */ #define NLP_NVME_INITIATOR 0x40 /* entry is a NVME Initiator */ +#define NLP_NVME_DISCOVERY 0x80 /* entry has NVME disc srvc */ uint16_t nlp_fc4_type; /* FC types node supports. */ /* Assigned from GID_FF, only diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index 8777c2d5f50d..bff3de053df4 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -1944,7 +1944,13 @@ lpfc_cmpl_prli_prli_issue(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, /* Target driver cannot solicit NVME FB. */ if (bf_get_be32(prli_tgt, nvpr)) { + /* Complete the nvme target roles. The transport + * needs to know if the rport is capable of + * discovery in addition to its role. + */ ndlp->nlp_type |= NLP_NVME_TARGET; + if (bf_get_be32(prli_disc, nvpr)) + ndlp->nlp_type |= NLP_NVME_DISCOVERY; if ((bf_get_be32(prli_fba, nvpr) == 1) && (bf_get_be32(prli_fb_sz, nvpr) > 0) && (phba->cfg_nvme_enable_fb) && From ae9e28f36a6cca4e5760f4927b70b6c9e588db1a Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:51 -0700 Subject: [PATCH 050/254] scsi: lpfc: Add MDS Diagnostic support. Added code to support Cisco MDS loopback diagnostic. The diagnostics run various loopbacks including one which loops-back frame through the driver. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 2 + drivers/scsi/lpfc/lpfc_els.c | 7 ++ drivers/scsi/lpfc/lpfc_hbadisc.c | 3 +- drivers/scsi/lpfc/lpfc_hw4.h | 15 +++- drivers/scsi/lpfc/lpfc_init.c | 13 +++ drivers/scsi/lpfc/lpfc_sli.c | 131 +++++++++++++++++++++++++++++-- 6 files changed, 161 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index c47bde6205c9..f2c0ba6ced78 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -675,6 +675,8 @@ struct lpfc_hba { /* INIT_LINK mailbox command */ #define LS_NPIV_FAB_SUPPORTED 0x2 /* Fabric supports NPIV */ #define LS_IGNORE_ERATT 0x4 /* intr handler should ignore ERATT */ +#define LS_MDS_LINK_DOWN 0x8 /* MDS Diagnostics Link Down */ +#define LS_MDS_LOOPBACK 0x16 /* MDS Diagnostics Link Up (Loopback) */ uint32_t hba_flag; /* hba generic flags */ #define HBA_ERATT_HANDLED 0x1 /* This flag is set when eratt handled */ diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 3085895464d9..1d36f82fa369 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -1047,6 +1047,13 @@ stop_rr_fcf_flogi: irsp->ulpStatus, irsp->un.ulpWord[4], irsp->ulpTimeout); + + /* If this is not a loop open failure, bail out */ + if (!(irsp->ulpStatus == IOSTAT_LOCAL_REJECT && + ((irsp->un.ulpWord[4] & IOERR_PARAM_MASK) == + IOERR_LOOP_OPEN_FAILURE))) + goto flogifail; + /* FLOGI failed, so there is no fabric */ spin_lock_irq(shost->host_lock); vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP); diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index dcc9b3858778..3ffcd9215ca8 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -701,7 +701,8 @@ lpfc_work_done(struct lpfc_hba *phba) /* Set the lpfc data pending flag */ set_bit(LPFC_DATA_READY, &phba->data_flags); } else { - if (phba->link_state >= LPFC_LINK_UP) { + if (phba->link_state >= LPFC_LINK_UP || + phba->link_flag & LS_MDS_LOOPBACK) { pring->flag &= ~LPFC_DEFERRED_RING_EVENT; lpfc_sli_handle_slow_ring_event(phba, pring, (status & diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index df97c6b7433b..e0a5fce416ae 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -4421,6 +4421,19 @@ struct fcp_treceive64_wqe { }; #define TXRDY_PAYLOAD_LEN 12 +#define CMD_SEND_FRAME 0xE1 + +struct send_frame_wqe { + struct ulp_bde64 bde; /* words 0-2 */ + uint32_t frame_len; /* word 3 */ + uint32_t fc_hdr_wd0; /* word 4 */ + uint32_t fc_hdr_wd1; /* word 5 */ + struct wqe_common wqe_com; /* words 6-11 */ + uint32_t fc_hdr_wd2; /* word 12 */ + uint32_t fc_hdr_wd3; /* word 13 */ + uint32_t fc_hdr_wd4; /* word 14 */ + uint32_t fc_hdr_wd5; /* word 15 */ +}; union lpfc_wqe { uint32_t words[16]; @@ -4439,7 +4452,7 @@ union lpfc_wqe { struct fcp_trsp64_wqe fcp_trsp; struct fcp_tsend64_wqe fcp_tsend; struct fcp_treceive64_wqe fcp_treceive; - + struct send_frame_wqe send_frame; }; union lpfc_wqe128 { diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 9f6c7e71814b..9add9473cae5 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -4540,6 +4540,19 @@ lpfc_sli4_async_fc_evt(struct lpfc_hba *phba, struct lpfc_acqe_fc_la *acqe_fc) pmb->vport = phba->pport; if (phba->sli4_hba.link_state.status != LPFC_FC_LA_TYPE_LINK_UP) { + phba->link_flag &= ~(LS_MDS_LINK_DOWN | LS_MDS_LOOPBACK); + + switch (phba->sli4_hba.link_state.status) { + case LPFC_FC_LA_TYPE_MDS_LINK_DOWN: + phba->link_flag |= LS_MDS_LINK_DOWN; + break; + case LPFC_FC_LA_TYPE_MDS_LOOPBACK: + phba->link_flag |= LS_MDS_LOOPBACK; + break; + default: + break; + } + /* Parse and translate status field */ mb = &pmb->u.mb; mb->mbxStatus = lpfc_sli4_parse_latt_fault(phba, diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 903c06ff828a..d6b184839bc2 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -74,6 +74,8 @@ static struct lpfc_iocbq *lpfc_sli4_els_wcqe_to_rspiocbq(struct lpfc_hba *, struct lpfc_iocbq *); static void lpfc_sli4_send_seq_to_ulp(struct lpfc_vport *, struct hbq_dmabuf *); +static void lpfc_sli4_handle_mds_loopback(struct lpfc_vport *vport, + struct hbq_dmabuf *dmabuf); static int lpfc_sli4_fp_handle_cqe(struct lpfc_hba *, struct lpfc_queue *, struct lpfc_cqe *); static int lpfc_sli4_post_sgl_list(struct lpfc_hba *, struct list_head *, @@ -5907,7 +5909,7 @@ lpfc_set_features(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox, bf_set(lpfc_mbx_set_feature_mds, &mbox->u.mqe.un.set_feature, 1); bf_set(lpfc_mbx_set_feature_mds_deep_loopbk, - &mbox->u.mqe.un.set_feature, 0); + &mbox->u.mqe.un.set_feature, 1); mbox->u.mqe.un.set_feature.feature = LPFC_SET_MDS_DIAGS; mbox->u.mqe.un.set_feature.param_len = 8; break; @@ -8688,8 +8690,11 @@ lpfc_sli4_iocb2wqe(struct lpfc_hba *phba, struct lpfc_iocbq *iocbq, memset(wqe, 0, sizeof(union lpfc_wqe128)); /* Some of the fields are in the right position already */ memcpy(wqe, &iocbq->iocb, sizeof(union lpfc_wqe)); - wqe->generic.wqe_com.word7 = 0; /* The ct field has moved so reset */ - wqe->generic.wqe_com.word10 = 0; + if (iocbq->iocb.ulpCommand != CMD_SEND_FRAME) { + /* The ct field has moved so reset */ + wqe->generic.wqe_com.word7 = 0; + wqe->generic.wqe_com.word10 = 0; + } abort_tag = (uint32_t) iocbq->iotag; xritag = iocbq->sli4_xritag; @@ -9183,6 +9188,10 @@ lpfc_sli4_iocb2wqe(struct lpfc_hba *phba, struct lpfc_iocbq *iocbq, } break; + case CMD_SEND_FRAME: + bf_set(wqe_xri_tag, &wqe->generic.wqe_com, xritag); + bf_set(wqe_reqtag, &wqe->generic.wqe_com, iocbq->iotag); + return 0; case CMD_XRI_ABORTED_CX: case CMD_CREATE_XRI_CR: /* Do we expect to use this? */ case CMD_IOCB_FCP_IBIDIR64_CR: /* bidirectional xfer */ @@ -16137,6 +16146,8 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) struct fc_vft_header *fc_vft_hdr; uint32_t *header = (uint32_t *) fc_hdr; +#define FC_RCTL_MDS_DIAGS 0xF4 + switch (fc_hdr->fh_r_ctl) { case FC_RCTL_DD_UNCAT: /* uncategorized information */ case FC_RCTL_DD_SOL_DATA: /* solicited data */ @@ -16164,6 +16175,7 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) case FC_RCTL_F_BSY: /* fabric busy to data frame */ case FC_RCTL_F_BSYL: /* fabric busy to link control frame */ case FC_RCTL_LCR: /* link credit reset */ + case FC_RCTL_MDS_DIAGS: /* MDS Diagnostics */ case FC_RCTL_END: /* end */ break; case FC_RCTL_VFTH: /* Virtual Fabric tagging Header */ @@ -16173,12 +16185,16 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) default: goto drop; } + +#define FC_TYPE_VENDOR_UNIQUE 0xFF + switch (fc_hdr->fh_type) { case FC_TYPE_BLS: case FC_TYPE_ELS: case FC_TYPE_FCP: case FC_TYPE_CT: case FC_TYPE_NVME: + case FC_TYPE_VENDOR_UNIQUE: break; case FC_TYPE_IP: case FC_TYPE_ILS: @@ -16189,12 +16205,14 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) lpfc_printf_log(phba, KERN_INFO, LOG_ELS, "2538 Received frame rctl:%s (x%x), type:%s (x%x), " "frame Data:%08x %08x %08x %08x %08x %08x %08x\n", + (fc_hdr->fh_r_ctl == FC_RCTL_MDS_DIAGS) ? "MDS Diags" : lpfc_rctl_names[fc_hdr->fh_r_ctl], fc_hdr->fh_r_ctl, - lpfc_type_names[fc_hdr->fh_type], fc_hdr->fh_type, - be32_to_cpu(header[0]), be32_to_cpu(header[1]), - be32_to_cpu(header[2]), be32_to_cpu(header[3]), - be32_to_cpu(header[4]), be32_to_cpu(header[5]), - be32_to_cpu(header[6])); + (fc_hdr->fh_type == FC_TYPE_VENDOR_UNIQUE) ? + "Vendor Unique" : lpfc_type_names[fc_hdr->fh_type], + fc_hdr->fh_type, be32_to_cpu(header[0]), + be32_to_cpu(header[1]), be32_to_cpu(header[2]), + be32_to_cpu(header[3]), be32_to_cpu(header[4]), + be32_to_cpu(header[5]), be32_to_cpu(header[6])); return 0; drop: lpfc_printf_log(phba, KERN_WARNING, LOG_ELS, @@ -17000,6 +17018,96 @@ lpfc_sli4_send_seq_to_ulp(struct lpfc_vport *vport, lpfc_sli_release_iocbq(phba, iocbq); } +static void +lpfc_sli4_mds_loopback_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, + struct lpfc_iocbq *rspiocb) +{ + struct lpfc_dmabuf *pcmd = cmdiocb->context2; + + if (pcmd && pcmd->virt) + pci_pool_free(phba->lpfc_drb_pool, pcmd->virt, pcmd->phys); + kfree(pcmd); + lpfc_sli_release_iocbq(phba, cmdiocb); +} + +static void +lpfc_sli4_handle_mds_loopback(struct lpfc_vport *vport, + struct hbq_dmabuf *dmabuf) +{ + struct fc_frame_header *fc_hdr; + struct lpfc_hba *phba = vport->phba; + struct lpfc_iocbq *iocbq = NULL; + union lpfc_wqe *wqe; + struct lpfc_dmabuf *pcmd = NULL; + uint32_t frame_len; + int rc; + + fc_hdr = (struct fc_frame_header *)dmabuf->hbuf.virt; + frame_len = bf_get(lpfc_rcqe_length, &dmabuf->cq_event.cqe.rcqe_cmpl); + + /* Send the received frame back */ + iocbq = lpfc_sli_get_iocbq(phba); + if (!iocbq) + goto exit; + + /* Allocate buffer for command payload */ + pcmd = kmalloc(sizeof(struct lpfc_dmabuf), GFP_KERNEL); + if (pcmd) + pcmd->virt = pci_pool_alloc(phba->lpfc_drb_pool, GFP_KERNEL, + &pcmd->phys); + if (!pcmd || !pcmd->virt) + goto exit; + + INIT_LIST_HEAD(&pcmd->list); + + /* copyin the payload */ + memcpy(pcmd->virt, dmabuf->dbuf.virt, frame_len); + + /* fill in BDE's for command */ + iocbq->iocb.un.xseq64.bdl.addrHigh = putPaddrHigh(pcmd->phys); + iocbq->iocb.un.xseq64.bdl.addrLow = putPaddrLow(pcmd->phys); + iocbq->iocb.un.xseq64.bdl.bdeFlags = BUFF_TYPE_BDE_64; + iocbq->iocb.un.xseq64.bdl.bdeSize = frame_len; + + iocbq->context2 = pcmd; + iocbq->vport = vport; + iocbq->iocb_flag &= ~LPFC_FIP_ELS_ID_MASK; + iocbq->iocb_flag |= LPFC_USE_FCPWQIDX; + + /* + * Setup rest of the iocb as though it were a WQE + * Build the SEND_FRAME WQE + */ + wqe = (union lpfc_wqe *)&iocbq->iocb; + + wqe->send_frame.frame_len = frame_len; + wqe->send_frame.fc_hdr_wd0 = be32_to_cpu(*((uint32_t *)fc_hdr)); + wqe->send_frame.fc_hdr_wd1 = be32_to_cpu(*((uint32_t *)fc_hdr + 1)); + wqe->send_frame.fc_hdr_wd2 = be32_to_cpu(*((uint32_t *)fc_hdr + 2)); + wqe->send_frame.fc_hdr_wd3 = be32_to_cpu(*((uint32_t *)fc_hdr + 3)); + wqe->send_frame.fc_hdr_wd4 = be32_to_cpu(*((uint32_t *)fc_hdr + 4)); + wqe->send_frame.fc_hdr_wd5 = be32_to_cpu(*((uint32_t *)fc_hdr + 5)); + + iocbq->iocb.ulpCommand = CMD_SEND_FRAME; + iocbq->iocb.ulpLe = 1; + iocbq->iocb_cmpl = lpfc_sli4_mds_loopback_cmpl; + rc = lpfc_sli_issue_iocb(phba, LPFC_ELS_RING, iocbq, 0); + if (rc == IOCB_ERROR) + goto exit; + + lpfc_in_buf_free(phba, &dmabuf->dbuf); + return; + +exit: + lpfc_printf_log(phba, KERN_WARNING, LOG_SLI, + "2023 Unable to process MDS loopback frame\n"); + if (pcmd && pcmd->virt) + pci_pool_free(phba->lpfc_drb_pool, pcmd->virt, pcmd->phys); + kfree(pcmd); + lpfc_sli_release_iocbq(phba, iocbq); + lpfc_in_buf_free(phba, &dmabuf->dbuf); +} + /** * lpfc_sli4_handle_received_buffer - Handle received buffers from firmware * @phba: Pointer to HBA context object. @@ -17038,6 +17146,13 @@ lpfc_sli4_handle_received_buffer(struct lpfc_hba *phba, fcfi = bf_get(lpfc_rcqe_fcf_id, &dmabuf->cq_event.cqe.rcqe_cmpl); + if (fc_hdr->fh_r_ctl == 0xF4 && fc_hdr->fh_type == 0xFF) { + vport = phba->pport; + /* Handle MDS Loopback frames */ + lpfc_sli4_handle_mds_loopback(vport, dmabuf); + return; + } + /* d_id this frame is directed to */ did = sli4_did_from_fc_hdr(fc_hdr); From 2848e1d503d60955ff51ae9ec8d5eada6bd9ba6d Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 15:20:52 -0700 Subject: [PATCH 051/254] scsi: lpfc: update version to 11.2.0.14 Change driver version to 11.2.0.14. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h index 1c26dc67151b..c2653244221c 100644 --- a/drivers/scsi/lpfc/lpfc_version.h +++ b/drivers/scsi/lpfc/lpfc_version.h @@ -20,7 +20,7 @@ * included with this package. * *******************************************************************/ -#define LPFC_DRIVER_VERSION "11.2.0.12" +#define LPFC_DRIVER_VERSION "11.2.0.14" #define LPFC_DRIVER_NAME "lpfc" /* Used for SLI 2/3 */ From 5667c86acf021e6dcf02584408b4484a273ac68f Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Sun, 14 May 2017 21:41:55 -0700 Subject: [PATCH 052/254] mac80211: strictly check mesh address extension mode Mesh forwarding path checks for address extension mode to fetch appropriate proxied address and MPP address. Existing condition that looks for 6 address format is not strict enough so that frames with improper values are processed and invalid entries are added into MPP table. Fix that by adding a stricter check before processing the packet. Per IEEE Std 802.11s-2011 spec. Table 7-6g1 lists address extension mode 0x3 as reserved one. And also Table Table 9-13 does not specify 0x3 as valid address field. Fixes: 9b395bc3be1c ("mac80211: verify that skb data is present") Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 ++- net/wireless/util.c | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 35f4c7d7a500..1f75280ba26c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2492,7 +2492,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(hdr->addr1)) { mpp_addr = hdr->addr3; proxied_addr = mesh_hdr->eaddr1; - } else if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6) { + } else if ((mesh_hdr->flags & MESH_FLAGS_AE) == + MESH_FLAGS_AE_A5_A6) { /* has_a4 already checked in ieee80211_rx_mesh_check */ mpp_addr = hdr->addr4; proxied_addr = mesh_hdr->eaddr2; diff --git a/net/wireless/util.c b/net/wireless/util.c index 7198373e2920..4992f1025c9d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -454,6 +454,8 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (iftype == NL80211_IFTYPE_MESH_POINT) skb_copy_bits(skb, hdrlen, &mesh_flags, 1); + mesh_flags &= MESH_FLAGS_AE; + switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): @@ -469,9 +471,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, iftype != NL80211_IFTYPE_STATION)) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - if (mesh_flags & MESH_FLAGS_AE_A4) + if (mesh_flags == MESH_FLAGS_AE_A4) return -1; - if (mesh_flags & MESH_FLAGS_AE_A5_A6) { + if (mesh_flags == MESH_FLAGS_AE_A5_A6) { skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), tmp.h_dest, 2 * ETH_ALEN); @@ -487,9 +489,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, ether_addr_equal(tmp.h_source, addr))) return -1; if (iftype == NL80211_IFTYPE_MESH_POINT) { - if (mesh_flags & MESH_FLAGS_AE_A5_A6) + if (mesh_flags == MESH_FLAGS_AE_A5_A6) return -1; - if (mesh_flags & MESH_FLAGS_AE_A4) + if (mesh_flags == MESH_FLAGS_AE_A4) skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), tmp.h_source, ETH_ALEN); From 53cf29d3b1bc5b86fcff5fdc52f873d79d908ef4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 17 May 2017 19:02:17 -0300 Subject: [PATCH 053/254] scsi: lpfc: Fix NULL pointer dereference during PCI error recovery Recent commit on patchset "lpfc updates for 11.2.0.14" fixed an issue about dereferencing a NULL pointer on port reset. The specific commit, named "lpfc: Fix system crash when port is reset.", is missing a check against NULL pointer on lpfc_els_flush_cmd() though. Since we destroy the queues on adapter resets, like in PCI error recovery path, we need the validation present on this patch in order to avoid a NULL pointer dereference when trying to flush commands of ELS wq, after it has been destroyed (which would lead to a kernel oops). Tested-by: Raphael Silva Signed-off-by: Guilherme G. Piccoli Acked-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 1d36f82fa369..8e532b39ae93 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -7451,6 +7451,13 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport) */ spin_lock_irq(&phba->hbalock); pring = lpfc_phba_elsring(phba); + + /* Bail out if we've no ELS wq, like in PCI error recovery case. */ + if (unlikely(!pring)) { + spin_unlock_irq(&phba->hbalock); + return; + } + if (phba->sli_rev == LPFC_SLI_REV4) spin_lock(&pring->ring_lock); From eeeb51d834d76c66784e7fe1a9ace3ce3f8d2af1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 16 May 2017 20:52:29 -0700 Subject: [PATCH 054/254] scsi: lpfc: fix build issue if NVME_FC_TARGET is not defined fix build issue if NVME_FC_TARGET is not defined. noop the code. The code will never be invoked if target mode is not enabled. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_nvmet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 312f54278bd4..f94294b77b7b 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -157,6 +157,7 @@ out: void lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) { +#if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) struct lpfc_nvmet_rcv_ctx *ctxp = ctx_buf->context; struct lpfc_nvmet_tgtport *tgtp; struct fc_frame_header *fc_hdr; @@ -260,6 +261,7 @@ lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf) &phba->sli4_hba.lpfc_nvmet_ctx_list); phba->sli4_hba.nvmet_ctx_cnt++; spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_lock, iflag); +#endif } #ifdef CONFIG_SCSI_LPFC_DEBUG_FS From 9933e113c2e87a9f46a40fde8dafbf801dca1ab9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 May 2017 03:48:23 +0800 Subject: [PATCH 055/254] crypto: skcipher - Add missing API setkey checks The API setkey checks for key sizes and alignment went AWOL during the skcipher conversion. This patch restores them. Cc: Fixes: 4e6c3df4d729 ("crypto: skcipher - Add low-level skcipher...") Reported-by: Baozeng Signed-off-by: Herbert Xu --- crypto/skcipher.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 014af741fc6a..4faa0fd53b0c 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -764,6 +764,44 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) return 0; } +static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + unsigned long alignmask = crypto_skcipher_alignmask(tfm); + struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); + u8 *buffer, *alignbuffer; + unsigned long absize; + int ret; + + absize = keylen + alignmask; + buffer = kmalloc(absize, GFP_ATOMIC); + if (!buffer) + return -ENOMEM; + + alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); + memcpy(alignbuffer, key, keylen); + ret = cipher->setkey(tfm, alignbuffer, keylen); + kzfree(buffer); + return ret; +} + +static int skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); + unsigned long alignmask = crypto_skcipher_alignmask(tfm); + + if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) { + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + if ((unsigned long)key & alignmask) + return skcipher_setkey_unaligned(tfm, key, keylen); + + return cipher->setkey(tfm, key, keylen); +} + static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); @@ -784,7 +822,7 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) tfm->__crt_alg->cra_type == &crypto_givcipher_type) return crypto_init_skcipher_ops_ablkcipher(tfm); - skcipher->setkey = alg->setkey; + skcipher->setkey = skcipher_setkey; skcipher->encrypt = alg->encrypt; skcipher->decrypt = alg->decrypt; skcipher->ivsize = alg->ivsize; From 751a9c763849f5859cb69ea44b0430d00672f637 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 17 May 2017 11:24:47 -0400 Subject: [PATCH 056/254] netfilter: xtables: fix build failure from COMPAT_XT_ALIGN outside CONFIG_COMPAT The patch in the Fixes references COMPAT_XT_ALIGN in the definition of XT_DATA_TO_USER, outside an #ifdef CONFIG_COMPAT block. Split XT_DATA_TO_USER into separate compat and non compat variants and define the first inside an CONFIG_COMPAT block. This simplifies both variants by removing branches inside the macro. Fixes: 324318f0248c ("netfilter: xtables: zero padding in data_to_user") Reported-by: Stephen Rothwell Signed-off-by: Willem de Bruijn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d17769599c10..1770c1d9b37f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -296,18 +296,17 @@ int xt_data_to_user(void __user *dst, const void *src, } EXPORT_SYMBOL_GPL(xt_data_to_user); -#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ +#define XT_DATA_TO_USER(U, K, TYPE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ - C_SIZE ? : K->u.kernel.TYPE->TYPE##size, \ - C_SIZE ? COMPAT_XT_ALIGN(C_SIZE) : \ - XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) + K->u.kernel.TYPE->TYPE##size, \ + XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u) { return XT_OBJ_TO_USER(u, m, match, 0) || - XT_DATA_TO_USER(u, m, match, 0); + XT_DATA_TO_USER(u, m, match); } EXPORT_SYMBOL_GPL(xt_match_to_user); @@ -315,7 +314,7 @@ int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u) { return XT_OBJ_TO_USER(u, t, target, 0) || - XT_DATA_TO_USER(u, t, target, 0); + XT_DATA_TO_USER(u, t, target); } EXPORT_SYMBOL_GPL(xt_target_to_user); @@ -614,6 +613,12 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, } EXPORT_SYMBOL_GPL(xt_compat_match_from_user); +#define COMPAT_XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ + xt_data_to_user(U->data, K->data, \ + K->u.kernel.TYPE->usersize, \ + C_SIZE, \ + COMPAT_XT_ALIGN(C_SIZE)) + int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size) { @@ -629,7 +634,7 @@ int xt_compat_match_to_user(const struct xt_entry_match *m, if (match->compat_to_user((void __user *)cm->data, m->data)) return -EFAULT; } else { - if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) + if (COMPAT_XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) return -EFAULT; } @@ -975,7 +980,7 @@ int xt_compat_target_to_user(const struct xt_entry_target *t, if (target->compat_to_user((void __user *)ct->data, t->data)) return -EFAULT; } else { - if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) + if (COMPAT_XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) return -EFAULT; } From 4c4fc90964b1cf205a67df566cc82ea1731bcb00 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 18 May 2017 12:29:55 +0100 Subject: [PATCH 057/254] drivers/tty: 8250: only call fintek_8250_probe when doing port I/O Commit fa01e2ca9f53 ("serial: 8250: Integrate Fintek into 8250_base") modified the probing logic for PNP0501 devices, to remove a collision between the generic 16550A driver and the Fintek driver, which reused the same ACPI _HID. The Fintek device probe is now incorporated into the common 8250 probe path, and gets called for all discovered 16550A compatible devices, including ones that are MMIO mapped rather than IO mapped. However, the Fintek driver assumes the port base is a I/O address, and proceeds to probe some arbitrary offsets above it. This is generally a wrong thing to do, but on ARM systems (having no native port I/O), this may result in faulting accesses of completely unrelated MMIO regions in the PCI I/O space. Given that this is at serial probe time, this results in hard to diagnose crashes at boot. So let's restrict the Fintek probe to devices that we know are using port I/O in the first place. Fixes: fa01e2ca9f53 ("serial: 8250: Integrate Fintek into 8250_base") Suggested-by: Arnd Bergmann Reviewed-by: Ricardo Ribalda Signed-off-by: Ard Biesheuvel Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 09a65a3ec7f7..e7765f010fe8 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1337,7 +1337,7 @@ out_lock: /* * Check if the device is a Fintek F81216A */ - if (port->type == PORT_16550A) + if (port->type == PORT_16550A && port->iotype == UPIO_PORT) fintek_8250_probe(up); if (up->capabilities != old_capabilities) { From d3ba126a226a6b6da021ebfea444a2a807cde945 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 11 Apr 2017 19:07:28 +0200 Subject: [PATCH 058/254] Revert "tty_port: register tty ports with serdev bus" This reverts commit 8ee3fde047589dc9c201251f07d0ca1dc776feca. The new serdev bus hooked into the tty layer in tty_port_register_device() by registering a serdev controller instead of a tty device whenever a serdev client is present, and by deregistering the controller in the tty-port destructor. This is broken in several ways: Firstly, it leads to a NULL-pointer dereference whenever a tty driver later deregisters its devices as no corresponding character device will exist. Secondly, far from every tty driver uses tty-port refcounting (e.g. serial core) so the serdev devices might never be deregistered or deallocated. Thirdly, deregistering at tty-port destruction is too late as the underlying device and structures may be long gone by then. A port is not released before an open tty device is closed, something which a registered serdev client can prevent from ever happening. A driver callback while the device is gone typically also leads to crashes. Many tty drivers even keep their ports around until the driver is unloaded (e.g. serial core), something which even if a late callback never happens, leads to leaks if a device is unbound from its driver and is later rebound. The right solution here is to add a new tty_port_unregister_device() helper and to never call tty_device_unregister() whenever the port has been claimed by serdev, but since this requires modifying just about every tty driver (and multiple subsystems) it will need to be done incrementally. Reverting the offending patch is the first step in fixing the broken lifetime assumptions. A follow-up patch will add a new pair of tty-device registration helpers, which a vetted tty driver can use to support serdev (initially serial core). When every tty driver uses the serdev helpers (at least for deregistration), we can add serdev registration to tty_port_register_device() again. Note that this also fixes another issue with serdev, which currently allocates and registers a serdev controller for every tty device registered using tty_port_device_register() only to immediately deregister and deallocate it when the corresponding OF node or serdev child node is missing. This should be addressed before enabling serdev for hot-pluggable buses. Signed-off-by: Johan Hovold Reviewed-by: Rob Herring Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_port.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 1d21a9c1d33e..0c880f17d27e 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -16,7 +16,6 @@ #include #include #include -#include static int tty_port_default_receive_buf(struct tty_port *port, const unsigned char *p, @@ -129,15 +128,7 @@ struct device *tty_port_register_device_attr(struct tty_port *port, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { - struct device *dev; - tty_port_link_device(port, driver, index); - - dev = serdev_tty_port_register(port, device, driver, index); - if (PTR_ERR(dev) != -ENODEV) - /* Skip creating cdev if we registered a serdev device */ - return dev; - return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } @@ -189,9 +180,6 @@ static void tty_port_destructor(struct kref *kref) /* check if last port ref was dropped before tty release */ if (WARN_ON(port->itty)) return; - - serdev_tty_port_unregister(port); - if (port->xmit_buf) free_page((unsigned long)port->xmit_buf); tty_port_destroy(port); From aee5da7838787f8ed47f825dbe09e2812acdf97b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 11 Apr 2017 19:07:29 +0200 Subject: [PATCH 059/254] serdev: fix tty-port client deregistration The port client data must be set when registering the serdev controller or client deregistration will fail (and the serdev devices are left registered and allocated) if the port was never opened in between. Make sure to clear the port client data on any probe errors to avoid a use-after-free when the client is later deregistered unconditionally (e.g. in a tty-port deregistration helper). Also move port client operation initialisation to registration. Note that the client ops must be restored on failed probe. Fixes: bed35c6dfa6a ("serdev: add a tty port controller driver") Signed-off-by: Johan Hovold Reviewed-by: Rob Herring Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serdev/serdev-ttyport.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c index 487c88f6aa0e..013efffd2e82 100644 --- a/drivers/tty/serdev/serdev-ttyport.c +++ b/drivers/tty/serdev/serdev-ttyport.c @@ -102,9 +102,6 @@ static int ttyport_open(struct serdev_controller *ctrl) return PTR_ERR(tty); serport->tty = tty; - serport->port->client_ops = &client_ops; - serport->port->client_data = ctrl; - if (tty->ops->open) tty->ops->open(serport->tty, NULL); else @@ -215,6 +212,7 @@ struct device *serdev_tty_port_register(struct tty_port *port, struct device *parent, struct tty_driver *drv, int idx) { + const struct tty_port_client_operations *old_ops; struct serdev_controller *ctrl; struct serport *serport; int ret; @@ -233,15 +231,22 @@ struct device *serdev_tty_port_register(struct tty_port *port, ctrl->ops = &ctrl_ops; + old_ops = port->client_ops; + port->client_ops = &client_ops; + port->client_data = ctrl; + ret = serdev_controller_add(ctrl); if (ret) - goto err_controller_put; + goto err_reset_data; dev_info(&ctrl->dev, "tty port %s%d registered\n", drv->name, idx); return &ctrl->dev; -err_controller_put: +err_reset_data: + port->client_data = NULL; + port->client_ops = old_ops; serdev_controller_put(ctrl); + return ERR_PTR(ret); } From be40597a1bc173bf9dadccdf5388b956f620ae8f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 12 May 2017 16:35:45 +0200 Subject: [PATCH 060/254] serial: efm32: Fix parity management in 'efm32_uart_console_get_options()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UARTn_FRAME_PARITY_ODD is 0x0300 UARTn_FRAME_PARITY_EVEN is 0x0200 So if the UART is configured for EVEN parity, it would be reported as ODD. Fix it by correctly testing if the 2 bits are set. Fixes: 3afbd89c9639 ("serial/efm32: add new driver") Signed-off-by: Christophe JAILLET Acked-by: Uwe Kleine-König Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/efm32-uart.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/tty/serial/efm32-uart.c b/drivers/tty/serial/efm32-uart.c index ebd8569f9ad5..9fff25be87f9 100644 --- a/drivers/tty/serial/efm32-uart.c +++ b/drivers/tty/serial/efm32-uart.c @@ -27,6 +27,7 @@ #define UARTn_FRAME 0x04 #define UARTn_FRAME_DATABITS__MASK 0x000f #define UARTn_FRAME_DATABITS(n) ((n) - 3) +#define UARTn_FRAME_PARITY__MASK 0x0300 #define UARTn_FRAME_PARITY_NONE 0x0000 #define UARTn_FRAME_PARITY_EVEN 0x0200 #define UARTn_FRAME_PARITY_ODD 0x0300 @@ -572,12 +573,16 @@ static void efm32_uart_console_get_options(struct efm32_uart_port *efm_port, 16 * (4 + (clkdiv >> 6))); frame = efm32_uart_read32(efm_port, UARTn_FRAME); - if (frame & UARTn_FRAME_PARITY_ODD) + switch (frame & UARTn_FRAME_PARITY__MASK) { + case UARTn_FRAME_PARITY_ODD: *parity = 'o'; - else if (frame & UARTn_FRAME_PARITY_EVEN) + break; + case UARTn_FRAME_PARITY_EVEN: *parity = 'e'; - else + break; + default: *parity = 'n'; + } *bits = (frame & UARTn_FRAME_DATABITS__MASK) - UARTn_FRAME_DATABITS(4) + 4; From 2c0ac5b48a3586f612b85755b041ed7733dc8e6b Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 24 Apr 2017 12:30:15 +0200 Subject: [PATCH 061/254] serial: exar: Fix stuck MSIs After migrating 8250_exar to MSI in 172c33cb61da, we can get stuck without further interrupts because of the special wake-up event these chips send. They are only cleared by reading INT0. As we fail to do so during startup and shutdown, we can leave the interrupt line asserted, which is fatal with edge-triggered MSIs. Add the required reading of INT0 to startup and shutdown. Also account for the fact that a pending wake-up interrupt means we have to return 1 from exar_handle_irq. Drop the unneeded reading of INT1..3 along with this - those never reset anything. An alternative approach would have been disabling the wake-up interrupt. Unfortunately, this feature (REGB[17] = 1) is not available on the XR17D15X. Fixes: 172c33cb61da ("serial: exar: Enable MSI support") Signed-off-by: Jan Kiszka Reviewed-by: Andy Shevchenko Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index e7765f010fe8..68fd045a7025 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -47,6 +47,7 @@ /* * These are definitions for the Exar XR17V35X and XR17(C|D)15X */ +#define UART_EXAR_INT0 0x80 #define UART_EXAR_SLEEP 0x8b /* Sleep mode */ #define UART_EXAR_DVID 0x8d /* Device identification */ @@ -1869,17 +1870,13 @@ static int serial8250_default_handle_irq(struct uart_port *port) static int exar_handle_irq(struct uart_port *port) { unsigned int iir = serial_port_in(port, UART_IIR); - int ret; + int ret = 0; - ret = serial8250_handle_irq(port, iir); + if (((port->type == PORT_XR17V35X) || (port->type == PORT_XR17D15X)) && + serial_port_in(port, UART_EXAR_INT0) != 0) + ret = 1; - if ((port->type == PORT_XR17V35X) || - (port->type == PORT_XR17D15X)) { - serial_port_in(port, 0x80); - serial_port_in(port, 0x81); - serial_port_in(port, 0x82); - serial_port_in(port, 0x83); - } + ret |= serial8250_handle_irq(port, iir); return ret; } @@ -2177,6 +2174,8 @@ int serial8250_do_startup(struct uart_port *port) serial_port_in(port, UART_RX); serial_port_in(port, UART_IIR); serial_port_in(port, UART_MSR); + if ((port->type == PORT_XR17V35X) || (port->type == PORT_XR17D15X)) + serial_port_in(port, UART_EXAR_INT0); /* * At this point, there's no way the LSR could still be 0xff; @@ -2335,6 +2334,8 @@ dont_test_tx_en: serial_port_in(port, UART_RX); serial_port_in(port, UART_IIR); serial_port_in(port, UART_MSR); + if ((port->type == PORT_XR17V35X) || (port->type == PORT_XR17D15X)) + serial_port_in(port, UART_EXAR_INT0); up->lsr_saved_flags = 0; up->msr_saved_flags = 0; From 5c9d6abed9e0a061de252a53ab687a1171502e81 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 13 May 2017 00:36:22 +0300 Subject: [PATCH 062/254] serial: altera_jtaguart: adding iounmap() The driver does ioremap(port->mapbase, ALTERA_JTAGUART_SIZE), but there is no any iounmap(). Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Acked-by: Tobias Klauser Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/altera_jtaguart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/altera_jtaguart.c b/drivers/tty/serial/altera_jtaguart.c index 18e3f8342b85..0475f5d261ce 100644 --- a/drivers/tty/serial/altera_jtaguart.c +++ b/drivers/tty/serial/altera_jtaguart.c @@ -478,6 +478,7 @@ static int altera_jtaguart_remove(struct platform_device *pdev) port = &altera_jtaguart_ports[i].port; uart_remove_one_port(&altera_jtaguart_driver, port); + iounmap(port->membase); return 0; } From 1e948479b3d63e3ac0ecca13cbf4921c7d17c168 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 26 Apr 2017 12:24:21 +0200 Subject: [PATCH 063/254] serial: ifx6x60: fix use-after-free on module unload Make sure to deregister the SPI driver before releasing the tty driver to avoid use-after-free in the SPI remove callback where the tty devices are deregistered. Fixes: 72d4724ea54c ("serial: ifx6x60: Add modem power off function in the platform reboot process") Cc: stable # 3.8 Cc: Jun Chen Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/ifx6x60.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c index 157883653256..f190a84a0246 100644 --- a/drivers/tty/serial/ifx6x60.c +++ b/drivers/tty/serial/ifx6x60.c @@ -1382,9 +1382,9 @@ static struct spi_driver ifx_spi_driver = { static void __exit ifx_spi_exit(void) { /* unregister */ + spi_unregister_driver(&ifx_spi_driver); tty_unregister_driver(tty_drv); put_tty_driver(tty_drv); - spi_unregister_driver(&ifx_spi_driver); unregister_reboot_notifier(&ifx_modem_reboot_notifier_block); } From 11d4d32158eeaf36fe1073a4a260193d7a19ccf1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 26 Apr 2017 12:30:04 +0200 Subject: [PATCH 064/254] tty: ehv_bytechan: clean up init error handling Straighten out the initcall error handling to avoid deregistering a never-registered tty driver (something which would lead to a NULL-pointer dereference) in the most unlikely event that driver registration fails (e.g. we've run out of major numbers). Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/tty/ehv_bytechan.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/tty/ehv_bytechan.c b/drivers/tty/ehv_bytechan.c index 7ac9bcdf1e61..61fe8d6fd24e 100644 --- a/drivers/tty/ehv_bytechan.c +++ b/drivers/tty/ehv_bytechan.c @@ -764,7 +764,7 @@ static int __init ehv_bc_init(void) ehv_bc_driver = alloc_tty_driver(count); if (!ehv_bc_driver) { ret = -ENOMEM; - goto error; + goto err_free_bcs; } ehv_bc_driver->driver_name = "ehv-bc"; @@ -778,24 +778,23 @@ static int __init ehv_bc_init(void) ret = tty_register_driver(ehv_bc_driver); if (ret) { pr_err("ehv-bc: could not register tty driver (ret=%i)\n", ret); - goto error; + goto err_put_tty_driver; } ret = platform_driver_register(&ehv_bc_tty_driver); if (ret) { pr_err("ehv-bc: could not register platform driver (ret=%i)\n", ret); - goto error; + goto err_deregister_tty_driver; } return 0; -error: - if (ehv_bc_driver) { - tty_unregister_driver(ehv_bc_driver); - put_tty_driver(ehv_bc_driver); - } - +err_deregister_tty_driver: + tty_unregister_driver(ehv_bc_driver); +err_put_tty_driver: + put_tty_driver(ehv_bc_driver); +err_free_bcs: kfree(bcs); return ret; From 925bb1ce47f429f69aad35876df7ecd8c53deb7e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 11 May 2017 12:18:52 +0200 Subject: [PATCH 065/254] tty: fix port buffer locking tty_insert_flip_string_fixed_flag() is racy against itself when called from the ioctl(TCXONC, TCION/TCIOFF) path [1] and the flush_to_ldisc() workqueue path [2]. The problem is that port->buf.tail->used is modified without consistent locking; the ioctl path takes tty->atomic_write_lock, whereas the workqueue path takes ldata->output_lock. We cannot simply take ldata->output_lock, since that is specific to the N_TTY line discipline. It might seem natural to try to take port->buf.lock inside tty_insert_flip_string_fixed_flag() and friends (where port->buf is actually used/modified), but this creates problems for flush_to_ldisc() which takes it before grabbing tty->ldisc_sem, o_tty->termios_rwsem, and ldata->output_lock. Therefore, the simplest solution for now seems to be to take tty->atomic_write_lock inside tty_port_default_receive_buf(). This lock is also used in the write path [3] with a consistent ordering. [1]: Call Trace: tty_insert_flip_string_fixed_flag pty_write tty_send_xchar // down_read(&o_tty->termios_rwsem) // mutex_lock(&tty->atomic_write_lock) n_tty_ioctl_helper n_tty_ioctl tty_ioctl // down_read(&tty->ldisc_sem) do_vfs_ioctl SyS_ioctl [2]: Workqueue: events_unbound flush_to_ldisc Call Trace: tty_insert_flip_string_fixed_flag pty_write tty_put_char __process_echoes commit_echoes // mutex_lock(&ldata->output_lock) n_tty_receive_buf_common n_tty_receive_buf2 tty_ldisc_receive_buf // down_read(&o_tty->termios_rwsem) tty_port_default_receive_buf // down_read(&tty->ldisc_sem) flush_to_ldisc // mutex_lock(&port->buf.lock) process_one_work [3]: Call Trace: tty_insert_flip_string_fixed_flag pty_write n_tty_write // mutex_lock(&ldata->output_lock) // down_read(&tty->termios_rwsem) do_tty_write (inline) // mutex_lock(&tty->atomic_write_lock) tty_write // down_read(&tty->ldisc_sem) __vfs_write vfs_write SyS_write The bug can result in about a dozen different crashes depending on what exactly gets corrupted when port->buf.tail->used points outside the buffer. The patch passes my LOCKDEP/PROVE_LOCKING testing but more testing is always welcome. Found using syzkaller. Cc: Signed-off-by: Vegard Nossum Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_port.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 0c880f17d27e..88dac3b79369 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -33,7 +33,9 @@ static int tty_port_default_receive_buf(struct tty_port *port, if (!disc) return 0; + mutex_lock(&tty->atomic_write_lock); ret = tty_ldisc_receive_buf(disc, p, (char *)f, count); + mutex_unlock(&tty->atomic_write_lock); tty_ldisc_deref(disc); From 88e2582e90bb89fe895ff0dceeb5d5ab65d07997 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Thu, 11 May 2017 12:56:14 +0200 Subject: [PATCH 066/254] serial: core: fix crash in uart_suspend_port With serdev we might end up with serial ports that have no cdev exported to userspace, as they are used as the bus interface to other devices. In that case serial_match_port() won't be able to find a matching tty_dev. Skip the irq wakeup enabling in that case, as serdev will make sure to keep the port active, as long as there are devices depending on it. Fixes: 8ee3fde04758 (tty_port: register tty ports with serdev bus) Signed-off-by: Lucas Stach Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 0f45b7884a2c..bc6caea6099f 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -2083,7 +2083,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport) mutex_lock(&port->mutex); tty_dev = device_find_child(uport->dev, &match, serial_match_port); - if (device_may_wakeup(tty_dev)) { + if (tty_dev && device_may_wakeup(tty_dev)) { if (!enable_irq_wake(uport->irq)) uport->irq_wake = 1; put_device(tty_dev); From 6bdc00d01e202ae11fa1cae0dacbef895434483d Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 28 Apr 2017 13:47:21 +0200 Subject: [PATCH 067/254] serdev: Restore serdev_device_write_buf for atomic context Starting with commit 6fe729c4bdae ("serdev: Add serdev_device_write subroutine") the function serdev_device_write_buf cannot be used in atomic context anymore (mutex_lock is sleeping). So restore the old behavior. Signed-off-by: Stefan Wahren Fixes: 6fe729c4bdae ("serdev: Add serdev_device_write subroutine") Acked-by: Rob Herring Reviewed-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serdev/core.c | 12 ++++++++++++ include/linux/serdev.h | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c index 433de5ea9b02..f71b47334149 100644 --- a/drivers/tty/serdev/core.c +++ b/drivers/tty/serdev/core.c @@ -122,6 +122,18 @@ void serdev_device_write_wakeup(struct serdev_device *serdev) } EXPORT_SYMBOL_GPL(serdev_device_write_wakeup); +int serdev_device_write_buf(struct serdev_device *serdev, + const unsigned char *buf, size_t count) +{ + struct serdev_controller *ctrl = serdev->ctrl; + + if (!ctrl || !ctrl->ops->write_buf) + return -EINVAL; + + return ctrl->ops->write_buf(ctrl, buf, count); +} +EXPORT_SYMBOL_GPL(serdev_device_write_buf); + int serdev_device_write(struct serdev_device *serdev, const unsigned char *buf, size_t count, unsigned long timeout) diff --git a/include/linux/serdev.h b/include/linux/serdev.h index cda76c6506ca..e2a225bf716d 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -195,6 +195,7 @@ int serdev_device_open(struct serdev_device *); void serdev_device_close(struct serdev_device *); unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int); void serdev_device_set_flow_control(struct serdev_device *, bool); +int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t); void serdev_device_wait_until_sent(struct serdev_device *, long); int serdev_device_get_tiocm(struct serdev_device *); int serdev_device_set_tiocm(struct serdev_device *, int, int); @@ -236,6 +237,12 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev return 0; } static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {} +static inline int serdev_device_write_buf(struct serdev_device *serdev, + const unsigned char *buf, + size_t count) +{ + return -ENODEV; +} static inline void serdev_device_wait_until_sent(struct serdev_device *sdev, long timeout) {} static inline int serdev_device_get_tiocm(struct serdev_device *serdev) { @@ -312,11 +319,4 @@ static inline struct device *serdev_tty_port_register(struct tty_port *port, static inline void serdev_tty_port_unregister(struct tty_port *port) {} #endif /* CONFIG_SERIAL_DEV_CTRL_TTYPORT */ -static inline int serdev_device_write_buf(struct serdev_device *serdev, - const unsigned char *data, - size_t count) -{ - return serdev_device_write(serdev, data, count, 0); -} - #endif /*_LINUX_SERDEV_H */ From 8cde11b2baa1d02eb2eb955dfd47d9f2a12f12cf Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 18 May 2017 17:33:00 +0200 Subject: [PATCH 068/254] tty/serdev: add serdev registration interface Add a new interface for registering a serdev controller and clients, and a helper function to deregister serdev devices (or a tty device) that were previously registered using the new interface. Once every driver currently using the tty_port_register_device() helpers have been vetted and converted to use the new serdev registration interface (at least for deregistration), we can move serdev registration to the current helpers and get rid of the serdev-specific functions. Reviewed-by: Rob Herring Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serdev/serdev-ttyport.c | 6 ++- drivers/tty/tty_port.c | 75 +++++++++++++++++++++++++++++ include/linux/serdev.h | 7 ++- include/linux/tty.h | 9 ++++ 4 files changed, 93 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c index 013efffd2e82..d0a021c93986 100644 --- a/drivers/tty/serdev/serdev-ttyport.c +++ b/drivers/tty/serdev/serdev-ttyport.c @@ -250,16 +250,18 @@ err_reset_data: return ERR_PTR(ret); } -void serdev_tty_port_unregister(struct tty_port *port) +int serdev_tty_port_unregister(struct tty_port *port) { struct serdev_controller *ctrl = port->client_data; struct serport *serport = serdev_controller_get_drvdata(ctrl); if (!serport) - return; + return -ENODEV; serdev_controller_remove(ctrl); port->client_ops = NULL; port->client_data = NULL; serdev_controller_put(ctrl); + + return 0; } diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 88dac3b79369..4fb3165384c4 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -16,6 +16,7 @@ #include #include #include +#include static int tty_port_default_receive_buf(struct tty_port *port, const unsigned char *p, @@ -136,6 +137,80 @@ struct device *tty_port_register_device_attr(struct tty_port *port, } EXPORT_SYMBOL_GPL(tty_port_register_device_attr); +/** + * tty_port_register_device_attr_serdev - register tty or serdev device + * @port: tty_port of the device + * @driver: tty_driver for this device + * @index: index of the tty + * @device: parent if exists, otherwise NULL + * @drvdata: driver data for the device + * @attr_grp: attribute group for the device + * + * Register a serdev or tty device depending on if the parent device has any + * defined serdev clients or not. + */ +struct device *tty_port_register_device_attr_serdev(struct tty_port *port, + struct tty_driver *driver, unsigned index, + struct device *device, void *drvdata, + const struct attribute_group **attr_grp) +{ + struct device *dev; + + tty_port_link_device(port, driver, index); + + dev = serdev_tty_port_register(port, device, driver, index); + if (PTR_ERR(dev) != -ENODEV) { + /* Skip creating cdev if we registered a serdev device */ + return dev; + } + + return tty_register_device_attr(driver, index, device, drvdata, + attr_grp); +} +EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev); + +/** + * tty_port_register_device_serdev - register tty or serdev device + * @port: tty_port of the device + * @driver: tty_driver for this device + * @index: index of the tty + * @device: parent if exists, otherwise NULL + * + * Register a serdev or tty device depending on if the parent device has any + * defined serdev clients or not. + */ +struct device *tty_port_register_device_serdev(struct tty_port *port, + struct tty_driver *driver, unsigned index, + struct device *device) +{ + return tty_port_register_device_attr_serdev(port, driver, index, + device, NULL, NULL); +} +EXPORT_SYMBOL_GPL(tty_port_register_device_serdev); + +/** + * tty_port_unregister_device - deregister a tty or serdev device + * @port: tty_port of the device + * @driver: tty_driver for this device + * @index: index of the tty + * + * If a tty or serdev device is registered with a call to + * tty_port_register_device_serdev() then this function must be called when + * the device is gone. + */ +void tty_port_unregister_device(struct tty_port *port, + struct tty_driver *driver, unsigned index) +{ + int ret; + + ret = serdev_tty_port_unregister(port); + if (ret == 0) + return; + + tty_unregister_device(driver, index); +} +EXPORT_SYMBOL_GPL(tty_port_unregister_device); + int tty_port_alloc_xmit_buf(struct tty_port *port) { /* We may sleep in get_zeroed_page() */ diff --git a/include/linux/serdev.h b/include/linux/serdev.h index e2a225bf716d..e69402d4a8ae 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -308,7 +308,7 @@ struct tty_driver; struct device *serdev_tty_port_register(struct tty_port *port, struct device *parent, struct tty_driver *drv, int idx); -void serdev_tty_port_unregister(struct tty_port *port); +int serdev_tty_port_unregister(struct tty_port *port); #else static inline struct device *serdev_tty_port_register(struct tty_port *port, struct device *parent, @@ -316,7 +316,10 @@ static inline struct device *serdev_tty_port_register(struct tty_port *port, { return ERR_PTR(-ENODEV); } -static inline void serdev_tty_port_unregister(struct tty_port *port) {} +static inline int serdev_tty_port_unregister(struct tty_port *port) +{ + return -ENODEV; +} #endif /* CONFIG_SERIAL_DEV_CTRL_TTYPORT */ #endif /*_LINUX_SERDEV_H */ diff --git a/include/linux/tty.h b/include/linux/tty.h index d07cd2105a6c..eccb4ec30a8a 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -558,6 +558,15 @@ extern struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); +extern struct device *tty_port_register_device_serdev(struct tty_port *port, + struct tty_driver *driver, unsigned index, + struct device *device); +extern struct device *tty_port_register_device_attr_serdev(struct tty_port *port, + struct tty_driver *driver, unsigned index, + struct device *device, void *drvdata, + const struct attribute_group **attr_grp); +extern void tty_port_unregister_device(struct tty_port *port, + struct tty_driver *driver, unsigned index); extern int tty_port_alloc_xmit_buf(struct tty_port *port); extern void tty_port_free_xmit_buf(struct tty_port *port); extern void tty_port_destroy(struct tty_port *port); From da4c279942b05727088774df224c0734688b4cbc Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 18 May 2017 17:33:01 +0200 Subject: [PATCH 069/254] serial: enable serdev support Enable serdev support by using the new device-registration helpers. Reviewed-by: Rob Herring Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index bc6caea6099f..13bfd5dcffce 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -2782,7 +2782,7 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport) * Register the port whether it's detected or not. This allows * setserial to be used to alter this port's parameters. */ - tty_dev = tty_port_register_device_attr(port, drv->tty_driver, + tty_dev = tty_port_register_device_attr_serdev(port, drv->tty_driver, uport->line, uport->dev, port, uport->tty_groups); if (likely(!IS_ERR(tty_dev))) { device_set_wakeup_capable(tty_dev, 1); @@ -2845,7 +2845,7 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *uport) /* * Remove the devices from the tty layer */ - tty_unregister_device(drv->tty_driver, uport->line); + tty_port_unregister_device(port, drv->tty_driver, uport->line); tty = tty_port_tty_get(port); if (tty) { From 463f620b1256e0488d932088e04a372817e8c42e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Potomski?= Date: Fri, 12 May 2017 08:36:27 +0200 Subject: [PATCH 070/254] scsi: ufs: Clean up some rpm/spm level SysFS nodes upon remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When reloading module these two attributes aren't cleaned up properly and they persist causing warnings when trying to load module again. Additionally they are not recreated properly due to that. Signed-off-by: Michał Potomski Reviewed-by: Subhash Jadavani Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index abc7e87937cc..ffe8d8608818 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7698,6 +7698,12 @@ static inline void ufshcd_add_sysfs_nodes(struct ufs_hba *hba) ufshcd_add_spm_lvl_sysfs_nodes(hba); } +static inline void ufshcd_remove_sysfs_nodes(struct ufs_hba *hba) +{ + device_remove_file(hba->dev, &hba->rpm_lvl_attr); + device_remove_file(hba->dev, &hba->spm_lvl_attr); +} + /** * ufshcd_shutdown - shutdown routine * @hba: per adapter instance @@ -7735,6 +7741,7 @@ EXPORT_SYMBOL(ufshcd_shutdown); */ void ufshcd_remove(struct ufs_hba *hba) { + ufshcd_remove_sysfs_nodes(hba); scsi_remove_host(hba->host); /* disable interrupts */ ufshcd_disable_intr(hba, hba->intr_mask); From a351e40b6de550049423a26f7ded7b639e363d89 Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Wed, 17 May 2017 20:30:43 +0530 Subject: [PATCH 071/254] scsi: csiostor: fix use after free in csio_hw_use_fwconfig() mbp pointer is passed to csio_hw_validate_caps() so call mempool_free() after calling csio_hw_validate_caps(). Signed-off-by: Varun Prakash Fixes: 541c571fa2fd ("csiostor:Use firmware version from cxgb4/t4fw_version.h") Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/csiostor/csio_hw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/csiostor/csio_hw.c b/drivers/scsi/csiostor/csio_hw.c index 622bdabc8894..dab195f04da7 100644 --- a/drivers/scsi/csiostor/csio_hw.c +++ b/drivers/scsi/csiostor/csio_hw.c @@ -1769,7 +1769,6 @@ csio_hw_use_fwconfig(struct csio_hw *hw, int reset, u32 *fw_cfg_param) goto bye; } - mempool_free(mbp, hw->mb_mempool); if (finicsum != cfcsum) { csio_warn(hw, "Config File checksum mismatch: csum=%#x, computed=%#x\n", @@ -1780,6 +1779,10 @@ csio_hw_use_fwconfig(struct csio_hw *hw, int reset, u32 *fw_cfg_param) rv = csio_hw_validate_caps(hw, mbp); if (rv != 0) goto bye; + + mempool_free(mbp, hw->mb_mempool); + mbp = NULL; + /* * Note that we're operating with parameters * not supplied by the driver, rather than from hard-wired From 1bad6c4a57efda0d5f5bf8a2403b21b1ed24875c Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 18 May 2017 15:40:05 -0700 Subject: [PATCH 072/254] scsi: zero per-cmd private driver data for each MQ I/O In lower layer driver's (LLD) scsi_host_template, the driver may optionally ask SCSI to allocate its private driver memory for each command, by specifying cmd_size. This memory is allocated at the end of scsi_cmnd by SCSI. Later when SCSI queues a command, the LLD can use scsi_cmd_priv to get to its private data. Some LLD, e.g. hv_storvsc, doesn't clear its private data before use. In this case, the LLD may get to stale or uninitialized data in its private driver memory. This may result in unexpected driver and hardware behavior. Fix this problem by also zeroing the private driver memory before passing them to LLD. Signed-off-by: Long Li Reviewed-by: Bart Van Assche Reviewed-by: KY Srinivasan Reviewed-by: Christoph Hellwig CC: # 4.11+ Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e31f1cc90b81..99e16ac479e3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1851,7 +1851,7 @@ static int scsi_mq_prep_fn(struct request *req) /* zero out the cmd, except for the embedded scsi_request */ memset((char *)cmd + sizeof(cmd->req), 0, - sizeof(*cmd) - sizeof(cmd->req)); + sizeof(*cmd) - sizeof(cmd->req) + shost->hostt->cmd_size); req->special = cmd; From bae3dee0992dcb336a591468376b046e5447997b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 16 May 2017 14:17:20 +0800 Subject: [PATCH 073/254] mmc: sdhci-xenon: kill xenon_clean_phy() Currently, the xenon_clean_phy() is only used for freeing phy_params. The phy_params is allocated by devm_kzalloc(), there's no need to free is explicitly. Signed-off-by: Jisheng Zhang Acked-by: Hu Ziji Acked-by: Adrian Hunter --- drivers/mmc/host/sdhci-xenon-phy.c | 14 +------------- drivers/mmc/host/sdhci-xenon.c | 6 +----- drivers/mmc/host/sdhci-xenon.h | 1 - 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/drivers/mmc/host/sdhci-xenon-phy.c b/drivers/mmc/host/sdhci-xenon-phy.c index 6356781f1cca..f7e26b031e76 100644 --- a/drivers/mmc/host/sdhci-xenon-phy.c +++ b/drivers/mmc/host/sdhci-xenon-phy.c @@ -787,14 +787,6 @@ int xenon_phy_adj(struct sdhci_host *host, struct mmc_ios *ios) return ret; } -void xenon_clean_phy(struct sdhci_host *host) -{ - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct xenon_priv *priv = sdhci_pltfm_priv(pltfm_host); - - kfree(priv->phy_params); -} - static int xenon_add_phy(struct device_node *np, struct sdhci_host *host, const char *phy_name) { @@ -819,11 +811,7 @@ static int xenon_add_phy(struct device_node *np, struct sdhci_host *host, if (ret) return ret; - ret = xenon_emmc_phy_parse_param_dt(host, np, priv->phy_params); - if (ret) - xenon_clean_phy(host); - - return ret; + return xenon_emmc_phy_parse_param_dt(host, np, priv->phy_params); } int xenon_phy_parse_dt(struct device_node *np, struct sdhci_host *host) diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index 67246655315b..bc1781bb070b 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -486,7 +486,7 @@ static int xenon_probe(struct platform_device *pdev) err = xenon_sdhc_prepare(host); if (err) - goto clean_phy_param; + goto err_clk; err = sdhci_add_host(host); if (err) @@ -496,8 +496,6 @@ static int xenon_probe(struct platform_device *pdev) remove_sdhc: xenon_sdhc_unprepare(host); -clean_phy_param: - xenon_clean_phy(host); err_clk: clk_disable_unprepare(pltfm_host->clk); free_pltfm: @@ -510,8 +508,6 @@ static int xenon_remove(struct platform_device *pdev) struct sdhci_host *host = platform_get_drvdata(pdev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - xenon_clean_phy(host); - sdhci_remove_host(host, 0); xenon_sdhc_unprepare(host); diff --git a/drivers/mmc/host/sdhci-xenon.h b/drivers/mmc/host/sdhci-xenon.h index 6e6523ea01ce..73debb42dc2f 100644 --- a/drivers/mmc/host/sdhci-xenon.h +++ b/drivers/mmc/host/sdhci-xenon.h @@ -93,7 +93,6 @@ struct xenon_priv { }; int xenon_phy_adj(struct sdhci_host *host, struct mmc_ios *ios); -void xenon_clean_phy(struct sdhci_host *host); int xenon_phy_parse_dt(struct device_node *np, struct sdhci_host *host); void xenon_soc_pad_ctrl(struct sdhci_host *host, From aca69344c8a99e7374d913e42ba9120c398ee16f Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 16 May 2017 11:36:51 +0200 Subject: [PATCH 074/254] mmc: cavium-octeon: Fix interrupt enable code OCTEON SoCs with CIU3 do not have interrupt masking local to the MMC bus interface. Unfortunately, some even have a diagnostic register at the same address of the enable register, which causes the interrupts to fire immediately if stored to, thus breaking the driver. The proper action on these SoCs is not to touch this register. Fixes: 01d95843335c ("mmc: cavium: Add MMC support for Octeon SOCs.") Signed-off-by: David Daney [jglauber@cavium.com: removed point after subject line] Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium-octeon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium-octeon.c b/drivers/mmc/host/cavium-octeon.c index 772d0900026d..d698d66e3327 100644 --- a/drivers/mmc/host/cavium-octeon.c +++ b/drivers/mmc/host/cavium-octeon.c @@ -108,7 +108,7 @@ static void octeon_mmc_release_bus(struct cvm_mmc_host *host) static void octeon_mmc_int_enable(struct cvm_mmc_host *host, u64 val) { writeq(val, host->base + MIO_EMM_INT(host)); - if (!host->dma_active || (host->dma_active && !host->has_ciu3)) + if (!host->has_ciu3) writeq(val, host->base + MIO_EMM_INT_EN(host)); } From 899e4aad15e93315fa18ab9e9c88904ad237cfa0 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 16 May 2017 11:36:52 +0200 Subject: [PATCH 075/254] mmc: cavium-octeon: Use proper GPIO name for power control The devm_gpiod_get_optional() function appends a "-gpios" to the string passed to it, so if we want to find the "power-gpios" signal, we must pass "power" to this function. Fixes: 01d95843335c ("mmc: cavium: Add MMC support for Octeon SOCs.") Signed-off-by: David Daney [jglauber@cavium.com: removed point after subject line] Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium-octeon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium-octeon.c b/drivers/mmc/host/cavium-octeon.c index d698d66e3327..cbb566377508 100644 --- a/drivers/mmc/host/cavium-octeon.c +++ b/drivers/mmc/host/cavium-octeon.c @@ -267,7 +267,7 @@ static int octeon_mmc_probe(struct platform_device *pdev) } host->global_pwr_gpiod = devm_gpiod_get_optional(&pdev->dev, - "power-gpios", + "power", GPIOD_OUT_HIGH); if (IS_ERR(host->global_pwr_gpiod)) { dev_err(&pdev->dev, "Invalid power GPIO\n"); From fe06fe860250a4f01d0eaf70a2563b1997174a74 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 19 May 2017 11:29:04 +1000 Subject: [PATCH 076/254] selftests/powerpc: Fix TM resched DSCR test with some compilers The tm-resched-dscr test has started failing sometimes, depending on what compiler it's built with, eg: test: tm_resched_dscr Check DSCR TM context switch: tm-resched-dscr: tm-resched-dscr.c:76: test_body: Assertion `rv' failed. !! child died by signal 6 When it fails we see that the compiler doesn't initialise rv to 1 before entering the inline asm block. Although that's counter intuitive, it is allowed because we tell the compiler that the inline asm will write to rv (using "=r"), meaning the original value is irrelevant. Marking it as a read/write parameter would presumably work, but it seems simpler to fix it by setting the initial value of rv in the inline asm. Fixes: 96d016108640 ("powerpc: Correct DSCR during TM context switch") Signed-off-by: Michael Ellerman Acked-by: Michael Neuling --- tools/testing/selftests/powerpc/tm/tm-resched-dscr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c index d9c49f41515e..e79ccd6aada1 100644 --- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c +++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c @@ -42,12 +42,12 @@ int test_body(void) printf("Check DSCR TM context switch: "); fflush(stdout); for (;;) { - rv = 1; asm __volatile__ ( /* set a known value into the DSCR */ "ld 3, %[dscr1];" "mtspr %[sprn_dscr], 3;" + "li %[rv], 1;" /* start and suspend a transaction */ "tbegin.;" "beq 1f;" From a486cd23661c9387fb076c3f6ae8b2aa9d20d54a Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 19 May 2017 12:47:00 +0200 Subject: [PATCH 077/254] xfrm: fix state migration copy replay sequence numbers During xfrm migration copy replay and preplay sequence numbers from the previous state. Here is a tcpdump output showing the problem. 10.0.10.46 is running vanilla kernel, is the IKE/IPsec responder. After the migration it sent wrong sequence number, reset to 1. The migration is from 10.0.0.52 to 10.0.0.53. IP 10.0.0.52.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7cf), length 136 IP 10.0.10.46.4500 > 10.0.0.52.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x7cf), length 136 IP 10.0.0.52.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d0), length 136 IP 10.0.10.46.4500 > 10.0.0.52.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x7d0), length 136 IP 10.0.0.53.4500 > 10.0.10.46.4500: NONESP-encap: isakmp: child_sa inf2[I] IP 10.0.10.46.4500 > 10.0.0.53.4500: NONESP-encap: isakmp: child_sa inf2[R] IP 10.0.0.53.4500 > 10.0.10.46.4500: NONESP-encap: isakmp: child_sa inf2[I] IP 10.0.10.46.4500 > 10.0.0.53.4500: NONESP-encap: isakmp: child_sa inf2[R] IP 10.0.0.53.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d1), length 136 NOTE: next sequence is wrong 0x1 IP 10.0.10.46.4500 > 10.0.0.53.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x1), length 136 IP 10.0.0.53.4500 > 10.0.10.46.4500: UDP-encap: ESP(spi=0x43ef462d,seq=0x7d2), length 136 IP 10.0.10.46.4500 > 10.0.0.53.4500: UDP-encap: ESP(spi=0xca1c282d,seq=0x2), length 136 Signed-off-by: Antony Antony Reviewed-by: Richard Guy Briggs Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index fc3c5aa38754..2e291bc5f1fc 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1383,6 +1383,8 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig) x->curlft.add_time = orig->curlft.add_time; x->km.state = orig->km.state; x->km.seq = orig->km.seq; + x->replay = orig->replay; + x->preplay = orig->preplay; return x; From 3ecb3ac7b950ff8f6c6a61e8b7b0d6e3546429a0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 May 2017 19:16:15 -0700 Subject: [PATCH 078/254] xfs: avoid mount-time deadlock in CoW extent recovery If a malicious user corrupts the refcount btree to cause a cycle between different levels of the tree, the next mount attempt will deadlock in the CoW recovery routine while grabbing buffer locks. We can use the ability to re-grab a buffer that was previous locked to a transaction to avoid deadlocks, so do that here. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_refcount.c | 43 ++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b177ef33cd4c..82a38d86ebad 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1629,13 +1629,28 @@ xfs_refcount_recover_cow_leftovers( if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) return -EOPNOTSUPP; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + INIT_LIST_HEAD(&debris); + + /* + * In this first part, we use an empty transaction to gather up + * all the leftover CoW extents so that we can subsequently + * delete them. The empty transaction is used to avoid + * a buffer lock deadlock if there happens to be a loop in the + * refcountbt because we're allowed to re-grab a buffer that is + * already attached to our transaction. When we're done + * recording the CoW debris we cancel the (empty) transaction + * and everything goes away cleanly. + */ + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ - INIT_LIST_HEAD(&debris); memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); low.rc.rc_startblock = XFS_REFC_COW_START; @@ -1645,10 +1660,11 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_cursor; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); + xfs_trans_cancel(tp); /* Now iterate the list to free the leftovers */ - list_for_each_entry(rr, &debris, rr_list) { + list_for_each_entry_safe(rr, n, &debris, rr_list) { /* Set up transaction. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) @@ -1676,8 +1692,16 @@ xfs_refcount_recover_cow_leftovers( error = xfs_trans_commit(tp); if (error) goto out_free; + + list_del(&rr->rr_list); + kmem_free(rr); } + return error; +out_defer: + xfs_defer_cancel(&dfops); +out_trans: + xfs_trans_cancel(tp); out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { @@ -1688,11 +1712,6 @@ out_free: out_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_buf_relse(agbp); - goto out_free; - -out_defer: - xfs_defer_cancel(&dfops); - xfs_trans_cancel(tp); - goto out_free; + xfs_trans_brelse(tp, agbp); + goto out_trans; } From 3780578761921f094179c6289072a74b2228c602 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Sat, 20 May 2017 15:03:29 -0500 Subject: [PATCH 079/254] x86/boot: Use CROSS_COMPILE prefix for readelf The boot code Makefile contains a straight 'readelf' invocation. This causes build warnings in cross compile environments, when there is no unprefixed readelf accessible via $PATH. Add the missing $(CROSS_COMPILE) prefix. [ tglx: Rewrote changelog ] Fixes: 98f78525371b ("x86/boot: Refuse to build with data relocations") Signed-off-by: Rob Landley Acked-by: Kees Cook Cc: Jiri Kosina Cc: Paul Bolle Cc: "H.J. Lu" Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/ced18878-693a-9576-a024-113ef39a22c0@landley.net Signed-off-by: Thomas Gleixner --- arch/x86/boot/compressed/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 44163e8c3868..2c860ad4fe06 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -94,7 +94,7 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o quiet_cmd_check_data_rel = DATAREL $@ define cmd_check_data_rel for obj in $(filter %.o,$^); do \ - readelf -S $$obj | grep -qF .rel.local && { \ + ${CROSS_COMPILE}readelf -S $$obj | grep -qF .rel.local && { \ echo "error: $$obj has data relocations!" >&2; \ exit 1; \ } || true; \ From 499350a5a6e7512d9ed369ed63a4244b6536f4f8 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 18 May 2017 11:22:33 -0700 Subject: [PATCH 080/254] tcp: initialize rcv_mss to TCP_MIN_MSS instead of 0 When tcp_disconnect() is called, inet_csk_delack_init() sets icsk->icsk_ack.rcv_mss to 0. This could potentially cause tcp_recvmsg() => tcp_cleanup_rbuf() => __tcp_select_window() call path to have division by 0 issue. So this patch initializes rcv_mss to TCP_MIN_MSS instead of 0. Reported-by: Andrey Konovalov Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1e4c76d2b827..842b575f8fdd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2320,6 +2320,10 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window() + */ + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); From 34eb5fe07831458cf8238d54c1fc847dedeaf68c Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 18 May 2017 12:41:18 -0700 Subject: [PATCH 081/254] arp: fixed error in a comment the is_garp code deals just with gratuitous ARP packets, not every unsolicited packet. This patch is a result of a discussion in netdev: http://marc.info/?l=linux-netdev&m=149506354216994 Suggested-by: Julian Anastasov Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d54345a06f72..053492af8a6e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -846,7 +846,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ is_garp = tip == sip && addr_type == RTN_UNICAST; - /* Unsolicited ARP _replies_ also require target hwaddr to be + /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && arp->ar_op == htons(ARPOP_REPLY)) From 6fd05633bdafc0ae6ec0d55e61af10780d4d3530 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 18 May 2017 12:41:19 -0700 Subject: [PATCH 082/254] arp: decompose is_garp logic into a separate function The code is quite involving already to earn a separate function for itself. If anything, it helps arp_process readability. Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 053492af8a6e..ca6e1e6c1496 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -641,6 +641,27 @@ void arp_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(arp_xmit); +static bool arp_is_garp(struct net_device *dev, int addr_type, + __be16 ar_op, + __be32 sip, __be32 tip, + unsigned char *sha, unsigned char *tha) +{ + bool is_garp = tip == sip && addr_type == RTN_UNICAST; + + /* Gratuitous ARP _replies_ also require target hwaddr to be + * the same as source. + */ + if (is_garp && ar_op == htons(ARPOP_REPLY)) + is_garp = + /* IPv4 over IEEE 1394 doesn't provide target + * hardware address field in its ARP payload. + */ + tha && + !memcmp(tha, sha, dev->addr_len); + + return is_garp; +} + /* * Process an arp request. */ @@ -844,18 +865,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = tip == sip && addr_type == RTN_UNICAST; - - /* Gratuitous ARP _replies_ also require target hwaddr to be - * the same as source. - */ - if (is_garp && arp->ar_op == htons(ARPOP_REPLY)) - is_garp = - /* IPv4 over IEEE 1394 doesn't provide target - * hardware address field in its ARP payload. - */ - tha && - !memcmp(tha, sha, dev->addr_len); + is_garp = arp_is_garp(dev, addr_type, arp->ar_op, + sip, tip, sha, tha); if (!n && ((arp->ar_op == htons(ARPOP_REPLY) && From d9ef2e7bf99f59179b89d5c1c4d5b4919375daee Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 18 May 2017 12:41:20 -0700 Subject: [PATCH 083/254] arp: postpone addr_type calculation to as late as possible The addr_type retrieval can be costly, so it's worth trying to avoid its calculation as much as possible. This patch makes it calculated only for gratuitous ARP packets. This is especially important since later we may want to move is_garp calculation outside of arp_accept block, at which point the costly operation will be executed for all setups. The patch is the result of a discussion in net-dev: http://marc.info/?l=linux-netdev&m=149506354216994 Suggested-by: Julian Anastasov Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ca6e1e6c1496..c22103cec823 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -641,12 +641,12 @@ void arp_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(arp_xmit); -static bool arp_is_garp(struct net_device *dev, int addr_type, - __be16 ar_op, +static bool arp_is_garp(struct net *net, struct net_device *dev, + int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { - bool is_garp = tip == sip && addr_type == RTN_UNICAST; + bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. @@ -659,6 +659,11 @@ static bool arp_is_garp(struct net_device *dev, int addr_type, tha && !memcmp(tha, sha, dev->addr_len); + if (is_garp) { + *addr_type = inet_addr_type_dev_table(net, dev, sip); + if (*addr_type != RTN_UNICAST) + is_garp = false; + } return is_garp; } @@ -859,18 +864,23 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); if (IN_DEV_ARP_ACCEPT(in_dev)) { - unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip); + addr_type = -1; /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = arp_is_garp(dev, addr_type, arp->ar_op, + is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); if (!n && - ((arp->ar_op == htons(ARPOP_REPLY) && - addr_type == RTN_UNICAST) || is_garp)) + (is_garp || + (arp->ar_op == htons(ARPOP_REPLY) && + (addr_type == RTN_UNICAST || + (addr_type < 0 && + /* postpone calculation to as late as possible */ + inet_addr_type_dev_table(net, dev, sip) == + RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } From 7d472a59c0e5ec117220a05de6b370447fb6cb66 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 18 May 2017 12:41:21 -0700 Subject: [PATCH 084/254] arp: always override existing neigh entries with gratuitous ARP Currently, when arp_accept is 1, we always override existing neigh entries with incoming gratuitous ARP replies. Otherwise, we override them only if new replies satisfy _locktime_ conditional (packets arrive not earlier than _locktime_ seconds since the last update to the neigh entry). The idea behind locktime is to pick the very first (=> close) reply received in a unicast burst when ARP proxies are used. This helps to avoid ARP thrashing where Linux would switch back and forth from one proxy to another. This logic has nothing to do with gratuitous ARP replies that are generally not aligned in time when multiple IP address carriers send them into network. This patch enforces overriding of existing neigh entries by all incoming gratuitous ARP packets, irrespective of their time of arrival. This will make the kernel honour all incoming gratuitous ARP packets. Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c22103cec823..ae96e6f3e0cb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -863,16 +863,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); - if (IN_DEV_ARP_ACCEPT(in_dev)) { + if (n || IN_DEV_ARP_ACCEPT(in_dev)) { addr_type = -1; + is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, + sip, tip, sha, tha); + } + if (IN_DEV_ARP_ACCEPT(in_dev)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, - sip, tip, sha, tha); - if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && From fe0cd8ca1b82983db24b173bb8518ea646c02d25 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Fri, 19 May 2017 14:00:25 +0000 Subject: [PATCH 085/254] smsc95xx: Support only IPv4 TCP/UDP csum offload When TX checksum offload is used, if the computed checksum is 0 the LAN95xx device do not alter the checksum to 0xffff. In the case of ipv4 UDP checksum, it indicates to receiver that no checksum is calculated. Under ipv6, UDP checksum yields a result of zero must be changed to 0xffff. Hence disabling checksum offload for ipv6 packets. Signed-off-by: Nisar Sayed Reported-by: popcorn mix Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 765400b62168..2dfca96a63b6 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -681,7 +681,7 @@ static int smsc95xx_set_features(struct net_device *netdev, if (ret < 0) return ret; - if (features & NETIF_F_HW_CSUM) + if (features & NETIF_F_IP_CSUM) read_buf |= Tx_COE_EN_; else read_buf &= ~Tx_COE_EN_; @@ -1279,12 +1279,19 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf) spin_lock_init(&pdata->mac_cr_lock); + /* LAN95xx devices do not alter the computed checksum of 0 to 0xffff. + * RFC 2460, ipv6 UDP calculated checksum yields a result of zero must + * be changed to 0xffff. RFC 768, ipv4 UDP computed checksum is zero, + * it is transmitted as all ones. The zero transmitted checksum means + * transmitter generated no checksum. Hence, enable csum offload only + * for ipv4 packets. + */ if (DEFAULT_TX_CSUM_ENABLE) - dev->net->features |= NETIF_F_HW_CSUM; + dev->net->features |= NETIF_F_IP_CSUM; if (DEFAULT_RX_CSUM_ENABLE) dev->net->features |= NETIF_F_RXCSUM; - dev->net->hw_features = NETIF_F_HW_CSUM | NETIF_F_RXCSUM; + dev->net->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM; smsc95xx_init_mac_address(dev); From 6d18c732b95c0a9d35e9f978b4438bba15412284 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 19 May 2017 22:20:29 +0800 Subject: [PATCH 086/254] bridge: start hello_timer when enabling KERNEL_STP in br_stp_start Since commit 76b91c32dd86 ("bridge: stp: when using userspace stp stop kernel hello and hold timers"), bridge would not start hello_timer if stp_enabled is not KERNEL_STP when br_dev_open. The problem is even if users set stp_enabled with KERNEL_STP later, the timer will still not be started. It causes that KERNEL_STP can not really work. Users have to re-ifup the bridge to avoid this. This patch is to fix it by starting br->hello_timer when enabling KERNEL_STP in br_stp_start. As an improvement, it's also to start hello_timer again only when br->stp_enabled is KERNEL_STP in br_hello_timer_expired, there is no reason to start the timer again when it's NO_STP. Fixes: 76b91c32dd86 ("bridge: stp: when using userspace stp stop kernel hello and hold timers") Reported-by: Haidong Li Signed-off-by: Xin Long Acked-by: Nikolay Aleksandrov Reviewed-by: Ivan Vecera Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 1 + net/bridge/br_stp_timer.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 08341d2aa9c9..0db8102995a5 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -179,6 +179,7 @@ static void br_stp_start(struct net_bridge *br) br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ + mod_timer(&br->hello_timer, jiffies + br->hello_time); br_port_state_selection(br); } diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index c98b3e5c140a..60b6fe277a8b 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -40,7 +40,7 @@ static void br_hello_timer_expired(unsigned long arg) if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); - if (br->stp_enabled != BR_USER_STP) + if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); } From 2d1f406139ec20320bf38bcd2461aa8e358084b5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 19 May 2017 11:39:09 +0200 Subject: [PATCH 087/254] x86/MCE: Export memory_error() Export the function which checks whether an MCE is a memory error to other users so that we can reuse the logic. Drop the boot_cpu_data use, while at it, as mce.cpuvendor already has the CPU vendor in there. Integrate a piece from a patch from Vishal Verma to export it for modules (nfit). The main reason we're exporting it is that the nfit handler nfit_handle_mce() needs to detect a memory error properly before doing its recovery actions. Signed-off-by: Borislav Petkov Cc: Tony Luck Cc: Vishal Verma Cc: Link: http://lkml.kernel.org/r/20170519093915.15413-2-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4fd5195deed0..3f9a3d2a5209 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -266,6 +266,7 @@ static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *s #endif int mce_available(struct cpuinfo_x86 *c); +bool mce_is_memory_error(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5abd4bf73d6e..5cfbaeb6529a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -499,16 +499,14 @@ static int mce_usable_address(struct mce *m) return 1; } -static bool memory_error(struct mce *m) +bool mce_is_memory_error(struct mce *m) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { + if (m->cpuvendor == X86_VENDOR_AMD) { /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { + } else if (m->cpuvendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -529,6 +527,7 @@ static bool memory_error(struct mce *m) return false; } +EXPORT_SYMBOL_GPL(mce_is_memory_error); static bool cec_add_mce(struct mce *m) { @@ -536,7 +535,7 @@ static bool cec_add_mce(struct mce *m) return false; /* We eat only correctable DRAM errors with usable addresses. */ - if (memory_error(m) && + if (mce_is_memory_error(m) && !(m->status & MCI_STATUS_UC) && mce_usable_address(m)) if (!cec_add_elem(m->addr >> PAGE_SHIFT)) @@ -713,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) + if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m)) if (m.status & MCI_STATUS_ADDRV) m.severity = severity; From fc08a4703a418a398bbb575ac311d36d110ac786 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 19 May 2017 11:39:10 +0200 Subject: [PATCH 088/254] acpi, nfit: Fix the memory error check in nfit_handle_mce() The check for an MCE being a memory error in the NFIT mce handler was bogus. Use the new mce_is_memory_error() helper to detect the error properly. Reported-by: Tony Luck Signed-off-by: Vishal Verma Signed-off-by: Borislav Petkov Cc: Link: http://lkml.kernel.org/r/20170519093915.15413-3-bp@alien8.de Signed-off-by: Thomas Gleixner --- drivers/acpi/nfit/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index 3ba1c3472cf9..fd86bec98dea 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -26,7 +26,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, struct nfit_spa *nfit_spa; /* We only care about memory errors */ - if (!(mce->status & MCACOD)) + if (!mce_is_memory_error(mce)) return NOTIFY_DONE; /* From e480eabae232b92ff44ce63678280373713920a4 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 18 May 2017 21:33:44 +0200 Subject: [PATCH 089/254] drm/radeon: Fix oops upon driver load on PowerXpress laptops Nicolai Stange reports the following oops which is caused by dereferencing rdev->pdev before it's subsequently set by radeon_device_init(). Fix it. BUG: unable to handle kernel NULL pointer dereference at 00000000000007cb IP: radeon_driver_load_kms+0xeb/0x230 [radeon] ... Call Trace: drm_dev_register+0x146/0x1d0 [drm] drm_get_pci_dev+0x9a/0x180 [drm] radeon_pci_probe+0xb8/0xe0 [radeon] local_pci_probe+0x45/0xa0 pci_device_probe+0x14f/0x1a0 driver_probe_device+0x29c/0x450 __driver_attach+0xdf/0xf0 ? driver_probe_device+0x450/0x450 bus_for_each_dev+0x6c/0xc0 driver_attach+0x1e/0x20 bus_add_driver+0x170/0x270 driver_register+0x60/0xe0 ? 0xffffffffc0508000 __pci_register_driver+0x4c/0x50 drm_pci_init+0xeb/0x100 [drm] ? vga_switcheroo_register_handler+0x6a/0x90 ? 0xffffffffc0508000 radeon_init+0x98/0xb6 [radeon] do_one_initcall+0x52/0x1a0 ? __vunmap+0x81/0xb0 ? kmem_cache_alloc_trace+0x159/0x1b0 ? do_init_module+0x27/0x1f8 do_init_module+0x5f/0x1f8 load_module+0x27ce/0x2be0 SYSC_finit_module+0xdf/0x110 ? SYSC_finit_module+0xdf/0x110 SyS_finit_module+0xe/0x10 do_syscall_64+0x67/0x150 entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 7ffb0ce31cf9 ("drm/radeon: Don't register Thunderbolt eGPU with vga_switcheroo") Reported-and-tested-by: Nicolai Stange Signed-off-by: Lukas Wunner Link: http://patchwork.freedesktop.org/patch/msgid/cfb91ba052af06117137eec0637543a2626a7979.1495135190.git.lukas@wunner.de --- drivers/gpu/drm/radeon/radeon_kms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index e3e7cb1d10a2..4761f27f2ca2 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -116,7 +116,7 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) if ((radeon_runtime_pm != 0) && radeon_has_atpx() && ((flags & RADEON_IS_IGP) == 0) && - !pci_is_thunderbolt_attached(rdev->pdev)) + !pci_is_thunderbolt_attached(dev->pdev)) flags |= RADEON_IS_PX; /* radeon_device_init should report only fatal error From 5165da5923d6c7df6f2927b0113b2e4d9288661e Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Fri, 5 May 2017 11:06:50 +0200 Subject: [PATCH 090/254] i2c: i2c-tiny-usb: fix buffer not being DMA capable Since v4.9 i2c-tiny-usb generates the below call trace and longer works, since it can't communicate with the USB device. The reason is, that since v4.9 the USB stack checks, that the buffer it should transfer is DMA capable. This was a requirement since v2.2 days, but it usually worked nevertheless. [ 17.504959] ------------[ cut here ]------------ [ 17.505488] WARNING: CPU: 0 PID: 93 at drivers/usb/core/hcd.c:1587 usb_hcd_map_urb_for_dma+0x37c/0x570 [ 17.506545] transfer buffer not dma capable [ 17.507022] Modules linked in: [ 17.507370] CPU: 0 PID: 93 Comm: i2cdetect Not tainted 4.11.0-rc8+ #10 [ 17.508103] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 17.509039] Call Trace: [ 17.509320] ? dump_stack+0x5c/0x78 [ 17.509714] ? __warn+0xbe/0xe0 [ 17.510073] ? warn_slowpath_fmt+0x5a/0x80 [ 17.510532] ? nommu_map_sg+0xb0/0xb0 [ 17.510949] ? usb_hcd_map_urb_for_dma+0x37c/0x570 [ 17.511482] ? usb_hcd_submit_urb+0x336/0xab0 [ 17.511976] ? wait_for_completion_timeout+0x12f/0x1a0 [ 17.512549] ? wait_for_completion_timeout+0x65/0x1a0 [ 17.513125] ? usb_start_wait_urb+0x65/0x160 [ 17.513604] ? usb_control_msg+0xdc/0x130 [ 17.514061] ? usb_xfer+0xa4/0x2a0 [ 17.514445] ? __i2c_transfer+0x108/0x3c0 [ 17.514899] ? i2c_transfer+0x57/0xb0 [ 17.515310] ? i2c_smbus_xfer_emulated+0x12f/0x590 [ 17.515851] ? _raw_spin_unlock_irqrestore+0x11/0x20 [ 17.516408] ? i2c_smbus_xfer+0x125/0x330 [ 17.516876] ? i2c_smbus_xfer+0x125/0x330 [ 17.517329] ? i2cdev_ioctl_smbus+0x1c1/0x2b0 [ 17.517824] ? i2cdev_ioctl+0x75/0x1c0 [ 17.518248] ? do_vfs_ioctl+0x9f/0x600 [ 17.518671] ? vfs_write+0x144/0x190 [ 17.519078] ? SyS_ioctl+0x74/0x80 [ 17.519463] ? entry_SYSCALL_64_fastpath+0x1e/0xad [ 17.519959] ---[ end trace d047c04982f5ac50 ]--- Cc: Signed-off-by: Sebastian Reichel Reviewed-by: Greg Kroah-Hartman Acked-by: Till Harbaum Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-tiny-usb.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-tiny-usb.c b/drivers/i2c/busses/i2c-tiny-usb.c index 0ed77eeff31e..a2e3dd715380 100644 --- a/drivers/i2c/busses/i2c-tiny-usb.c +++ b/drivers/i2c/busses/i2c-tiny-usb.c @@ -178,22 +178,39 @@ static int usb_read(struct i2c_adapter *adapter, int cmd, int value, int index, void *data, int len) { struct i2c_tiny_usb *dev = (struct i2c_tiny_usb *)adapter->algo_data; + void *dmadata = kmalloc(len, GFP_KERNEL); + int ret; + + if (!dmadata) + return -ENOMEM; /* do control transfer */ - return usb_control_msg(dev->usb_dev, usb_rcvctrlpipe(dev->usb_dev, 0), + ret = usb_control_msg(dev->usb_dev, usb_rcvctrlpipe(dev->usb_dev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_INTERFACE | - USB_DIR_IN, value, index, data, len, 2000); + USB_DIR_IN, value, index, dmadata, len, 2000); + + memcpy(data, dmadata, len); + kfree(dmadata); + return ret; } static int usb_write(struct i2c_adapter *adapter, int cmd, int value, int index, void *data, int len) { struct i2c_tiny_usb *dev = (struct i2c_tiny_usb *)adapter->algo_data; + void *dmadata = kmemdup(data, len, GFP_KERNEL); + int ret; + + if (!dmadata) + return -ENOMEM; /* do control transfer */ - return usb_control_msg(dev->usb_dev, usb_sndctrlpipe(dev->usb_dev, 0), + ret = usb_control_msg(dev->usb_dev, usb_sndctrlpipe(dev->usb_dev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_INTERFACE, - value, index, data, len, 2000); + value, index, dmadata, len, 2000); + + kfree(dmadata); + return ret; } static void i2c_tiny_usb_free(struct i2c_tiny_usb *dev) From e2c824924cdb41528932c550647406ad81336b18 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 22 May 2017 07:46:55 +0200 Subject: [PATCH 091/254] i2c: designware: Fix bogus sda_hold_time due to uninitialized vars We need to initializes those variables to 0 for platforms that do not provide ACPI parameters. Otherwise, we set sda_hold_time to random values, breaking e.g. Galileo and IOT2000 boards. Fixes: 9d6408433019 ("i2c: designware: don't infer timings described by ACPI from clock rate") Signed-off-by: Jan Kiszka Reviewed-by: Ard Biesheuvel Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 6283b99d2b17..d1263b82d646 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -94,9 +94,9 @@ static void dw_i2c_acpi_params(struct platform_device *pdev, char method[], static int dw_i2c_acpi_configure(struct platform_device *pdev) { struct dw_i2c_dev *dev = platform_get_drvdata(pdev); + u32 ss_ht = 0, fp_ht = 0, hs_ht = 0, fs_ht = 0; acpi_handle handle = ACPI_HANDLE(&pdev->dev); const struct acpi_device_id *id; - u32 ss_ht, fp_ht, hs_ht, fs_ht; struct acpi_device *adev; const char *uid; From 63691587f7b0028326ddd1226c378aaaeca4d4e4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 May 2017 16:32:46 +0200 Subject: [PATCH 092/254] ALSA: hda - Apply dual-codec quirk for MSI Z270-Gaming mobo MSI Z270-Gamin mobo has also two ALC1220 codecs like Gigabyte AZ370- Gaming mobo. Apply the same quirk to this one. Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 9c22ad694534..3fdd5af190a4 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2328,6 +2328,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK_VENDOR(0x1462, "MSI", ALC882_FIXUP_GPIO3), SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte EP45-DS3/Z87X-UD3H", ALC889_FIXUP_FRONT_HP_NO_PRESENCE), SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), + SND_PCI_QUIRK(0x1462, 0xda57, "MSI Z270-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK(0x147b, 0x107a, "Abit AW9D-MAX", ALC882_FIXUP_ABIT_AW9D_MAX), SND_PCI_QUIRK_VENDOR(0x1558, "Clevo laptop", ALC882_FIXUP_EAPD), SND_PCI_QUIRK(0x161f, 0x2054, "Medion laptop", ALC883_FIXUP_EAPD), From ba90d6a6b00a84f2a18112145c113e5ef628e561 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 May 2017 16:38:47 +0200 Subject: [PATCH 093/254] ALSA: hda - Provide dual-codecs model option for a few Realtek codecs Recently some laptops and mobos are equipped with the dual Realtek codecs that require special quirks. For making the debugging easier, add the model "dual-codecs" to be passed via module option. Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 3fdd5af190a4..918e45268915 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2343,6 +2343,7 @@ static const struct hda_model_fixup alc882_fixup_models[] = { {.id = ALC883_FIXUP_ACER_EAPD, .name = "acer-aspire"}, {.id = ALC882_FIXUP_INV_DMIC, .name = "inv-dmic"}, {.id = ALC882_FIXUP_NO_PRIMARY_HP, .name = "no-primary-hp"}, + {.id = ALC1220_FIXUP_GB_DUAL_CODECS, .name = "dual-codecs"}, {} }; @@ -6015,6 +6016,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC292_FIXUP_TPT440_DOCK, .name = "tpt440-dock"}, {.id = ALC292_FIXUP_TPT440, .name = "tpt440"}, {.id = ALC292_FIXUP_TPT460, .name = "tpt460"}, + {.id = ALC233_FIXUP_LENOVO_MULTI_CODECS, .name = "dual-codecs"}, {} }; #define ALC225_STANDARD_PINS \ @@ -7342,6 +7344,7 @@ static const struct hda_model_fixup alc662_fixup_models[] = { {.id = ALC662_FIXUP_ASUS_MODE8, .name = "asus-mode8"}, {.id = ALC662_FIXUP_INV_DMIC, .name = "inv-dmic"}, {.id = ALC668_FIXUP_DELL_MIC_NO_PRESENCE, .name = "dell-headset-multi"}, + {.id = ALC662_FIXUP_LENOVO_MULTI_CODECS, .name = "dual-codecs"}, {} }; From a79e7df97592b2326be81d5dae286bdb5c529a01 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 May 2017 16:41:24 +0200 Subject: [PATCH 094/254] ALSA: hda - Update the list of quirk models I've forgotten to sync the documentation with the actually available options for some time. Now all updated. Signed-off-by: Takashi Iwai --- Documentation/sound/hd-audio/models.rst | 114 ++++++++++++++---------- 1 file changed, 65 insertions(+), 49 deletions(-) diff --git a/Documentation/sound/hd-audio/models.rst b/Documentation/sound/hd-audio/models.rst index 5338673c88d9..773d2bfacc6c 100644 --- a/Documentation/sound/hd-audio/models.rst +++ b/Documentation/sound/hd-audio/models.rst @@ -16,6 +16,8 @@ ALC880 6-jack in back, 2-jack in front 6stack-digout 6-jack with a SPDIF out +6stack-automute + 6-jack with headphone jack detection ALC260 ====== @@ -62,6 +64,8 @@ lenovo-dock Enables docking station I/O for some Lenovos hp-gpio-led GPIO LED support on HP laptops +hp-dock-gpio-mic1-led + HP dock with mic LED support dell-headset-multi Headset jack, which can also be used as mic-in dell-headset-dock @@ -72,6 +76,12 @@ alc283-sense-combo Combo jack sensing on ALC283 tpt440-dock Pin configs for Lenovo Thinkpad Dock support +tpt440 + Lenovo Thinkpad T440s setup +tpt460 + Lenovo Thinkpad T460/560 setup +dual-codecs + Lenovo laptops with dual codecs ALC66x/67x/892 ============== @@ -97,6 +107,8 @@ inv-dmic Inverted internal mic workaround dell-headset-multi Headset jack, which can also be used as mic-in +dual-codecs + Lenovo laptops with dual codecs ALC680 ====== @@ -114,6 +126,8 @@ inv-dmic Inverted internal mic workaround no-primary-hp VAIO Z/VGC-LN51JGB workaround (for fixed speaker DAC) +dual-codecs + ALC1220 dual codecs for Gaming mobos ALC861/660 ========== @@ -206,65 +220,47 @@ auto Conexant 5045 ============= -laptop-hpsense - Laptop with HP sense (old model laptop) -laptop-micsense - Laptop with Mic sense (old model fujitsu) -laptop-hpmicsense - Laptop with HP and Mic senses -benq - Benq R55E -laptop-hp530 - HP 530 laptop -test - for testing/debugging purpose, almost all controls can be - adjusted. Appearing only when compiled with $CONFIG_SND_DEBUG=y +cap-mix-amp + Fix max input level on mixer widget +toshiba-p105 + Toshiba P105 quirk +hp-530 + HP 530 quirk Conexant 5047 ============= -laptop - Basic Laptop config -laptop-hp - Laptop config for some HP models (subdevice 30A5) -laptop-eapd - Laptop config with EAPD support -test - for testing/debugging purpose, almost all controls can be - adjusted. Appearing only when compiled with $CONFIG_SND_DEBUG=y +cap-mix-amp + Fix max input level on mixer widget Conexant 5051 ============= -laptop - Basic Laptop config (default) -hp - HP Spartan laptop -hp-dv6736 - HP dv6736 -hp-f700 - HP Compaq Presario F700 -ideapad - Lenovo IdeaPad laptop -toshiba - Toshiba Satellite M300 +lenovo-x200 + Lenovo X200 quirk Conexant 5066 ============= -laptop - Basic Laptop config (default) -hp-laptop - HP laptops, e g G60 -asus - Asus K52JU, Lenovo G560 -dell-laptop - Dell laptops -dell-vostro - Dell Vostro -olpc-xo-1_5 - OLPC XO 1.5 -ideapad - Lenovo IdeaPad U150 +stereo-dmic + Workaround for inverted stereo digital mic +gpio1 + Enable GPIO1 pin +headphone-mic-pin + Enable headphone mic NID 0x18 without detection +tp410 + Thinkpad T400 & co quirks thinkpad - Lenovo Thinkpad + Thinkpad mute/mic LED quirk +lemote-a1004 + Lemote A1004 quirk +lemote-a1205 + Lemote A1205 quirk +olpc-xo + OLPC XO quirk +mute-led-eapd + Mute LED control via EAPD +hp-dock + HP dock support +mute-led-gpio + Mute LED control via GPIO STAC9200 ======== @@ -444,6 +440,8 @@ dell-eq Dell desktops/laptops alienware Alienware M17x +asus-mobo + Pin configs for ASUS mobo with 5.1/SPDIF out auto BIOS setup (default) @@ -477,6 +475,8 @@ hp-envy-ts-bass Pin fixup for HP Envy TS bass speaker (NID 0x10) hp-bnb13-eq Hardware equalizer setup for HP laptops +hp-envy-ts-bass + HP Envy TS bass support auto BIOS setup (default) @@ -496,10 +496,22 @@ auto Cirrus Logic CS4206/4207 ======================== +mbp53 + MacBook Pro 5,3 mbp55 MacBook Pro 5,5 imac27 IMac 27 Inch +imac27_122 + iMac 12,2 +apple + Generic Apple quirk +mbp101 + MacBookPro 10,1 +mbp81 + MacBookPro 8,1 +mba42 + MacBookAir 4,2 auto BIOS setup (default) @@ -509,6 +521,10 @@ mba6 MacBook Air 6,1 and 6,2 gpio0 Enable GPIO 0 amp +mbp11 + MacBookPro 11,2 +macmini + MacMini 7,1 auto BIOS setup (default) From 232cd35d0804cc241eb887bb8d4d9b3b9881c64a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 May 2017 14:17:48 -0700 Subject: [PATCH 095/254] ipv6: fix out of bound writes in __ip6_append_data() Andrey Konovalov and idaifish@gmail.com reported crashes caused by one skb shared_info being overwritten from __ip6_append_data() Andrey program lead to following state : copy -4200 datalen 2000 fraglen 2040 maxfraglen 2040 alloclen 2048 transhdrlen 0 offset 0 fraggap 6200 The skb_copy_and_csum_bits(skb_prev, maxfraglen, data + transhdrlen, fraggap, 0); is overwriting skb->head and skb_shared_info Since we apparently detect this rare condition too late, move the code earlier to even avoid allocating skb and risking crashes. Once again, many thanks to Andrey and syzkaller team. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Reported-by: Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d4a31becbd25..bf8a58a1c32d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1466,6 +1466,11 @@ alloc_new_skb: */ alloclen += sizeof(struct frag_hdr); + copy = datalen - transhdrlen - fraggap; + if (copy < 0) { + err = -EINVAL; + goto error; + } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len, @@ -1515,13 +1520,9 @@ alloc_new_skb: data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } - copy = datalen - transhdrlen - fraggap; - - if (copy < 0) { - err = -EINVAL; - kfree_skb(skb); - goto error; - } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + if (copy > 0 && + getfrag(from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; From 9e7b9a25e170722f15ed54f5b963e9867f79195d Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 22 May 2017 13:09:19 +0200 Subject: [PATCH 096/254] mmc: cavium: Prevent crash with incomplete DT In case the DT specifies neither a regulator nor a gpio for the shared power the driver will crash accessing the regulator. Prevent the crash by checking the regulator before use. Use mmc_regulator_get_supply() instead of open coding the same logic. Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/mmc/host/cavium.c b/drivers/mmc/host/cavium.c index 58b51ba6aabd..b8aaf0fdb77c 100644 --- a/drivers/mmc/host/cavium.c +++ b/drivers/mmc/host/cavium.c @@ -839,14 +839,14 @@ static void cvm_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) cvm_mmc_reset_bus(slot); if (host->global_pwr_gpiod) host->set_shared_power(host, 0); - else + else if (!IS_ERR(mmc->supply.vmmc)) mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0); break; case MMC_POWER_UP: if (host->global_pwr_gpiod) host->set_shared_power(host, 1); - else + else if (!IS_ERR(mmc->supply.vmmc)) mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd); break; } @@ -968,20 +968,15 @@ static int cvm_mmc_of_parse(struct device *dev, struct cvm_mmc_slot *slot) return -EINVAL; } - mmc->supply.vmmc = devm_regulator_get_optional(dev, "vmmc"); - if (IS_ERR(mmc->supply.vmmc)) { - if (PTR_ERR(mmc->supply.vmmc) == -EPROBE_DEFER) - return -EPROBE_DEFER; - /* - * Legacy Octeon firmware has no regulator entry, fall-back to - * a hard-coded voltage to get a sane OCR. - */ + ret = mmc_regulator_get_supply(mmc); + if (ret == -EPROBE_DEFER) + return ret; + /* + * Legacy Octeon firmware has no regulator entry, fall-back to + * a hard-coded voltage to get a sane OCR. + */ + if (IS_ERR(mmc->supply.vmmc)) mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; - } else { - ret = mmc_regulator_get_ocrmask(mmc->supply.vmmc); - if (ret > 0) - mmc->ocr_avail = ret; - } /* Common MMC bindings */ ret = mmc_of_parse(mmc); From c2372c20425bd75a5527b3e2204059762190f6ca Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 22 May 2017 13:09:20 +0200 Subject: [PATCH 097/254] of/platform: Make of_platform_device_destroy globally visible of_platform_device_destroy is the counterpart to of_platform_device_create which is a non-static function. After creating a platform device it might be neccessary to destroy it to deal with -EPROBE_DEFER where a repeated of_platform_device_create call would fail otherwise. Therefore also make of_platform_device_destroy globally visible. Signed-off-by: Jan Glauber Acked-by: Rob Herring Signed-off-by: Ulf Hansson --- drivers/of/platform.c | 3 ++- include/linux/of_platform.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 71fecc2debfc..703a42118ffc 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -523,7 +523,7 @@ static int __init of_platform_default_populate_init(void) arch_initcall_sync(of_platform_default_populate_init); #endif -static int of_platform_device_destroy(struct device *dev, void *data) +int of_platform_device_destroy(struct device *dev, void *data) { /* Do not touch devices not populated from the device tree */ if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED)) @@ -544,6 +544,7 @@ static int of_platform_device_destroy(struct device *dev, void *data) of_node_clear_flag(dev->of_node, OF_POPULATED_BUS); return 0; } +EXPORT_SYMBOL_GPL(of_platform_device_destroy); /** * of_platform_depopulate() - Remove devices populated from device tree diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index dc8224ae28d5..e0d1946270f3 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -64,6 +64,7 @@ extern struct platform_device *of_platform_device_create(struct device_node *np, const char *bus_id, struct device *parent); +extern int of_platform_device_destroy(struct device *dev, void *data); extern int of_platform_bus_probe(struct device_node *root, const struct of_device_id *matches, struct device *parent); From 8fb83b142823cdd1f85f78dcf9e861e9033919f9 Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 22 May 2017 13:09:21 +0200 Subject: [PATCH 098/254] mmc: cavium: Fix probing race with regulator If the regulator probing is not yet finished this driver might catch a -EPROBE_DEFER. Returning after this condition did not remove the created platform device. On a repeated call to the probe function the of_platform_device_create fails. Calling of_platform_device_destroy after EPROBE_DEFER resolves this bug. Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium-octeon.c | 11 ++++++++++- drivers/mmc/host/cavium-thunderx.c | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium-octeon.c b/drivers/mmc/host/cavium-octeon.c index cbb566377508..951d2cdd7888 100644 --- a/drivers/mmc/host/cavium-octeon.c +++ b/drivers/mmc/host/cavium-octeon.c @@ -288,11 +288,20 @@ static int octeon_mmc_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "Error populating slots\n"); octeon_mmc_set_shared_power(host, 0); - return ret; + goto error; } i++; } return 0; + +error: + for (i = 0; i < CAVIUM_MAX_MMC; i++) { + if (host->slot[i]) + cvm_mmc_of_slot_remove(host->slot[i]); + if (host->slot_pdev[i]) + of_platform_device_destroy(&host->slot_pdev[i]->dev, NULL); + } + return ret; } static int octeon_mmc_remove(struct platform_device *pdev) diff --git a/drivers/mmc/host/cavium-thunderx.c b/drivers/mmc/host/cavium-thunderx.c index fe3d77267cd6..b9cc95998799 100644 --- a/drivers/mmc/host/cavium-thunderx.c +++ b/drivers/mmc/host/cavium-thunderx.c @@ -146,6 +146,12 @@ static int thunder_mmc_probe(struct pci_dev *pdev, return 0; error: + for (i = 0; i < CAVIUM_MAX_MMC; i++) { + if (host->slot[i]) + cvm_mmc_of_slot_remove(host->slot[i]); + if (host->slot_pdev[i]) + of_platform_device_destroy(&host->slot_pdev[i]->dev, NULL); + } clk_disable_unprepare(host->clk); return ret; } From bd703a1524e851b8d6d646be9dafd794d4eb6045 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 20 May 2017 01:52:11 +0300 Subject: [PATCH 099/254] net: atheros: atl2: don't return zero on failure path in atl2_probe() If dma mask checks fail in atl2_probe(), it breaks off initialization, deallocates all resources, but returns zero. The patch adds proper error code return value and make error code setup unified. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/atlx/atl2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c index 63f2deec2a52..77a1c03255de 100644 --- a/drivers/net/ethernet/atheros/atlx/atl2.c +++ b/drivers/net/ethernet/atheros/atlx/atl2.c @@ -1353,6 +1353,7 @@ static int atl2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) && pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) { printk(KERN_ERR "atl2: No usable DMA configuration, aborting\n"); + err = -EIO; goto err_dma; } @@ -1366,10 +1367,11 @@ static int atl2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * pcibios_set_master to do the needed arch specific settings */ pci_set_master(pdev); - err = -ENOMEM; netdev = alloc_etherdev(sizeof(struct atl2_adapter)); - if (!netdev) + if (!netdev) { + err = -ENOMEM; goto err_alloc_etherdev; + } SET_NETDEV_DEV(netdev, &pdev->dev); @@ -1408,8 +1410,6 @@ static int atl2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; - err = -EIO; - netdev->hw_features = NETIF_F_HW_VLAN_CTAG_RX; netdev->features |= (NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX); From 751da2a69b7cc82d83dc310ed7606225f2d6e014 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 19 May 2017 19:43:45 -0400 Subject: [PATCH 100/254] bonding: fix accounting of active ports in 3ad As of 7bb11dc9f59d and 0622cab0341c, bond slaves in a 3ad bond are not removed from the aggregator when they are down, and the active slave count is NOT equal to number of ports in the aggregator, but rather the number of ports in the aggregator that are still enabled. The sysfs spew for bonding_show_ad_num_ports() has a comment that says "Show number of active 802.3ad ports.", but it's currently showing total number of ports, both active and inactive. Remedy it by using the same logic introduced in 0622cab0341c in __bond_3ad_get_active_agg_info(), so sysfs, procfs and netlink all report the number of active ports. Note that this means that IFLA_BOND_AD_INFO_NUM_PORTS really means NUM_ACTIVE_PORTS instead of NUM_PORTS, and thus perhaps should be renamed for clarity. Lightly tested on a dual i40e lacp bond, simulating link downs with an ip link set dev down, was able to produce the state where I could see both in the same aggregator, but a number of ports count of 1. MII Status: up Active Aggregator Info: Aggregator ID: 1 Number of ports: 2 <--- Slave Interface: ens10 MII Status: up <--- Aggregator ID: 1 Slave Interface: ens11 MII Status: up Aggregator ID: 1 MII Status: up Active Aggregator Info: Aggregator ID: 1 Number of ports: 1 <--- Slave Interface: ens10 MII Status: down <--- Aggregator ID: 1 Slave Interface: ens11 MII Status: up Aggregator ID: 1 CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index c5fd4259da33..b44a6aeb346d 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2577,7 +2577,7 @@ int __bond_3ad_get_active_agg_info(struct bonding *bond, return -1; ad_info->aggregator_id = aggregator->aggregator_identifier; - ad_info->ports = aggregator->num_of_ports; + ad_info->ports = __agg_active_ports(aggregator); ad_info->actor_key = aggregator->actor_oper_aggregator_key; ad_info->partner_key = aggregator->partner_oper_aggregator_key; ether_addr_copy(ad_info->partner_system, From f5f968f2371ccdebb8a365487649673c9af68d09 Mon Sep 17 00:00:00 2001 From: Srinath Mannam Date: Thu, 18 May 2017 22:27:40 +0530 Subject: [PATCH 101/254] mmc: sdhci-iproc: suppress spurious interrupt with Multiblock read The stingray SDHCI hardware supports ACMD12 and automatically issues after multi block transfer completed. If ACMD12 in SDHCI is disabled, spurious tx done interrupts are seen on multi block read command with below error message: Got data interrupt 0x00000002 even though no data operation was in progress. This patch uses SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12 to enable ACM12 support in SDHCI hardware and suppress spurious interrupt. Signed-off-by: Srinath Mannam Reviewed-by: Ray Jui Reviewed-by: Scott Branden Acked-by: Adrian Hunter Fixes: b580c52d58d9 ("mmc: sdhci-iproc: add IPROC SDHCI driver") Cc: Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-iproc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c index 3275d4995812..61666d269771 100644 --- a/drivers/mmc/host/sdhci-iproc.c +++ b/drivers/mmc/host/sdhci-iproc.c @@ -187,7 +187,8 @@ static const struct sdhci_iproc_data iproc_cygnus_data = { }; static const struct sdhci_pltfm_data sdhci_iproc_pltfm_data = { - .quirks = SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK, + .quirks = SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | + SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, .quirks2 = SDHCI_QUIRK2_ACMD23_BROKEN, .ops = &sdhci_iproc_ops, }; From e4eda884db7930cee434828759064b4711604078 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 22 May 2017 12:27:07 -0400 Subject: [PATCH 102/254] net: Make IP alignment calulations clearer. The assignmnet: ip_align = strict ? 2 : NET_IP_ALIGN; in compare_pkt_ptr_alignment() trips up Coverity because we can only get to this code when strict is true, therefore ip_align will always be 2 regardless of NET_IP_ALIGN's value. So just assign directly to '2' and explain the situation in the comment above. Reported-by: "Gustavo A. R. Silva" Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1eddb713b815..c72cd41f5b8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -808,11 +808,15 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, reg_off += reg->aux_off; } - /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking - * we force this to 2 which is universally what architectures use - * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + /* For platforms that do not have a Kconfig enabling + * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of + * NET_IP_ALIGN is universally set to '2'. And on platforms + * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get + * to this code only in strict mode where we want to emulate + * the NET_IP_ALIGN==2 checking. Therefore use an + * unconditional IP align value of '2'. */ - ip_align = strict ? 2 : NET_IP_ALIGN; + ip_align = 2; if ((ip_align + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", ip_align, reg_off, off, size); From 72ccc471e13b8266d2ee2104521df5b92ba08e9c Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 19 May 2017 14:46:46 -0400 Subject: [PATCH 103/254] bonding: fix randomly populated arp target array In commit dc9c4d0fe023, the arp_target array moved from a static global to a local variable. By the nature of static globals, the array used to be initialized to all 0. At present, it's full of random data, which that gets interpreted as arp_target values, when none have actually been specified. Systems end up booting with spew along these lines: [ 32.161783] IPv6: ADDRCONF(NETDEV_UP): lacp0: link is not ready [ 32.168475] IPv6: ADDRCONF(NETDEV_UP): lacp0: link is not ready [ 32.175089] 8021q: adding VLAN 0 to HW filter on device lacp0 [ 32.193091] IPv6: ADDRCONF(NETDEV_UP): lacp0: link is not ready [ 32.204892] lacp0: Setting MII monitoring interval to 100 [ 32.211071] lacp0: Removing ARP target 216.124.228.17 [ 32.216824] lacp0: Removing ARP target 218.160.255.255 [ 32.222646] lacp0: Removing ARP target 185.170.136.184 [ 32.228496] lacp0: invalid ARP target 255.255.255.255 specified for removal [ 32.236294] lacp0: option arp_ip_target: invalid value (-255.255.255.255) [ 32.243987] lacp0: Removing ARP target 56.125.228.17 [ 32.249625] lacp0: Removing ARP target 218.160.255.255 [ 32.255432] lacp0: Removing ARP target 15.157.233.184 [ 32.261165] lacp0: invalid ARP target 255.255.255.255 specified for removal [ 32.268939] lacp0: option arp_ip_target: invalid value (-255.255.255.255) [ 32.276632] lacp0: Removing ARP target 16.0.0.0 [ 32.281755] lacp0: Removing ARP target 218.160.255.255 [ 32.287567] lacp0: Removing ARP target 72.125.228.17 [ 32.293165] lacp0: Removing ARP target 218.160.255.255 [ 32.298970] lacp0: Removing ARP target 8.125.228.17 [ 32.304458] lacp0: Removing ARP target 218.160.255.255 None of these were actually specified as ARP targets, and the driver does seem to clean up the mess okay, but it's rather noisy and confusing, leaks values to userspace, and the 255.255.255.255 spew shows up even when debug prints are disabled. The fix: just zero out arp_target at init time. While we're in here, init arp_all_targets_value in the right place. Fixes: dc9c4d0fe023 ("bonding: reduce scope of some global variables") CC: Mahesh Bandewar CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: netdev@vger.kernel.org CC: stable@vger.kernel.org Signed-off-by: Jarod Wilson Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2be78807fd6e..73313318399c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4271,10 +4271,10 @@ static int bond_check_params(struct bond_params *params) int arp_validate_value, fail_over_mac_value, primary_reselect_value, i; struct bond_opt_value newval; const struct bond_opt_value *valptr; - int arp_all_targets_value; + int arp_all_targets_value = 0; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; - __be32 arp_target[BOND_MAX_ARP_TARGETS]; + __be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 }; int arp_ip_count; int bond_mode = BOND_MODE_ROUNDROBIN; int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; @@ -4501,7 +4501,6 @@ static int bond_check_params(struct bond_params *params) arp_validate_value = 0; } - arp_all_targets_value = 0; if (arp_all_targets) { bond_opt_initstr(&newval, arp_all_targets); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS), From 499fde662f1957e3cb8d192a94a099ebe19c714b Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 19 May 2017 11:21:59 -0700 Subject: [PATCH 104/254] vsock: use new wait API for vsock_stream_sendmsg() As reported by Michal, vsock_stream_sendmsg() could still sleep at vsock_stream_has_space() after prepare_to_wait(): vsock_stream_has_space vmci_transport_stream_has_space vmci_qpair_produce_free_space qp_lock qp_acquire_queue_mutex mutex_lock Just switch to the new wait API like we did for commit d9dc8b0f8b4e ("net: fix sleeping for sk_wait_event()"). Reported-by: Michal Kubecek Cc: Stefan Hajnoczi Cc: Jorgen Hansen Cc: "Michael S. Tsirkin" Cc: Claudio Imbrenda Signed-off-by: Cong Wang Reviewed-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 6f7f6757ceef..dfc8c51e4d74 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1540,8 +1540,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, long timeout; int err; struct vsock_transport_send_notify_data send_data; - - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); sk = sock->sk; vsk = vsock_sk(sk); @@ -1584,11 +1583,10 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - while (total_written < len) { ssize_t written; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && @@ -1597,33 +1595,30 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); if (err < 0) { - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } release_sock(sk); - timeout = schedule_timeout(timeout); + timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } else if (timeout == 0) { err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after From 0544f5494a03b8846db74e02be5685d1f32b06c9 Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Mon, 10 Apr 2017 17:12:34 +0200 Subject: [PATCH 105/254] nvme-rdma: support devices with queue size < 32 In the case of small NVMe-oF queue size (<32) we may enter a deadlock caused by the fact that the IB completions aren't sent waiting for 32 and the send queue will fill up. The error is seen as (using mlx5): [ 2048.693355] mlx5_0:mlx5_ib_post_send:3765:(pid 7273): [ 2048.693360] nvme nvme1: nvme_rdma_post_send failed with error code -12 This patch changes the way the signaling is done so that it depends on the queue depth now. The magic define has been removed completely. Cc: stable@vger.kernel.org Signed-off-by: Marta Rybczynska Signed-off-by: Samuel Jones Acked-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index dd1c6deef82f..e2c18f3d9dcf 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1038,6 +1038,19 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) nvme_rdma_wr_error(cq, wc, "SEND"); } +static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) +{ + int sig_limit; + + /* + * We signal completion every queue depth/2 and also handle the + * degenerated case of a device with queue_depth=1, where we + * would need to signal every message. + */ + sig_limit = max(queue->queue_size / 2, 1); + return (++queue->sig_count % sig_limit) == 0; +} + static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, struct ib_send_wr *first, bool flush) @@ -1065,9 +1078,6 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, * Would have been way to obvious to handle this in hardware or * at least the RDMA stack.. * - * This messy and racy code sniplet is copy and pasted from the iSER - * initiator, and the magic '32' comes from there as well. - * * Always signal the flushes. The magic request used for the flush * sequencer is not allocated in our driver's tagset and it's * triggered to be freed by blk_cleanup_queue(). So we need to @@ -1075,7 +1085,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, * embedded in request's payload, is not freed when __ib_process_cq() * calls wr_cqe->done(). */ - if ((++queue->sig_count % 32) == 0 || flush) + if (nvme_rdma_queue_sig_limit(queue) || flush) wr.send_flags |= IB_SEND_SIGNALED; if (first) From 806f026f9b901eaf1a6baeb48b5da18d6a4f818e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:03 +0800 Subject: [PATCH 106/254] nvme: use blk_mq_start_hw_queues() in nvme_kill_queues() Inside nvme_kill_queues(), we have to start hw queues for draining requests in sw queues, .dispatch list and requeue list, so use blk_mq_start_hw_queues() instead of blk_mq_start_stopped_hw_queues() which only run queues if queues are stopped, but the queues may have been started already, for example nvme_start_queues() is called in reset work function. blk_mq_start_hw_queues() run hw queues in current context, instead of running asynchronously like before. Given nvme_kill_queues() is run from either remove context or reset worker context, both are fine to run hw queue directly. And the mutex of namespaces_mutex isn't a problem too becasue nvme_start_freeze() runs hw queue in this way already. Cc: stable@vger.kernel.org Reported-by: Zhang Yi Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d5e0906262ea..40d5e4a9e8d7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2437,7 +2437,13 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); blk_mq_abort_requeue_list(ns->queue); - blk_mq_start_stopped_hw_queues(ns->queue, true); + + /* + * Forcibly start all queues to avoid having stuck requests. + * Note that we must ensure the queues are not stopped + * when the final removal happens. + */ + blk_mq_start_hw_queues(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); } From 986f75c876dbafed98eba7cb516c5118f155db23 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:04 +0800 Subject: [PATCH 107/254] nvme: avoid to use blk_mq_abort_requeue_list() NVMe may add request into requeue list simply and not kick off the requeue if hw queues are stopped. Then blk_mq_abort_requeue_list() is called in both nvme_kill_queues() and nvme_ns_remove() for dealing with this issue. Unfortunately blk_mq_abort_requeue_list() is absolutely a race maker, for example, one request may be requeued during the aborting. So this patch just calls blk_mq_kick_requeue_list() in nvme_kill_queues() to handle this issue like what nvme_start_queues() does. Now all requests in requeue list when queues are stopped will be handled by blk_mq_kick_requeue_list() when queues are restarted, either in nvme_start_queues() or in nvme_kill_queues(). Cc: stable@vger.kernel.org Reported-by: Zhang Yi Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 40d5e4a9e8d7..04e115834702 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2098,7 +2098,6 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (ns->ndev) nvme_nvm_unregister_sysfs(ns); del_gendisk(ns->disk); - blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } @@ -2436,7 +2435,6 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) continue; revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); - blk_mq_abort_requeue_list(ns->queue); /* * Forcibly start all queues to avoid having stuck requests. @@ -2444,6 +2442,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * when the final removal happens. */ blk_mq_start_hw_queues(ns->queue); + + /* draining requests in requeue list */ + blk_mq_kick_requeue_list(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); } From 7254a50a5db40ca6739ddf37e0a45e6912532b2c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:05 +0800 Subject: [PATCH 108/254] blk-mq: remove blk_mq_abort_requeue_list() No one uses it any more, so remove it. Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig --- block/blk-mq.c | 19 ------------------- include/linux/blk-mq.h | 1 - 2 files changed, 20 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a69ad122ed66..f2224ffd225d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -628,25 +628,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -void blk_mq_abort_requeue_list(struct request_queue *q) -{ - unsigned long flags; - LIST_HEAD(rq_list); - - spin_lock_irqsave(&q->requeue_lock, flags); - list_splice_init(&q->requeue_list, &rq_list); - spin_unlock_irqrestore(&q->requeue_lock, flags); - - while (!list_empty(&rq_list)) { - struct request *rq; - - rq = list_first_entry(&rq_list, struct request, queuelist); - list_del_init(&rq->queuelist); - blk_mq_end_request(rq, -EIO); - } -} -EXPORT_SYMBOL(blk_mq_abort_requeue_list); - struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c47aa248c640..fcd641032f8d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -238,7 +238,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); -void blk_mq_abort_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq); bool blk_mq_queue_stopped(struct request_queue *q); From 2d76b2f8b54abd16225cd80afca36ed43f113c41 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 22 May 2017 16:46:13 +0200 Subject: [PATCH 109/254] net: sched: cls_matchall: fix null pointer dereference Since the head is guaranteed by the check above to be null, the call_rcu would explode. Remove the previously logically dead code that was made logically very much alive and kicking. Fixes: 985538eee06f ("net/sched: remove redundant null check on head") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index dee469fed967..51859b8edd7e 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -203,7 +203,6 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) head; rcu_assign_pointer(tp->root, new); - call_rcu(&head->rcu, mall_destroy_rcu); return 0; err_replace_hw_filter: From 0ce872bf8b5c4d425a41940a523ff1b8daa0b275 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:15 -0700 Subject: [PATCH 110/254] nvme_fc: get rid of local reconnect_delay Remove the local copy of reconnect_delay. Use the value in the controller options directly. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index dca7165fabcf..c3ab1043efbd 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -165,7 +165,6 @@ struct nvme_fc_ctrl { struct work_struct delete_work; struct work_struct reset_work; struct delayed_work connect_work; - int reconnect_delay; int connect_attempts; struct kref ref; @@ -2615,9 +2614,9 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->reconnect_delay); + ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, - ctrl->reconnect_delay * HZ); + ctrl->ctrl.opts->reconnect_delay * HZ); } else dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); @@ -2695,9 +2694,9 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->reconnect_delay); + ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, - ctrl->reconnect_delay * HZ); + ctrl->ctrl.opts->reconnect_delay * HZ); } else dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller reconnect complete\n", @@ -2755,7 +2754,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); - ctrl->reconnect_delay = opts->reconnect_delay; spin_lock_init(&ctrl->lock); /* io queue count */ From 5bbecdbc8e7ffaaf47ac1f02014bf3bedda3fd11 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:16 -0700 Subject: [PATCH 111/254] nvme_fc: Support ctrl_loss_tmo Sync with Sagi's recent addition of ctrl_loss_tmo in the core fabrics layer. Remove local connect limits and connect_attempts variable. Use fabrics new nr_connects variable and use of nvmf_should_reconnect() Refactor duplicate reconnect failure code. Addresses review comment by Sagi on controller reset support: http://lists.infradead.org/pipermail/linux-nvme/2017-April/009261.html Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 120 ++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 69 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c3ab1043efbd..a0f05d5e966c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -45,8 +45,6 @@ enum nvme_fc_queue_flags { #define NVMEFC_QUEUE_DELAY 3 /* ms units */ -#define NVME_FC_MAX_CONNECT_ATTEMPTS 1 - struct nvme_fc_queue { struct nvme_fc_ctrl *ctrl; struct device *dev; @@ -165,7 +163,6 @@ struct nvme_fc_ctrl { struct work_struct delete_work; struct work_struct reset_work; struct delayed_work connect_work; - int connect_attempts; struct kref ref; u32 flags; @@ -2305,7 +2302,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) int ret; bool changed; - ctrl->connect_attempts++; + ++ctrl->ctrl.opts->nr_reconnects; /* * Create the admin queue @@ -2402,7 +2399,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - ctrl->connect_attempts = 0; + ctrl->ctrl.opts->nr_reconnects = 0; kref_get(&ctrl->ctrl.kref); @@ -2545,16 +2542,22 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(&ctrl->ctrl); } +static bool +__nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return true; + + if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) + return true; + + return false; +} + static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) { - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; + return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0; } /* @@ -2579,6 +2582,35 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) return ret; } +static void +nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { + WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_LIVE); + return; + } + + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", + ctrl->cnum, status); + + if (nvmf_should_reconnect(&ctrl->ctrl)) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", + ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); + queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, + ctrl->ctrl.opts->reconnect_delay * HZ); + } else { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Max reconnect attempts (%d) " + "reached. Removing controller\n", + ctrl->cnum, ctrl->ctrl.opts->nr_reconnects); + WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); + } +} + static void nvme_fc_reset_ctrl_work(struct work_struct *work) { @@ -2590,34 +2622,9 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) nvme_fc_delete_association(ctrl); ret = nvme_fc_create_association(ctrl); - if (ret) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", - ctrl->cnum, ret); - if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Max reconnect attempts (%d) " - "reached. Removing controller\n", - ctrl->cnum, ctrl->connect_attempts); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, - NVME_CTRL_DELETING)) { - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: failed to change state " - "to DELETING\n", ctrl->cnum); - return; - } - - WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work)); - return; - } - - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, - ctrl->ctrl.opts->reconnect_delay * HZ); - } else + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); } @@ -2670,34 +2677,9 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) struct nvme_fc_ctrl, connect_work); ret = nvme_fc_create_association(ctrl); - if (ret) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt failed (%d)\n", - ctrl->cnum, ret); - if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Max reconnect attempts (%d) " - "reached. Removing controller\n", - ctrl->cnum, ctrl->connect_attempts); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, - NVME_CTRL_DELETING)) { - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: failed to change state " - "to DELETING\n", ctrl->cnum); - return; - } - - WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work)); - return; - } - - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, - ctrl->ctrl.opts->reconnect_delay * HZ); - } else + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller reconnect complete\n", ctrl->cnum); @@ -2969,7 +2951,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) static struct nvmf_transport_ops nvme_fc_transport = { .name = "fc", .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, - .allowed_opts = NVMF_OPT_RECONNECT_DELAY, + .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, .create_ctrl = nvme_fc_create_ctrl, }; From a5321aa5efea05ae748dc5b3e8053584213325ca Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:18 -0700 Subject: [PATCH 112/254] nvme_fc: revise comment on teardown Per the recommendation by Sagi on: http://lists.infradead.org/pipermail/linux-nvme/2017-April/009261.html An extra reference was pointed out. There's no issue with the references, but rather a literal interpretation of what the comment is saying. Reword the comment to avoid confusion. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a0f05d5e966c..0b7f7dd2779a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2532,10 +2532,10 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) /* * tear down the controller - * This will result in the last reference on the nvme ctrl to - * expire, calling the transport nvme_fc_nvme_ctrl_freed() callback. - * From there, the transport will tear down it's logical queues and - * association. + * After the last reference on the nvme ctrl is removed, + * the transport nvme_fc_nvme_ctrl_freed() callback will be + * invoked. From there, the transport will tear down it's + * logical queues and association. */ nvme_uninit_ctrl(&ctrl->ctrl); From 589ff7753bb54edd3ee4a9399ccc3ac48d9b22d7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:19 -0700 Subject: [PATCH 113/254] nvme_fc: set logging level on resets/deletes Per the review by Sagi on: http://lists.infradead.org/pipermail/linux-nvme/2017-April/009261.html Looked at existing warn vs info vs err dev_xxx levels for the messages printed on reconnects and deletes: - Resets due to error and resets transitioned to deletes are dev_warn - Other reset/disconnect messages are dev_info - Removed chatty io queue related messages Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0b7f7dd2779a..e4817f9f4323 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1747,7 +1747,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport association error detected: %s\n", ctrl->cnum, errmsg); - dev_info(ctrl->ctrl.device, + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); /* stop the queues on error, cleanup is in reset thread */ @@ -2191,9 +2191,6 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) if (!opts->nr_io_queues) return 0; - dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", - opts->nr_io_queues); - nvme_fc_init_io_queues(ctrl); memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); @@ -2264,9 +2261,6 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) if (ctrl->queue_count == 1) return 0; - dev_info(ctrl->ctrl.device, "Recreating %d I/O queues.\n", - opts->nr_io_queues); - nvme_fc_init_io_queues(ctrl); ret = blk_mq_reinit_tagset(&ctrl->tag_set); @@ -2592,7 +2586,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) return; } - dev_warn(ctrl->ctrl.device, + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", ctrl->cnum, status); @@ -2603,7 +2597,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { - dev_info(ctrl->ctrl.device, + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.opts->nr_reconnects); @@ -2638,7 +2632,7 @@ nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - dev_warn(ctrl->ctrl.device, + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) From e392e1f1f408fe8baf1046c970d05cbf1f0ec945 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:24 -0700 Subject: [PATCH 114/254] nvme_fc: correct nvme status set on abort correct nvme status set on abort. Patch that changed status to being actual nvme status crossed in the night with the patch that added abort values. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e4817f9f4323..775869c69df6 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1372,9 +1372,9 @@ done: complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); if (!complete_rq) { if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - status = cpu_to_le16(NVME_SC_ABORT_REQ); + status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); if (blk_queue_dying(rq->q)) - status |= cpu_to_le16(NVME_SC_DNR); + status |= cpu_to_le16(NVME_SC_DNR << 1); } nvme_end_request(rq, status, result); } else From 2cb657bc0242dfdca20869685bf179774ef1a6fb Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 15 May 2017 17:10:22 -0700 Subject: [PATCH 115/254] nvme_fc: remove extra controller reference taken on reconnect fix extra controller reference taken on reconnect by moving reference to initial controller create Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 775869c69df6..14a009e43aa5 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2395,8 +2395,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.opts->nr_reconnects = 0; - kref_get(&ctrl->ctrl.kref); - if (ctrl->queue_count > 1) { nvme_start_queues(&ctrl->ctrl); nvme_queue_scan(&ctrl->ctrl); @@ -2793,7 +2791,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = NULL; /* initiate nvme ctrl ref counting teardown */ nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); /* as we're past the point where we transition to the ref * counting teardown path, if we return a bad pointer here, @@ -2809,6 +2806,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, return ERR_PTR(ret); } + kref_get(&ctrl->ctrl.kref); + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", ctrl->cnum, ctrl->ctrl.opts->subsysnqn); From aace34c0bb8ea3c8bdcec865b6a4be4db0a68e33 Mon Sep 17 00:00:00 2001 From: Tin Huynh Date: Mon, 22 May 2017 16:19:20 +0700 Subject: [PATCH 116/254] leds: pca955x: Correct I2C Functionality The driver checks an incorrect flag of functionality of adapter. When a driver requires i2c_smbus_read_byte_data and i2c_smbus_write_byte_data, it should check I2C_FUNC_SMBUS_BYTE_DATA instead I2C_FUNC_I2C. This patch fixes the problem. Signed-off-by: Tin Huynh Signed-off-by: Jacek Anaszewski --- drivers/leds/leds-pca955x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/leds-pca955x.c b/drivers/leds/leds-pca955x.c index 78a7ce816a47..9a873118ea5f 100644 --- a/drivers/leds/leds-pca955x.c +++ b/drivers/leds/leds-pca955x.c @@ -285,7 +285,7 @@ static int pca955x_probe(struct i2c_client *client, "slave address 0x%02x\n", client->name, chip->bits, client->addr); - if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) return -EIO; if (pdata) { From 04dc1b2fff4e96cb4142227fbdc63c8871ad4ed9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 May 2017 17:48:50 +0200 Subject: [PATCH 117/254] futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock() Markus reported that the glibc/nptl/tst-robustpi8 test was failing after commit: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") The following trace shows the problem: ld-linux-x86-64-2161 [019] .... 410.760971: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_LOCK_PI ld-linux-x86-64-2161 [019] ...1 410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0 ld-linux-x86-64-2165 [011] .... 410.760978: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_UNLOCK_PI ld-linux-x86-64-2165 [011] d..1 410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0 ld-linux-x86-64-2165 [011] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000 ld-linux-x86-64-2161 [019] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161 which then returns with -ETIMEDOUT. That wrecks the lock state, because now the owner isn't aware it acquired the lock and removes the pending robust list entry. If 2161 is killed, the robust list will not clear out this futex and the subsequent acquire on this futex will then (correctly) result in -ESRCH which is unexpected by glibc, triggers an internal assertion and dies. Task 2161 Task 2165 rt_mutex_wait_proxy_lock() timeout(); /* T2161 is still queued in the waiter list */ return -ETIMEDOUT; futex_unlock_pi() spin_lock(hb->lock); rtmutex_unlock() remove_rtmutex_waiter(T2161); mark_lock_available(); /* Make the next waiter owner of the user space side */ futex_uval = 2161; spin_unlock(hb->lock); spin_lock(hb->lock); rt_mutex_cleanup_proxy_lock() if (rtmutex_owner() !== current) ... return FAIL; .... return -ETIMEOUT; This means that rt_mutex_cleanup_proxy_lock() needs to call try_to_take_rt_mutex() so it can take over the rtmutex correctly which was assigned by the waker. If the rtmutex is owned by some other task then this call is harmless and just confirmes that the waiter is not able to acquire it. While there, fix what looks like a merge error which resulted in rt_mutex_cleanup_proxy_lock() having two calls to fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any. Both should have one, since both potentially touch the waiter list. Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()") Reported-by: Markus Trippelsdorf Bug-Spotted-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Florian Weimer Cc: Darren Hart Cc: Sebastian Andrzej Siewior Cc: Markus Trippelsdorf Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b95509416909..28cd09e635ed 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1785,12 +1785,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, int ret; raw_spin_lock_irq(&lock->wait_lock); - - set_current_state(TASK_INTERRUPTIBLE); - /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1821,16 +1823,26 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, bool cleanup = false; raw_spin_lock_irq(&lock->wait_lock); + /* + * Do an unconditional try-lock, this deals with the lock stealing + * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() + * sets a NULL owner. + * + * We're not interested in the return value, because the subsequent + * test on rt_mutex_owner() will infer that. If the trylock succeeded, + * we will own the lock and it will have removed the waiter. If we + * failed the trylock, we're still not owner and we need to remove + * ourselves. + */ + try_to_take_rt_mutex(lock, current, waiter); /* * Unless we're the owner; we're still enqueued on the wait_list. * So check if we became owner, if not, take us off the wait_list. */ if (rt_mutex_owner(lock) != current) { remove_waiter(lock, waiter); - fixup_rt_mutex_waiters(lock); cleanup = true; } - /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. From 4d6501dce079c1eb6bf0b1d8f528a5e81770109e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 9 May 2017 09:39:59 +0200 Subject: [PATCH 118/254] kthread: Fix use-after-free if kthread fork fails If a kthread forks (e.g. usermodehelper since commit 1da5c46fa965) but fails in copy_process() between calling dup_task_struct() and setting p->set_child_tid, then the value of p->set_child_tid will be inherited from the parent and get prematurely freed by free_kthread_struct(). kthread() - worker_thread() - process_one_work() | - call_usermodehelper_exec_work() | - kernel_thread() | - _do_fork() | - copy_process() | - dup_task_struct() | - arch_dup_task_struct() | - tsk->set_child_tid = current->set_child_tid // implied | - ... | - goto bad_fork_* | - ... | - free_task(tsk) | - free_kthread_struct(tsk) | - kfree(tsk->set_child_tid) - ... - schedule() - __schedule() - wq_worker_sleeping() - kthread_data(task)->flags // UAF The problem started showing up with commit 1da5c46fa965 since it reused ->set_child_tid for the kthread worker data. A better long-term solution might be to get rid of the ->set_child_tid abuse. The comment in set_kthread_struct() also looks slightly wrong. Debugged-by: Jamie Iles Fixes: 1da5c46fa965 ("kthread: Make struct kthread kmalloc'ed") Signed-off-by: Vegard Nossum Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: Jamie Iles Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170509073959.17858-1-vegard.nossum@oracle.com Signed-off-by: Thomas Gleixner --- kernel/fork.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index d681f8f10d2d..b7cdea10239c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1553,6 +1553,18 @@ static __latent_entropy struct task_struct *copy_process( if (!p) goto fork_out; + /* + * This _must_ happen before we call free_task(), i.e. before we jump + * to any of the bad_fork_* labels. This is to avoid freeing + * p->set_child_tid which is (ab)used as a kthread's data pointer for + * kernel threads (PF_KTHREAD). + */ + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + ftrace_graph_init_task(p); rt_mutex_init_task(p); @@ -1716,11 +1728,6 @@ static __latent_entropy struct task_struct *copy_process( } } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? - */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif From 5b81fc3cc625e857275573cb4240bbab553f919c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 17 May 2017 13:07:24 -0700 Subject: [PATCH 119/254] blk-throttle: add hierarchy support for latency target and idle time For idle time, children's setting should not be bigger than parent's. For latency target, children's setting should not be smaller than parent's. The leaf nodes will adjust their settings according to the hierarchy and compare their IO with the settings and do upgrade/downgrade. parents nodes don't need to track their IO latency/idle time. Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 50 +++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b78db2e5fdff..16174f8cb0a1 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -157,6 +157,7 @@ struct throtl_grp { unsigned long last_check_time; unsigned long latency_target; /* us */ + unsigned long latency_target_conf; /* us */ /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; @@ -165,6 +166,7 @@ struct throtl_grp { unsigned long checked_last_finish_time; /* ns / 1024 */ unsigned long avg_idletime; /* ns / 1024 */ unsigned long idletime_threshold; /* us */ + unsigned long idletime_threshold_conf; /* us */ unsigned int bio_cnt; /* total bios */ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ @@ -482,6 +484,7 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) /* LIMIT_LOW will have default value 0 */ tg->latency_target = DFL_LATENCY_TARGET; + tg->latency_target_conf = DFL_LATENCY_TARGET; return &tg->pd; } @@ -512,6 +515,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd) tg->td = td; tg->idletime_threshold = td->dft_idletime_threshold; + tg->idletime_threshold_conf = td->dft_idletime_threshold; } /* @@ -1367,8 +1371,25 @@ static void tg_conf_updated(struct throtl_grp *tg) * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) - tg_update_has_rules(blkg_to_tg(blkg)); + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) { + struct throtl_grp *this_tg = blkg_to_tg(blkg); + struct throtl_grp *parent_tg; + + tg_update_has_rules(this_tg); + /* ignore root/second level */ + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || + !blkg->parent->parent) + continue; + parent_tg = blkg_to_tg(blkg->parent); + /* + * make sure all children has lower idle time threshold and + * higher latency target + */ + this_tg->idletime_threshold = min(this_tg->idletime_threshold, + parent_tg->idletime_threshold); + this_tg->latency_target = max(this_tg->latency_target, + parent_tg->latency_target); + } /* * We're already holding queue_lock and know @tg is valid. Let's @@ -1497,8 +1518,8 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, tg->iops_conf[READ][off] == iops_dft && tg->iops_conf[WRITE][off] == iops_dft && (off != LIMIT_LOW || - (tg->idletime_threshold == tg->td->dft_idletime_threshold && - tg->latency_target == DFL_LATENCY_TARGET))) + (tg->idletime_threshold_conf == tg->td->dft_idletime_threshold && + tg->latency_target_conf == DFL_LATENCY_TARGET))) return 0; if (tg->bps_conf[READ][off] != bps_dft) @@ -1514,17 +1535,17 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops_conf[WRITE][off]); if (off == LIMIT_LOW) { - if (tg->idletime_threshold == ULONG_MAX) + if (tg->idletime_threshold_conf == ULONG_MAX) strcpy(idle_time, " idle=max"); else snprintf(idle_time, sizeof(idle_time), " idle=%lu", - tg->idletime_threshold); + tg->idletime_threshold_conf); - if (tg->latency_target == ULONG_MAX) + if (tg->latency_target_conf == ULONG_MAX) strcpy(latency_time, " latency=max"); else snprintf(latency_time, sizeof(latency_time), - " latency=%lu", tg->latency_target); + " latency=%lu", tg->latency_target_conf); } seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", @@ -1563,8 +1584,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, v[2] = tg->iops_conf[READ][index]; v[3] = tg->iops_conf[WRITE][index]; - idle_time = tg->idletime_threshold; - latency_time = tg->latency_target; + idle_time = tg->idletime_threshold_conf; + latency_time = tg->latency_target_conf; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; @@ -1628,10 +1649,10 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, blk_throtl_update_limit_valid(tg->td); if (tg->td->limit_valid[LIMIT_LOW]) tg->td->limit_index = LIMIT_LOW; - tg->idletime_threshold = (idle_time == ULONG_MAX) ? - ULONG_MAX : idle_time; - tg->latency_target = (latency_time == ULONG_MAX) ? - ULONG_MAX : latency_time; + tg->idletime_threshold_conf = idle_time; + tg->idletime_threshold = tg->idletime_threshold_conf; + tg->latency_target_conf = latency_time; + tg->latency_target = tg->latency_target_conf; } tg_conf_updated(tg); ret = 0; @@ -2385,6 +2406,7 @@ void blk_throtl_register_queue(struct request_queue *q) struct throtl_grp *tg = blkg_to_tg(blkg); tg->idletime_threshold = td->dft_idletime_threshold; + tg->idletime_threshold_conf = td->dft_idletime_threshold; } rcu_read_unlock(); } From 4cff729f62d1bd433178f1ffe09db5718835e925 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 17 May 2017 13:07:25 -0700 Subject: [PATCH 120/254] blk-throttle: output some debug info in trace These info are important to understand what's happening and help debug. Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 16174f8cb0a1..1f8d62f5e808 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1748,12 +1748,18 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) * - IO latency is largely below threshold */ unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); + bool ret; time = min_t(unsigned long, MAX_IDLE_TIME, time); - return (ktime_get_ns() >> 10) - tg->last_finish_time > time || + ret = (ktime_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); + throtl_log(&tg->service_queue, + "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", + tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, + tg->bio_cnt, ret, tg->td->scale); + return ret; } static bool throtl_tg_can_upgrade(struct throtl_grp *tg) @@ -1849,6 +1855,7 @@ static void throtl_upgrade_state(struct throtl_data *td) struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; + throtl_log(&td->service_queue, "upgrade to max"); td->limit_index = LIMIT_MAX; td->low_upgrade_time = jiffies; td->scale = 0; @@ -1871,6 +1878,7 @@ static void throtl_downgrade_state(struct throtl_data *td, int new) { td->scale /= 2; + throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); if (td->scale) { td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; return; @@ -2044,6 +2052,11 @@ static void throtl_update_latency_buckets(struct throtl_data *td) td->avg_buckets[i].valid = true; last_latency = td->avg_buckets[i].latency; } + + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) + throtl_log(&td->service_queue, + "Latency bucket %d: latency=%ld, valid=%d", i, + td->avg_buckets[i].latency, td->avg_buckets[i].valid); } #else static inline void throtl_update_latency_buckets(struct throtl_data *td) From 9bb67aeb96784527dbc784c7a1b234461299363c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 17 May 2017 13:07:26 -0700 Subject: [PATCH 121/254] blk-throttle: respect 0 bps/iops settings for io.low If a cgroup with low limit 0 for both bps/iops, the cgroup's low limit is ignored and we throttle the cgroup with its max limit. In this way, other cgroups with a low limit will not get protected. To fix this, we don't do the exception any more. cgroup will be throttled to a limit 0 if it uese default setting. To avoid completed stall, we give such cgroup tiny IO resources. Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1f8d62f5e808..f6a9f42a0ad7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -27,6 +27,8 @@ static int throtl_quantum = 32; #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ /* default latency target is 0, eg, guarantee IO latency by default */ #define DFL_LATENCY_TARGET (0) +#define MIN_THROTL_BPS (320 * 1024) +#define MIN_THROTL_IOPS (10) #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) @@ -296,8 +298,14 @@ static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) td = tg->td; ret = tg->bps[rw][td->limit_index]; - if (ret == 0 && td->limit_index == LIMIT_LOW) - return tg->bps[rw][LIMIT_MAX]; + if (ret == 0 && td->limit_index == LIMIT_LOW) { + /* intermediate node or iops isn't 0 */ + if (!list_empty(&blkg->blkcg->css.children) || + tg->iops[rw][td->limit_index]) + return U64_MAX; + else + return MIN_THROTL_BPS; + } if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { @@ -317,10 +325,17 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; + td = tg->td; ret = tg->iops[rw][td->limit_index]; - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) - return tg->iops[rw][LIMIT_MAX]; + if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { + /* intermediate node or bps isn't 0 */ + if (!list_empty(&blkg->blkcg->css.children) || + tg->bps[rw][td->limit_index]) + return UINT_MAX; + else + return MIN_THROTL_IOPS; + } if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { @@ -1353,7 +1368,7 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static void tg_conf_updated(struct throtl_grp *tg) +static void tg_conf_updated(struct throtl_grp *tg, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; @@ -1371,7 +1386,8 @@ static void tg_conf_updated(struct throtl_grp *tg) * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) { + blkg_for_each_descendant_pre(blkg, pos_css, + global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); struct throtl_grp *parent_tg; @@ -1434,7 +1450,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; - tg_conf_updated(tg); + tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_finish(&ctx); @@ -1522,16 +1538,16 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, tg->latency_target_conf == DFL_LATENCY_TARGET))) return 0; - if (tg->bps_conf[READ][off] != bps_dft) + if (tg->bps_conf[READ][off] != U64_MAX) snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps_conf[READ][off]); - if (tg->bps_conf[WRITE][off] != bps_dft) + if (tg->bps_conf[WRITE][off] != U64_MAX) snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps_conf[WRITE][off]); - if (tg->iops_conf[READ][off] != iops_dft) + if (tg->iops_conf[READ][off] != UINT_MAX) snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops_conf[READ][off]); - if (tg->iops_conf[WRITE][off] != iops_dft) + if (tg->iops_conf[WRITE][off] != UINT_MAX) snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops_conf[WRITE][off]); if (off == LIMIT_LOW) { @@ -1654,7 +1670,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->latency_target_conf = latency_time; tg->latency_target = tg->latency_target_conf; } - tg_conf_updated(tg); + tg_conf_updated(tg, index == LIMIT_LOW && + tg->td->limit_valid[LIMIT_LOW]); ret = 0; out_finish: blkg_conf_finish(&ctx); From b4f428ef2844e9fa8154f2faaca249aa74e222a7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 17 May 2017 13:07:27 -0700 Subject: [PATCH 122/254] blk-throttle: force user to configure all settings for io.low Default value of io.low limit is 0. If user doesn't configure the limit, last patch makes cgroup be throttled to very tiny bps/iops, which could stall the system. A cgroup with default settings of io.low limit really means nothing, so we force user to configure all settings, otherwise io.low limit doesn't take effect. With this stragety, default setting of latency/idle isn't important, so just set them to very conservative and safe value. Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 78 ++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f6a9f42a0ad7..fc13dd0c6e39 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -22,13 +22,11 @@ static int throtl_quantum = 32; #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) -#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */ -#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */ #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ -/* default latency target is 0, eg, guarantee IO latency by default */ -#define DFL_LATENCY_TARGET (0) #define MIN_THROTL_BPS (320 * 1024) #define MIN_THROTL_IOPS (10) +#define DFL_LATENCY_TARGET (-1L) +#define DFL_IDLE_THRESHOLD (0) #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) @@ -205,8 +203,6 @@ struct throtl_data unsigned int limit_index; bool limit_valid[LIMIT_CNT]; - unsigned long dft_idletime_threshold; /* us */ - unsigned long low_upgrade_time; unsigned long low_downgrade_time; @@ -500,6 +496,8 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->latency_target = DFL_LATENCY_TARGET; tg->latency_target_conf = DFL_LATENCY_TARGET; + tg->idletime_threshold = DFL_IDLE_THRESHOLD; + tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; return &tg->pd; } @@ -528,9 +526,6 @@ static void throtl_pd_init(struct blkg_policy_data *pd) if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->idletime_threshold = td->dft_idletime_threshold; - tg->idletime_threshold_conf = td->dft_idletime_threshold; } /* @@ -1534,7 +1529,7 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, tg->iops_conf[READ][off] == iops_dft && tg->iops_conf[WRITE][off] == iops_dft && (off != LIMIT_LOW || - (tg->idletime_threshold_conf == tg->td->dft_idletime_threshold && + (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && tg->latency_target_conf == DFL_LATENCY_TARGET))) return 0; @@ -1660,16 +1655,31 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->iops_conf[READ][LIMIT_MAX]); tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], tg->iops_conf[WRITE][LIMIT_MAX]); + tg->idletime_threshold_conf = idle_time; + tg->latency_target_conf = latency_time; - if (index == LIMIT_LOW) { - blk_throtl_update_limit_valid(tg->td); - if (tg->td->limit_valid[LIMIT_LOW]) - tg->td->limit_index = LIMIT_LOW; - tg->idletime_threshold_conf = idle_time; + /* force user to configure all settings for low limit */ + if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || + tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || + tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || + tg->latency_target_conf == DFL_LATENCY_TARGET) { + tg->bps[READ][LIMIT_LOW] = 0; + tg->bps[WRITE][LIMIT_LOW] = 0; + tg->iops[READ][LIMIT_LOW] = 0; + tg->iops[WRITE][LIMIT_LOW] = 0; + tg->idletime_threshold = DFL_IDLE_THRESHOLD; + tg->latency_target = DFL_LATENCY_TARGET; + } else if (index == LIMIT_LOW) { tg->idletime_threshold = tg->idletime_threshold_conf; - tg->latency_target_conf = latency_time; tg->latency_target = tg->latency_target_conf; } + + blk_throtl_update_limit_valid(tg->td); + if (tg->td->limit_valid[LIMIT_LOW]) { + if (index == LIMIT_LOW) + tg->td->limit_index = LIMIT_LOW; + } else + tg->td->limit_index = LIMIT_MAX; tg_conf_updated(tg, index == LIMIT_LOW && tg->td->limit_valid[LIMIT_LOW]); ret = 0; @@ -1760,17 +1770,19 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) /* * cgroup is idle if: * - single idle is too long, longer than a fixed value (in case user - * configure a too big threshold) or 4 times of slice + * configure a too big threshold) or 4 times of idletime threshold * - average think time is more than threshold * - IO latency is largely below threshold */ - unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); + unsigned long time; bool ret; - time = min_t(unsigned long, MAX_IDLE_TIME, time); - ret = (ktime_get_ns() >> 10) - tg->last_finish_time > time || - tg->avg_idletime > tg->idletime_threshold || - (tg->latency_target && tg->bio_cnt && + time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); + ret = tg->latency_target == DFL_LATENCY_TARGET || + tg->idletime_threshold == DFL_IDLE_THRESHOLD || + (ktime_get_ns() >> 10) - tg->last_finish_time > time || + tg->avg_idletime > tg->idletime_threshold || + (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); throtl_log(&tg->service_queue, "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", @@ -2405,19 +2417,14 @@ void blk_throtl_exit(struct request_queue *q) void blk_throtl_register_queue(struct request_queue *q) { struct throtl_data *td; - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; td = q->td; BUG_ON(!td); - if (blk_queue_nonrot(q)) { + if (blk_queue_nonrot(q)) td->throtl_slice = DFL_THROTL_SLICE_SSD; - td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD; - } else { + else td->throtl_slice = DFL_THROTL_SLICE_HD; - td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD; - } #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ td->throtl_slice = DFL_THROTL_SLICE_HD; @@ -2426,19 +2433,6 @@ void blk_throtl_register_queue(struct request_queue *q) td->track_bio_latency = !q->mq_ops && !q->request_fn; if (!td->track_bio_latency) blk_stat_enable_accounting(q); - - /* - * some tg are created before queue is fully initialized, eg, nonrot - * isn't initialized yet - */ - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - tg->idletime_threshold = td->dft_idletime_threshold; - tg->idletime_threshold_conf = td->dft_idletime_threshold; - } - rcu_read_unlock(); } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW From c849e55178f559c4bbed43efb113cb7602aade89 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 May 2017 19:21:08 +0200 Subject: [PATCH 123/254] PCI: endpoint: Make PCI_ENDPOINT depend on HAS_DMA If NO_DMA=y: drivers/built-in.o: In function `__pci_epc_create': (.text+0xef4e): undefined reference to `bad_dma_ops' drivers/built-in.o: In function `pci_epc_add_epf': (.text+0xf676): undefined reference to `bad_dma_ops' drivers/built-in.o: In function `pci_epf_alloc_space': (.text+0xfa32): undefined reference to `bad_dma_ops' drivers/built-in.o: In function `pci_epf_free_space': (.text+0xfac4): undefined reference to `bad_dma_ops' Add a dependency on HAS_DMA to fix this. Signed-off-by: Geert Uytterhoeven Signed-off-by: Bjorn Helgaas --- drivers/pci/endpoint/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/endpoint/Kconfig b/drivers/pci/endpoint/Kconfig index c23f146fb5a6..c09623ca8c3b 100644 --- a/drivers/pci/endpoint/Kconfig +++ b/drivers/pci/endpoint/Kconfig @@ -6,6 +6,7 @@ menu "PCI Endpoint" config PCI_ENDPOINT bool "PCI Endpoint Support" + depends on HAS_DMA help Enable this configuration option to support configurable PCI endpoint. This should be enabled if the platform has a PCI From e40cf640b8f632091a30ef0b030c83546f07c902 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 22 May 2017 16:52:24 -0500 Subject: [PATCH 124/254] switchtec: Use new cdev_device_add() helper function Convert from "cdev_add() + device_add()" to cdev_device_add(), and from "device_del() + cdev_del()" to cdev_device_del(). [bhelgaas: changelog] Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index cc6e085008fb..abaa227a5f34 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -1291,7 +1291,6 @@ static struct switchtec_dev *stdev_create(struct pci_dev *pdev) cdev = &stdev->cdev; cdev_init(cdev, &switchtec_fops); cdev->owner = THIS_MODULE; - cdev->kobj.parent = &dev->kobj; return stdev; @@ -1479,11 +1478,7 @@ static int switchtec_pci_probe(struct pci_dev *pdev, SWITCHTEC_EVENT_EN_IRQ, &stdev->mmio_part_cfg->mrpc_comp_hdr); - rc = cdev_add(&stdev->cdev, stdev->dev.devt, 1); - if (rc) - goto err_put; - - rc = device_add(&stdev->dev); + rc = cdev_device_add(&stdev->cdev, &stdev->dev); if (rc) goto err_devadd; @@ -1492,7 +1487,6 @@ static int switchtec_pci_probe(struct pci_dev *pdev, return 0; err_devadd: - cdev_del(&stdev->cdev); stdev_kill(stdev); err_put: ida_simple_remove(&switchtec_minor_ida, MINOR(stdev->dev.devt)); @@ -1506,8 +1500,7 @@ static void switchtec_pci_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); - device_del(&stdev->dev); - cdev_del(&stdev->cdev); + cdev_device_del(&stdev->cdev, &stdev->dev); ida_simple_remove(&switchtec_minor_ida, MINOR(stdev->dev.devt)); dev_info(&stdev->dev, "unregistered.\n"); From 9871e9bb5cf6ff0b51457ca74c270c5c5230b224 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 22 May 2017 16:52:30 -0500 Subject: [PATCH 125/254] switchtec: Fix minor bug with partition ID register When a switch endpoint is configured without NTB, the mmio_ntb registers will read all zeros. However, in corner case configurations where the partition ID is not zero and NTB is not enabled, the code will have the wrong partition ID and this causes the driver to use the wrong set of drivers. To fix this we simply take the partition ID from the system info region. Reported-by: Dingbao Chen Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index abaa227a5f34..f6a63406c76e 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -1441,12 +1441,15 @@ static int switchtec_init_pci(struct switchtec_dev *stdev, stdev->mmio_sys_info = stdev->mmio + SWITCHTEC_GAS_SYS_INFO_OFFSET; stdev->mmio_flash_info = stdev->mmio + SWITCHTEC_GAS_FLASH_INFO_OFFSET; stdev->mmio_ntb = stdev->mmio + SWITCHTEC_GAS_NTB_OFFSET; - stdev->partition = ioread8(&stdev->mmio_ntb->partition_id); + stdev->partition = ioread8(&stdev->mmio_sys_info->partition_id); stdev->partition_count = ioread8(&stdev->mmio_ntb->partition_count); stdev->mmio_part_cfg_all = stdev->mmio + SWITCHTEC_GAS_PART_CFG_OFFSET; stdev->mmio_part_cfg = &stdev->mmio_part_cfg_all[stdev->partition]; stdev->mmio_pff_csr = stdev->mmio + SWITCHTEC_GAS_PFF_CSR_OFFSET; + if (stdev->partition_count < 1) + stdev->partition_count = 1; + init_pff(stdev); pci_set_drvdata(pdev, stdev); From 415b6185c541dc0a21457ff307cdb61950a6eb9f Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 22 May 2017 17:06:30 -0500 Subject: [PATCH 126/254] PCI: imx6: Fix config read timeout handling Commit cc7b0d495589 ("PCI: designware: Update PCI config space remap function") made PCI configuration requests non-posted, which means we now get a synchronous abort when the CFG space read to probe for downstream devices times out. Synchronous aborts need to be handled differently from the async aborts we were getting before, in particular the PC needs to be advanced when resolving the abort. This is mostly a copy of what other PCI drivers do on ARM to handle those aborts. [bhelgaas: changelog, "Fixes"] Fixes: cc7b0d495589 ("PCI: designware: Update PCI config space remap function") Tested-by: Fabio Estevam Tested-by: Peter Senna Tschudin Signed-off-by: Lucas Stach Signed-off-by: Bjorn Helgaas Acked-by: Richard Zhu --- drivers/pci/dwc/pci-imx6.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/drivers/pci/dwc/pci-imx6.c b/drivers/pci/dwc/pci-imx6.c index a98cba55c7f0..19a289b8cc94 100644 --- a/drivers/pci/dwc/pci-imx6.c +++ b/drivers/pci/dwc/pci-imx6.c @@ -252,7 +252,34 @@ static void imx6_pcie_reset_phy(struct imx6_pcie *imx6_pcie) static int imx6q_pcie_abort_handler(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { - return 0; + unsigned long pc = instruction_pointer(regs); + unsigned long instr = *(unsigned long *)pc; + int reg = (instr >> 12) & 15; + + /* + * If the instruction being executed was a read, + * make it look like it read all-ones. + */ + if ((instr & 0x0c100000) == 0x04100000) { + unsigned long val; + + if (instr & 0x00400000) + val = 255; + else + val = -1; + + regs->uregs[reg] = val; + regs->ARM_pc += 4; + return 0; + } + + if ((instr & 0x0e100090) == 0x00100090) { + regs->uregs[reg] = -1; + regs->ARM_pc += 4; + return 0; + } + + return 1; } static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) @@ -819,8 +846,8 @@ static int __init imx6_pcie_init(void) * we can install the handler here without risking it * accessing some uninitialized driver state. */ - hook_fault_code(16 + 6, imx6q_pcie_abort_handler, SIGBUS, 0, - "imprecise external abort"); + hook_fault_code(8, imx6q_pcie_abort_handler, SIGBUS, 0, + "external abort on non-linefetch"); return platform_driver_register(&imx6_pcie_driver); } From c10e8031d5b34cde06b039ca2e8af87a33d5ba11 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 May 2017 13:07:49 -0700 Subject: [PATCH 127/254] efi-pstore: Fix write/erase id tracking Prior to the pstore interface refactoring, the "id" generated during a backend pstore_write() was only retained by the internal pstore inode tracking list. Additionally the "part" was ignored, so EFI would encode this in the id. This corrects the misunderstandings and correctly sets "id" during pstore_write(), and uses "part" directly during pstore_erase(). Reported-by: Marta Lofstedt Fixes: 76cc9580e3fb ("pstore: Replace arguments for write() API") Fixes: a61072aae693 ("pstore: Replace arguments for erase() API") Signed-off-by: Kees Cook Tested-by: Marta Lofstedt --- drivers/firmware/efi/efi-pstore.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 44148fd4c9f2..dda2e96328c0 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -53,6 +53,7 @@ static int efi_pstore_read_func(struct efivar_entry *entry, if (sscanf(name, "dump-type%u-%u-%d-%lu-%c", &record->type, &part, &cnt, &time, &data_type) == 5) { record->id = generic_id(time, part, cnt); + record->part = part; record->count = cnt; record->time.tv_sec = time; record->time.tv_nsec = 0; @@ -64,6 +65,7 @@ static int efi_pstore_read_func(struct efivar_entry *entry, } else if (sscanf(name, "dump-type%u-%u-%d-%lu", &record->type, &part, &cnt, &time) == 4) { record->id = generic_id(time, part, cnt); + record->part = part; record->count = cnt; record->time.tv_sec = time; record->time.tv_nsec = 0; @@ -77,6 +79,7 @@ static int efi_pstore_read_func(struct efivar_entry *entry, * multiple logs, remains. */ record->id = generic_id(time, part, 0); + record->part = part; record->count = 0; record->time.tv_sec = time; record->time.tv_nsec = 0; @@ -241,9 +244,15 @@ static int efi_pstore_write(struct pstore_record *record) efi_guid_t vendor = LINUX_EFI_CRASH_GUID; int i, ret = 0; + record->time.tv_sec = get_seconds(); + record->time.tv_nsec = 0; + + record->id = generic_id(record->time.tv_sec, record->part, + record->count); + snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lu-%c", record->type, record->part, record->count, - get_seconds(), record->compressed ? 'C' : 'D'); + record->time.tv_sec, record->compressed ? 'C' : 'D'); for (i = 0; i < DUMP_NAME_LEN; i++) efi_name[i] = name[i]; @@ -255,7 +264,6 @@ static int efi_pstore_write(struct pstore_record *record) if (record->reason == KMSG_DUMP_OOPS) efivar_run_worker(); - record->id = record->part; return ret; }; @@ -287,7 +295,7 @@ static int efi_pstore_erase_func(struct efivar_entry *entry, void *data) * holding multiple logs, remains. */ snprintf(name_old, sizeof(name_old), "dump-type%u-%u-%lu", - ed->record->type, (unsigned int)ed->record->id, + ed->record->type, ed->record->part, ed->record->time.tv_sec); for (i = 0; i < DUMP_NAME_LEN; i++) @@ -320,10 +328,7 @@ static int efi_pstore_erase(struct pstore_record *record) char name[DUMP_NAME_LEN]; efi_char16_t efi_name[DUMP_NAME_LEN]; int found, i; - unsigned int part; - do_div(record->id, 1000); - part = do_div(record->id, 100); snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lu", record->type, record->part, record->count, record->time.tv_sec); From aa3d4409b664813ceb86a24bd09458cdd29cbb8a Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Mon, 22 May 2017 17:19:45 -0700 Subject: [PATCH 128/254] Input: edt-ft5x06 - increase allowed data range for threshold parameter The datasheet and application note does not mention an allowed range for the M09_REGISTER_THRESHOLD parameter. One of our customers needs to set lower values than 20 and they seem to work just fine on EDT EP0xx0M09 with T5x06 touch. So, lacking a known lower limit, we increase the range for thresholds, and set the lower limit to 0. The documentation is updated accordingly. Signed-off-by: Schoefegger Stefan Signed-off-by: Manfred Schlaegl Signed-off-by: Martin Kepplinger Acked-by: Rob Herring Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/touchscreen/edt-ft5x06.txt | 2 +- Documentation/input/devices/edt-ft5x06.rst | 2 +- drivers/input/touchscreen/edt-ft5x06.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt index 6db22103e2dd..025cf8c9324a 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt +++ b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt @@ -36,7 +36,7 @@ Optional properties: control gpios - threshold: allows setting the "click"-threshold in the range - from 20 to 80. + from 0 to 80. - gain: allows setting the sensitivity in the range from 0 to 31. Note that lower values indicate higher diff --git a/Documentation/input/devices/edt-ft5x06.rst b/Documentation/input/devices/edt-ft5x06.rst index 2032f0b7a8fa..1ccc94b192b7 100644 --- a/Documentation/input/devices/edt-ft5x06.rst +++ b/Documentation/input/devices/edt-ft5x06.rst @@ -15,7 +15,7 @@ It has been tested with the following devices: The driver allows configuration of the touch screen via a set of sysfs files: /sys/class/input/eventX/device/device/threshold: - allows setting the "click"-threshold in the range from 20 to 80. + allows setting the "click"-threshold in the range from 0 to 80. /sys/class/input/eventX/device/device/gain: allows setting the sensitivity in the range from 0 to 31. Note that diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c index 8cf8d8d5d4ef..f872817e81e4 100644 --- a/drivers/input/touchscreen/edt-ft5x06.c +++ b/drivers/input/touchscreen/edt-ft5x06.c @@ -471,7 +471,7 @@ static EDT_ATTR(gain, S_IWUSR | S_IRUGO, WORK_REGISTER_GAIN, static EDT_ATTR(offset, S_IWUSR | S_IRUGO, WORK_REGISTER_OFFSET, M09_REGISTER_OFFSET, 0, 31); static EDT_ATTR(threshold, S_IWUSR | S_IRUGO, WORK_REGISTER_THRESHOLD, - M09_REGISTER_THRESHOLD, 20, 80); + M09_REGISTER_THRESHOLD, 0, 80); static EDT_ATTR(report_rate, S_IWUSR | S_IRUGO, WORK_REGISTER_REPORT_RATE, NO_REGISTER, 3, 14); From 089b50d95948f691589cca4d81f1f8761747dbaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20Roussin-B=C3=A9langer?= Date: Fri, 19 May 2017 14:59:03 -0700 Subject: [PATCH 129/254] Input: atmel_mxt_ts - add T100 as a readable object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using the 'object' sysfs attribute, T100 is not displayed in the output. Signed-off-by: Maxime Roussin-Bélanger Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/atmel_mxt_ts.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 2302aef2b2d4..dd042a9b0aaa 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -350,6 +350,7 @@ static bool mxt_object_readable(unsigned int type) case MXT_TOUCH_KEYARRAY_T15: case MXT_TOUCH_PROXIMITY_T23: case MXT_TOUCH_PROXKEY_T52: + case MXT_TOUCH_MULTITOUCHSCREEN_T100: case MXT_PROCI_GRIPFACE_T20: case MXT_PROCG_NOISE_T22: case MXT_PROCI_ONETOUCH_T24: From ad258fb918dae8b1ec79a85b4c7f518e4f902869 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 22 May 2017 07:46:55 +0200 Subject: [PATCH 130/254] i2c: designware: Fix bogus sda_hold_time due to uninitialized vars We need to initializes those variables to 0 for platforms that do not provide ACPI parameters. Otherwise, we set sda_hold_time to random values, breaking e.g. Galileo and IOT2000 boards. Reported-and-tested-by: Linus Torvalds Reported-by: Tobias Klausmann Fixes: 9d6408433019 ("i2c: designware: don't infer timings described by ACPI from clock rate") Signed-off-by: Jan Kiszka Reviewed-by: Ard Biesheuvel Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang Signed-off-by: Linus Torvalds --- drivers/i2c/busses/i2c-designware-platdrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 6283b99d2b17..d1263b82d646 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -94,9 +94,9 @@ static void dw_i2c_acpi_params(struct platform_device *pdev, char method[], static int dw_i2c_acpi_configure(struct platform_device *pdev) { struct dw_i2c_dev *dev = platform_get_drvdata(pdev); + u32 ss_ht = 0, fp_ht = 0, hs_ht = 0, fs_ht = 0; acpi_handle handle = ACPI_HANDLE(&pdev->dev); const struct acpi_device_id *id; - u32 ss_ht, fp_ht, hs_ht, fs_ht; struct acpi_device *adev; const char *uid; From 1fc2e41f7af4572b07190f9dec28396b418e9a36 Mon Sep 17 00:00:00 2001 From: Alexander Tsoy Date: Mon, 22 May 2017 20:58:11 +0300 Subject: [PATCH 131/254] ALSA: hda - apply STAC_9200_DELL_M22 quirk for Dell Latitude D430 This model is actually called 92XXM2-8 in Windows driver. But since pin configs for M22 and M28 are identical, just reuse M22 quirk. Fixes external microphone (tested) and probably docking station ports (not tested). Signed-off-by: Alexander Tsoy Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_sigmatel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index faa3d38bac0b..6cefdf6c0b75 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1559,6 +1559,8 @@ static const struct snd_pci_quirk stac9200_fixup_tbl[] = { "Dell Inspiron 1501", STAC_9200_DELL_M26), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x01f6, "unknown Dell", STAC_9200_DELL_M26), + SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0201, + "Dell Latitude D430", STAC_9200_DELL_M22), /* Panasonic */ SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-74", STAC_9200_PANASONIC), /* Gateway machines needs EAPD to be set on resume */ From 429030bc944ee9a8bbe5d9bb23dcda0ae2205450 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 19 May 2017 14:58:19 -0300 Subject: [PATCH 132/254] drm: qxl: Delay entering atomic context during cursor update qxl_release_map will enter an atomic context, but since we still need to alloc memory for BOs, we better delay that until we have everything we need, in case we need to sleep inside the allocation. This avoids the Sleep in atomic state below, which was reported by Mike. [ 43.910362] BUG: sleeping function called from invalid context at mm/slab.h:432 [ 43.910955] in_atomic(): 1, irqs_disabled(): 0, pid: 2077, name: Xorg [ 43.911472] Preemption disabled at: [ 43.911478] [] qxl_bo_kmap_atomic_page+0xa5/0x100 [qxl] [ 43.912103] CPU: 0 PID: 2077 Comm: Xorg Tainted: G E 4.12.0-master #38 [ 43.912550] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20161202_174313-build11a 04/01/2014 [ 43.913202] Call Trace: [ 43.913371] dump_stack+0x65/0x89 [ 43.913581] ? qxl_bo_kmap_atomic_page+0xa5/0x100 [qxl] [ 43.913876] ___might_sleep+0x11a/0x190 [ 43.914095] __might_sleep+0x4a/0x80 [ 43.914319] ? qxl_bo_create+0x50/0x190 [qxl] [ 43.914565] kmem_cache_alloc_trace+0x46/0x180 [ 43.914836] qxl_bo_create+0x50/0x190 [qxl] [ 43.915082] ? refcount_dec_and_test+0x11/0x20 [ 43.915332] ? ttm_mem_io_reserve+0x41/0xe0 [ttm] [ 43.915595] qxl_alloc_bo_reserved+0x37/0xb0 [qxl] [ 43.915884] qxl_cursor_atomic_update+0x8f/0x260 [qxl] [ 43.916172] ? drm_atomic_helper_update_legacy_modeset_state+0x1d6/0x210 [drm_kms_helper] [ 43.916623] drm_atomic_helper_commit_planes+0xec/0x230 [drm_kms_helper] [ 43.916995] drm_atomic_helper_commit_tail+0x2b/0x60 [drm_kms_helper] [ 43.917398] commit_tail+0x65/0x70 [drm_kms_helper] [ 43.917693] drm_atomic_helper_commit+0xa9/0x100 [drm_kms_helper] [ 43.918039] drm_atomic_commit+0x4b/0x50 [drm] [ 43.918334] drm_atomic_helper_update_plane+0xf1/0x110 [drm_kms_helper] [ 43.918902] __setplane_internal+0x19f/0x280 [drm] [ 43.919240] drm_mode_cursor_universal+0x101/0x1c0 [drm] [ 43.919541] drm_mode_cursor_common+0x15b/0x1d0 [drm] [ 43.919858] drm_mode_cursor2_ioctl+0xe/0x10 [drm] [ 43.920157] drm_ioctl+0x211/0x460 [drm] [ 43.920383] ? drm_mode_cursor_ioctl+0x50/0x50 [drm] [ 43.920664] ? handle_mm_fault+0x93/0x160 [ 43.920893] do_vfs_ioctl+0x96/0x6e0 [ 43.921117] ? __fget+0x73/0xa0 [ 43.921322] SyS_ioctl+0x41/0x70 [ 43.921545] entry_SYSCALL_64_fastpath+0x1a/0xa5 [ 43.922188] RIP: 0033:0x7f1145804bc7 [ 43.922526] RSP: 002b:00007ffcd3e50508 EFLAGS: 00003246 ORIG_RAX: 0000000000000010 [ 43.923367] RAX: ffffffffffffffda RBX: 0000000000000040 RCX: 00007f1145804bc7 [ 43.923852] RDX: 00007ffcd3e50540 RSI: 00000000c02464bb RDI: 000000000000000b [ 43.924299] RBP: 0000000000000040 R08: 0000000000000040 R09: 000000000000000c [ 43.924694] R10: 00007ffcd3e50340 R11: 0000000000003246 R12: 0000000000000018 [ 43.925128] R13: 00000000022bc390 R14: 0000000000000040 R15: 00007ffcd3e5062c Reported-by: Mike Galbraith Signed-off-by: Gabriel Krisman Bertazi Link: http://patchwork.freedesktop.org/patch/msgid/20170519175819.15682-1-krisman@collabora.co.uk Signed-off-by: Gerd Hoffmann --- drivers/gpu/drm/qxl/qxl_display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index 058340a002c2..4a340efd8ba6 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -575,8 +575,6 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, if (ret) return; - cmd = (struct qxl_cursor_cmd *) qxl_release_map(qdev, release); - if (fb != old_state->fb) { obj = to_qxl_framebuffer(fb)->obj; user_bo = gem_to_qxl_bo(obj); @@ -614,6 +612,7 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, qxl_bo_kunmap(cursor_bo); qxl_bo_kunmap(user_bo); + cmd = (struct qxl_cursor_cmd *) qxl_release_map(qdev, release); cmd->u.set.visible = 1; cmd->u.set.shape = qxl_bo_physical_address(qdev, cursor_bo, 0); @@ -624,6 +623,7 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, if (ret) goto out_free_release; + cmd = (struct qxl_cursor_cmd *) qxl_release_map(qdev, release); cmd->type = QXL_CURSOR_MOVE; } From f928543404bdf6bb4e8d6a6c3ced5edebd0d6f38 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 22 May 2017 15:59:45 +0200 Subject: [PATCH 133/254] drm: Fix deadlock retry loop in page_flip_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I failed to properly onion-wrap the unwind code: We acquire the vblank reference before we start with the wait-wound locking dance, hence we must make sure we retry before we drop the reference. Oops. v2: The vblank_put must be after the frambuffer_put (Michel). I suck at unwrapping code that doesn't use separate labels for each stage, but checks each pointer first ... While re-reading everything I also realized that we must clean up the fb refcounts, and specifically plane->old_fb before we drop the locks, either in the final unlocking, or in the w/w retry path. Hence the correct fix is to drop the vblank_put to the very bottom. Fixes: 29dc0d1de182 ("drm: Roll out acquire context for the page_flip ioctl") Cc: Harry Wentland Cc: Daniel Vetter Cc: Jani Nikula Cc: Sean Paul Cc: David Airlie Cc: dri-devel@lists.freedesktop.org Reported-by: Tommi Rantala Cc: Tommi Rantala Cc: Michel Dänzer Tested-by: Tommi Rantala Reviewed-by: Michel Dänzer Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20170522135945.28831-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_plane.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c index fedd4d60d9cd..5dc8c4350602 100644 --- a/drivers/gpu/drm/drm_plane.c +++ b/drivers/gpu/drm/drm_plane.c @@ -948,8 +948,6 @@ retry: } out: - if (ret && crtc->funcs->page_flip_target) - drm_crtc_vblank_put(crtc); if (fb) drm_framebuffer_put(fb); if (crtc->primary->old_fb) @@ -964,5 +962,8 @@ out: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); + if (ret && crtc->funcs->page_flip_target) + drm_crtc_vblank_put(crtc); + return ret; } From c477ebe21fabe0010a2ed324ce3a1762c757d867 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Sat, 6 May 2017 11:41:30 +0200 Subject: [PATCH 134/254] mmc: dt: pwrseq-simple: Invent power-off-delay-us During power off, after the GPIO pin has been asserted, some devices like the Wifi chip from TI, Wl18xx, needs a delay before the host continues with clock gating and turning off regulators as to follow a graceful shutdown sequence. Therefore invent an optional power-off-delay-us DT binding for mmc-pwrseq-simple, to allow us to support this constraint. Cc: devicetree@vger.kernel.org Cc: Rob Herring Cc: linux-mmc@vger.kernel.org Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt index e25436861867..9029b45b8a22 100644 --- a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt +++ b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt @@ -18,6 +18,8 @@ Optional properties: "ext_clock" (External clock provided to the card). - post-power-on-delay-ms : Delay in ms after powering the card and de-asserting the reset-gpios (if any) +- power-off-delay-us : Delay in us after asserting the reset-gpios (if any) + during power off of the card. Example: From e9256e142f597edf90c68cec22db4c4aebaa27de Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Sat, 6 May 2017 11:43:05 +0200 Subject: [PATCH 135/254] mmc: pwrseq_simple: Parse DTS for the power-off-delay-us property If the optional power-off-delay-us property is found, insert the corresponding delay after asserting the GPIO during power off. This enables a graceful shutdown sequence for some devices. Cc: linux-mmc@vger.kernel.org Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- drivers/mmc/core/pwrseq_simple.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mmc/core/pwrseq_simple.c b/drivers/mmc/core/pwrseq_simple.c index 1304160de168..13ef162cf066 100644 --- a/drivers/mmc/core/pwrseq_simple.c +++ b/drivers/mmc/core/pwrseq_simple.c @@ -27,6 +27,7 @@ struct mmc_pwrseq_simple { struct mmc_pwrseq pwrseq; bool clk_enabled; u32 post_power_on_delay_ms; + u32 power_off_delay_us; struct clk *ext_clk; struct gpio_descs *reset_gpios; }; @@ -78,6 +79,10 @@ static void mmc_pwrseq_simple_power_off(struct mmc_host *host) mmc_pwrseq_simple_set_gpios_value(pwrseq, 1); + if (pwrseq->power_off_delay_us) + usleep_range(pwrseq->power_off_delay_us, + 2 * pwrseq->power_off_delay_us); + if (!IS_ERR(pwrseq->ext_clk) && pwrseq->clk_enabled) { clk_disable_unprepare(pwrseq->ext_clk); pwrseq->clk_enabled = false; @@ -119,6 +124,8 @@ static int mmc_pwrseq_simple_probe(struct platform_device *pdev) device_property_read_u32(dev, "post-power-on-delay-ms", &pwrseq->post_power_on_delay_ms); + device_property_read_u32(dev, "power-off-delay-us", + &pwrseq->power_off_delay_us); pwrseq->pwrseq.dev = dev; pwrseq->pwrseq.ops = &mmc_pwrseq_simple_ops; From f74ac688c981138c914f9afba50b646146e35585 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 24 Apr 2017 22:40:22 +0200 Subject: [PATCH 136/254] mfd: dts: hi655x: Add clock binding for the pmic The hi655x PMIC provides the regulators but also a clock. The latter is missing in the definition, so extend the documentation to include this as well. Signed-off-by: Daniel Lezcano Acked-by: Rob Herring Acked-by: Lee Jones [Ulf: Split patch and updated changelog] Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt b/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt index 05485699d70e..9630ac0e4b56 100644 --- a/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt +++ b/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt @@ -16,6 +16,11 @@ Required properties: - reg: Base address of PMIC on Hi6220 SoC. - interrupt-controller: Hi655x has internal IRQs (has own IRQ domain). - pmic-gpios: The GPIO used by PMIC IRQ. +- #clock-cells: From common clock binding; shall be set to 0 + +Optional properties: +- clock-output-names: From common clock binding to override the + default output clock name Example: pmic: pmic@f8000000 { @@ -24,4 +29,5 @@ Example: interrupt-controller; #interrupt-cells = <2>; pmic-gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>; + #clock-cells = <0>; } From 307ded8968868e55343e063fbe96cff1efd77eb6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 24 Apr 2017 22:40:22 +0200 Subject: [PATCH 137/254] arm64: dts: hikey: Add clock for the pmic mfd The hi655x PMIC provides the regulators but also a clock. The latter is missing so let's add it. This clock is used by WiFi/Bluetooth chip, but that connection is done in a separate change on top of this one. Signed-off-by: Daniel Lezcano Acked-by: Rob Herring Acked-by: Lee Jones [Ulf: Split patch and updated changelog] Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index 75bce2d0b1a8..d22eb3a646c4 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -330,6 +330,7 @@ pmic: pmic@f8000000 { compatible = "hisilicon,hi655x-pmic"; reg = <0x0 0xf8000000 0x0 0x1000>; + #clock-cells = <0>; interrupt-controller; #interrupt-cells = <2>; pmic-gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>; From 1b32a5ff98fbb271d2235ddcfe3b58f514f8260a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 May 2017 12:46:55 +0200 Subject: [PATCH 138/254] arm64: dts: hi6220: Move the fixed_5v_hub regulator to the hikey dts The regulator is a part of the hikey board, therefore let's move it from the hi6220 SoC dtsi file into the hikey dts file . Let's also rename the regulator according to the datasheet (5V_HUB) to better reflect the HW. Signed-off-by: Ulf Hansson Acked-by: Daniel Lezcano Acked-by: Arnd Bergmann --- arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts | 10 ++++++++++ arch/arm64/boot/dts/hisilicon/hi6220.dtsi | 12 +----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index d22eb3a646c4..0f6cba77fc76 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -81,6 +81,16 @@ }; }; + reg_5v_hub: regulator@0 { + compatible = "regulator-fixed"; + regulator-name = "5V_HUB"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-boot-on; + gpio = <&gpio0 7 0>; + regulator-always-on; + }; + soc { spi0: spi@f7106000 { status = "ok"; diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi index 1e5129b19280..951152d44c02 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -725,20 +725,10 @@ status = "disabled"; }; - fixed_5v_hub: regulator@0 { - compatible = "regulator-fixed"; - regulator-name = "fixed_5v_hub"; - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; - regulator-boot-on; - gpio = <&gpio0 7 0>; - regulator-always-on; - }; - usb_phy: usbphy { compatible = "hisilicon,hi6220-usb-phy"; #phy-cells = <0>; - phy-supply = <&fixed_5v_hub>; + phy-supply = <®_5v_hub>; hisilicon,peripheral-syscon = <&sys_ctrl>; }; From 84f7c60b31f10e3a438153bc7408ad536f585641 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 May 2017 13:51:27 +0200 Subject: [PATCH 139/254] arm64: dts: hikey: Add the SYS_5V and the VDD_3V3 regulators Add these regulators to better describe the HW, but also because those is needed in following changes. Signed-off-by: Ulf Hansson Acked-by: Daniel Lezcano Acked-by: Arnd Bergmann --- .../arm64/boot/dts/hisilicon/hi6220-hikey.dts | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index 0f6cba77fc76..802f4a4bed30 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -81,7 +81,26 @@ }; }; - reg_5v_hub: regulator@0 { + reg_sys_5v: regulator@0 { + compatible = "regulator-fixed"; + regulator-name = "SYS_5V"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-boot-on; + regulator-always-on; + }; + + reg_vdd_3v3: regulator@1 { + compatible = "regulator-fixed"; + regulator-name = "VDD_3V3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + vin-supply = <®_sys_5v>; + }; + + reg_5v_hub: regulator@2 { compatible = "regulator-fixed"; regulator-name = "5V_HUB"; regulator-min-microvolt = <5000000>; @@ -89,6 +108,7 @@ regulator-boot-on; gpio = <&gpio0 7 0>; regulator-always-on; + vin-supply = <®_sys_5v>; }; soc { From 76f1dfb687150e852aa74573962cfc158a9570cc Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 May 2017 14:18:26 +0200 Subject: [PATCH 140/254] arm64: dts: hi6220: Move board data from the dwmmc nodes to hikey dts Move the board specific descriptions for the dwmmc nodes in the hi6220 SoC dtsi, into the hikey dts as it's there these belongs. While changing this, let's take the opportunity to drop the use of the "ti,non-removable" binding for one of the dwmmc device nodes, as it's not a valid binding and not used. Drop also the unnecessary use of "num-slots = <0x1>" for all of the dwmmc nodes, as there is no need to set this since when default number of slots is one. Signed-off-by: Ulf Hansson Acked-by: Daniel Lezcano Acked-by: Arnd Bergmann --- .../arm64/boot/dts/hisilicon/hi6220-hikey.dts | 23 ++++++++++++++++++- arch/arm64/boot/dts/hisilicon/hi6220.dtsi | 19 --------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index 802f4a4bed30..5132d8ed4664 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -286,8 +286,29 @@ /* GPIO blocks 16 thru 19 do not appear to be routed to pins */ + dwmmc_0: dwmmc0@f723d000 { + cap-mmc-highspeed; + non-removable; + bus-width = <0x8>; + vmmc-supply = <&ldo19>; + }; + + dwmmc_1: dwmmc1@f723e000 { + card-detect-delay = <200>; + cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; + vqmmc-supply = <&ldo7>; + vmmc-supply = <&ldo10>; + bus-width = <0x4>; + disable-wp; + cd-gpios = <&gpio1 0 1>; + }; + dwmmc_2: dwmmc2@f723f000 { - ti,non-removable; + broken-cd; + bus-width = <0x4>; non-removable; /* WL_EN */ vmmc-supply = <&wlan_en_reg>; diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi index 951152d44c02..5013e4b2ea71 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -756,17 +756,12 @@ dwmmc_0: dwmmc0@f723d000 { compatible = "hisilicon,hi6220-dw-mshc"; - num-slots = <0x1>; - cap-mmc-highspeed; - non-removable; reg = <0x0 0xf723d000 0x0 0x1000>; interrupts = <0x0 0x48 0x4>; clocks = <&sys_ctrl 2>, <&sys_ctrl 1>; clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC0>; reset-names = "reset"; - bus-width = <0x8>; - vmmc-supply = <&ldo19>; pinctrl-names = "default"; pinctrl-0 = <&emmc_pmx_func &emmc_clk_cfg_func &emmc_cfg_func &emmc_rst_cfg_func>; @@ -774,13 +769,7 @@ dwmmc_1: dwmmc1@f723e000 { compatible = "hisilicon,hi6220-dw-mshc"; - num-slots = <0x1>; - card-detect-delay = <200>; hisilicon,peripheral-syscon = <&ao_ctrl>; - cap-sd-highspeed; - sd-uhs-sdr12; - sd-uhs-sdr25; - sd-uhs-sdr50; reg = <0x0 0xf723e000 0x0 0x1000>; interrupts = <0x0 0x49 0x4>; #address-cells = <0x1>; @@ -789,11 +778,6 @@ clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC1>; reset-names = "reset"; - vqmmc-supply = <&ldo7>; - vmmc-supply = <&ldo10>; - bus-width = <0x4>; - disable-wp; - cd-gpios = <&gpio1 0 1>; pinctrl-names = "default", "idle"; pinctrl-0 = <&sd_pmx_func &sd_clk_cfg_func &sd_cfg_func>; pinctrl-1 = <&sd_pmx_idle &sd_clk_cfg_idle &sd_cfg_idle>; @@ -801,15 +785,12 @@ dwmmc_2: dwmmc2@f723f000 { compatible = "hisilicon,hi6220-dw-mshc"; - num-slots = <0x1>; reg = <0x0 0xf723f000 0x0 0x1000>; interrupts = <0x0 0x4a 0x4>; clocks = <&sys_ctrl HI6220_MMC2_CIUCLK>, <&sys_ctrl HI6220_MMC2_CLK>; clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC2>; reset-names = "reset"; - bus-width = <0x4>; - broken-cd; pinctrl-names = "default", "idle"; pinctrl-0 = <&sdio_pmx_func &sdio_clk_cfg_func &sdio_cfg_func>; pinctrl-1 = <&sdio_pmx_idle &sdio_clk_cfg_idle &sdio_cfg_idle>; From ea452678734eb782126f999bf5c4fb3e71d3b196 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 May 2017 16:11:33 +0200 Subject: [PATCH 141/254] arm64: dts: hikey: Fix WiFi support The description of the connection between the dwmmc (SDIO) controller and the Wifi chip, which is attached to the SDIO bus is wrong. Currently the SDIO card can't be detected and thus the Wifi doesn't work. Let's fix this by assigning the correct vmmc supply, which is the always on regulator VDD_3V3 and remove the WLAN enable regulator altogether. Then to properly deal with the power on/off sequence, add a mmc-pwrseq node to describe the resources needed to detect the SDIO card. Except for the WLAN enable GPIO and its corresponding assert/de-assert delays, the mmc-pwrseq node also contains a handle to a clock provided by the hi655x pmic. This clock is also needed to be able to turn on the WiFi chip. Signed-off-by: Ulf Hansson Acked-by: Arnd Bergmann --- .../arm64/boot/dts/hisilicon/hi6220-hikey.dts | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index 5132d8ed4664..49f6a6242cf9 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -111,6 +111,15 @@ vin-supply = <®_sys_5v>; }; + wl1835_pwrseq: wl1835-pwrseq { + compatible = "mmc-pwrseq-simple"; + /* WLAN_EN GPIO */ + reset-gpios = <&gpio0 5 GPIO_ACTIVE_LOW>; + clocks = <&pmic>; + clock-names = "ext_clock"; + power-off-delay-us = <10>; + }; + soc { spi0: spi@f7106000 { status = "ok"; @@ -307,11 +316,10 @@ }; dwmmc_2: dwmmc2@f723f000 { - broken-cd; bus-width = <0x4>; non-removable; - /* WL_EN */ - vmmc-supply = <&wlan_en_reg>; + vmmc-supply = <®_vdd_3v3>; + mmc-pwrseq = <&wl1835_pwrseq>; #address-cells = <0x1>; #size-cells = <0x0>; @@ -323,18 +331,6 @@ interrupts = <3 IRQ_TYPE_EDGE_RISING>; }; }; - - wlan_en_reg: regulator@1 { - compatible = "regulator-fixed"; - regulator-name = "wlan-en-regulator"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; - /* WLAN_EN GPIO */ - gpio = <&gpio0 5 0>; - /* WLAN card specific delay */ - startup-delay-us = <70000>; - enable-active-high; - }; }; leds { From 1b57b6210f4e52904393be97c62122aae69bc8aa Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 23 May 2017 09:58:07 +0100 Subject: [PATCH 142/254] cfg80211: make cfg80211_sched_scan_results() work from atomic context Drivers should be able to call cfg80211_sched_scan_results() from atomic context. However, with the introduction of multiple scheduled scan feature this requirement was not taken into account resulting in regression shown below. [ 119.021594] BUG: scheduling while atomic: irq/47-iwlwifi/517/0x00000200 [ 119.021604] Modules linked in: [...] [ 119.021759] CPU: 1 PID: 517 Comm: irq/47-iwlwifi Not tainted 4.12.0-rc2-t440s-20170522+ #1 [ 119.021763] Hardware name: LENOVO 20AQS03H00/20AQS03H00, BIOS GJET91WW (2.41 ) 09/21/2016 [ 119.021766] Call Trace: [ 119.021778] ? dump_stack+0x5c/0x84 [ 119.021784] ? __schedule_bug+0x4c/0x70 [ 119.021792] ? __schedule+0x496/0x5c0 [ 119.021798] ? schedule+0x2d/0x80 [ 119.021804] ? schedule_preempt_disabled+0x5/0x10 [ 119.021810] ? __mutex_lock.isra.0+0x18e/0x4c0 [ 119.021817] ? __wake_up+0x2f/0x50 [ 119.021833] ? cfg80211_sched_scan_results+0x19/0x60 [cfg80211] [ 119.021844] ? cfg80211_sched_scan_results+0x19/0x60 [cfg80211] [ 119.021859] ? iwl_mvm_rx_lmac_scan_iter_complete_notif+0x17/0x30 [iwlmvm] [ 119.021869] ? iwl_pcie_rx_handle+0x2a9/0x7e0 [iwlwifi] [ 119.021878] ? iwl_pcie_irq_handler+0x17c/0x730 [iwlwifi] [ 119.021884] ? irq_forced_thread_fn+0x60/0x60 [ 119.021887] ? irq_thread_fn+0x16/0x40 [ 119.021892] ? irq_thread+0x109/0x180 [ 119.021896] ? wake_threads_waitq+0x30/0x30 [ 119.021901] ? kthread+0xf2/0x130 [ 119.021905] ? irq_thread_dtor+0x90/0x90 [ 119.021910] ? kthread_create_on_node+0x40/0x40 [ 119.021915] ? ret_from_fork+0x26/0x40 Fixes: b34939b98369 ("cfg80211: add request id to cfg80211_sched_scan_*() api") Reported-by: Sander Eikelenboom Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- net/wireless/scan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 14d5f0c8c45f..9f0901f3e42b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -322,9 +322,9 @@ cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) { struct cfg80211_sched_scan_request *pos; - ASSERT_RTNL(); + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); - list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { + list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list) { if (pos->reqid == reqid) return pos; } @@ -398,13 +398,13 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) trace_cfg80211_sched_scan_results(wiphy, reqid); /* ignore if we're not scanning */ - rtnl_lock(); + rcu_read_lock(); request = cfg80211_find_sched_scan_req(rdev, reqid); if (request) { request->report_results = true; queue_work(cfg80211_wq, &rdev->sched_scan_res_wk); } - rtnl_unlock(); + rcu_read_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_results); From c70d9d809fdeecedb96972457ee45c49a232d97f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 May 2017 15:40:12 -0500 Subject: [PATCH 143/254] ptrace: Properly initialize ptracer_cred on fork When I introduced ptracer_cred I failed to consider the weirdness of fork where the task_struct copies the old value by default. This winds up leaving ptracer_cred set even when a process forks and the child process does not wind up being ptraced. Because ptracer_cred is not set on non-ptraced processes whose parents were ptraced this has broken the ability of the enlightenment window manager to start setuid children. Fix this by properly initializing ptracer_cred in ptrace_init_task This must be done with a little bit of care to preserve the current value of ptracer_cred when ptrace carries through fork. Re-reading the ptracer_cred from the ptracing process at this point is inconsistent with how PT_PTRACE_CAP has been maintained all of these years. Tested-by: Takashi Iwai Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: "Eric W. Biederman" --- include/linux/ptrace.h | 7 +++++-- kernel/ptrace.c | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 422bc2e4cb6a..ef3eb8bbfee4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -54,7 +54,8 @@ extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, - struct task_struct *new_parent); + struct task_struct *new_parent, + const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 @@ -206,7 +207,7 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; - __ptrace_link(child, current->parent); + __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); @@ -215,6 +216,8 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) set_tsk_thread_flag(child, TIF_SIGPENDING); } + else + child->ptracer_cred = NULL; } /** diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -459,7 +465,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); From cdc5a7f363be34287ac6c2345e5d1d3b37cf4a23 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 9 May 2017 20:45:06 +0300 Subject: [PATCH 144/254] net/mlx5e: Use the correct delete call on offloaded TC encap entry detach We wrongly direcly invoke hlist_del_rcu() and not hash_del_rcu() which does a slightly different call now and may change later, fix that. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Signed-off-by: Or Gerlitz Reported-by: Paul Blakey Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 11c27e4fadf6..a90dd26ea51c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -384,7 +384,7 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv, if (e->flags & MLX5_ENCAP_ENTRY_VALID) mlx5_encap_dealloc(priv->mdev, e->encap_id); - hlist_del_rcu(&e->encap_hlist); + hash_del_rcu(&e->encap_hlist); kfree(e->encap_header); kfree(e); } From 3aa4266405a6c2e03eb0ff12d7c573d3d903da4c Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 10 May 2017 13:48:41 +0300 Subject: [PATCH 145/254] net/sched: act_csum: Add accessors for offloading drivers Add the accessors for realizing if this is a csum action, and for which fields checksum is needed. Signed-off-by: Or Gerlitz Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- include/net/tc_act/tc_csum.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index f31fb6331a53..3248beaf16b0 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -3,6 +3,7 @@ #include #include +#include struct tcf_csum { struct tc_action common; @@ -11,4 +12,18 @@ struct tcf_csum { }; #define to_tcf_csum(a) ((struct tcf_csum *)a) +static inline bool is_tcf_csum(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->type == TCA_ACT_CSUM) + return true; +#endif + return false; +} + +static inline u32 tcf_csum_update_flags(const struct tc_action *a) +{ + return to_tcf_csum(a)->update_flags; +} + #endif /* __NET_TC_CSUM_H */ From 26c02749936f064abf771a0f5f49b280fcfd8b66 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 10 May 2017 13:59:54 +0300 Subject: [PATCH 146/254] net/mlx5e: Allow TC csum offload if applied together with pedit action When offloading header re-writes, the HW re-calculates the relevant L3/L4 checksums. Hence, when upper layers (as done by OVS) ask for TC checksum action offload together with pedit offload, don't err. This command now works: tc filter add dev ens1f0 protocol ip parent ffff: prio 20 flower skip_sw ip_proto tcp dst_port 9001 action pedit ex munge tcp dport set 0x1234 pipe action csum tcp Signed-off-by: Or Gerlitz Reported-by: Paul Blakey Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a90dd26ea51c..9dd83c7e4c51 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include "en.h" @@ -1109,6 +1110,28 @@ out_err: return err; } +static bool csum_offload_supported(struct mlx5e_priv *priv, u32 action, u32 update_flags) +{ + u32 prot_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR | TCA_CSUM_UPDATE_FLAG_TCP | + TCA_CSUM_UPDATE_FLAG_UDP; + + /* The HW recalcs checksums only if re-writing headers */ + if (!(action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)) { + netdev_warn(priv->netdev, + "TC csum action is only offloaded with pedit\n"); + return false; + } + + if (update_flags & ~prot_flags) { + netdev_warn(priv->netdev, + "can't offload TC csum action for some header/s - flags %#x\n", + update_flags); + return false; + } + + return true; +} + static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, struct mlx5e_tc_flow_parse_attr *parse_attr, struct mlx5e_tc_flow *flow) @@ -1149,6 +1172,14 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, continue; } + if (is_tcf_csum(a)) { + if (csum_offload_supported(priv, attr->action, + tcf_csum_update_flags(a))) + continue; + + return -EOPNOTSUPP; + } + if (is_tcf_skbedit_mark(a)) { u32 mark = tcf_skbedit_mark(a); @@ -1651,6 +1682,14 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, continue; } + if (is_tcf_csum(a)) { + if (csum_offload_supported(priv, attr->action, + tcf_csum_update_flags(a))) + continue; + + return -EOPNOTSUPP; + } + if (is_tcf_mirred_egress_redirect(a)) { int ifindex = tcf_mirred_ifindex(a); struct net_device *out_dev, *encap_dev = NULL; From d824bf3fe2d352fc2c52b7ede05b1a0e95d946be Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 9 May 2017 19:02:42 +0300 Subject: [PATCH 147/254] net/mlx5e: Properly enforce disallowing of partial field re-write offload Currently we don't support partial header re-writes through TC pedit action offloading. However, the code that enforces that wasn't err-ing on cases where the first and last bits of the mask are set but there is some zero bit between them, such as in the below example, fix that! tc filter add dev enp1s0 protocol ip parent ffff: prio 10 flower ip_proto udp dst_port 2001 skip_sw action pedit munge ip src set 1.0.0.1 retain 0xff0000ff Fixes: d79b6df6b10a ('net/mlx5e: Add parsing of TC pedit actions to HW format') Signed-off-by: Or Gerlitz Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9dd83c7e4c51..0387c321f0a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -926,7 +926,7 @@ static int offload_pedit_fields(struct pedit_headers *masks, struct mlx5e_tc_flow_parse_attr *parse_attr) { struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; - int i, action_size, nactions, max_actions, first, last; + int i, action_size, nactions, max_actions, first, last, first_z; void *s_masks_p, *a_masks_p, *vals_p; u32 s_mask, a_mask, val; struct mlx5_fields *f; @@ -985,9 +985,10 @@ static int offload_pedit_fields(struct pedit_headers *masks, memcpy(&val, vals_p, f->size); field_bsize = f->size * BITS_PER_BYTE; + first_z = find_first_zero_bit(&mask, field_bsize); first = find_first_bit(&mask, field_bsize); last = find_last_bit(&mask, field_bsize); - if (first > 0 || last != (field_bsize - 1)) { + if (first > 0 || last != (field_bsize - 1) || first_z < last) { printk(KERN_WARNING "mlx5: partial rewrite (mask %lx) is currently not offloaded\n", mask); return -EOPNOTSUPP; From e3ca4e0583a02a04503d9c827fb5c5d50abc4ff5 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 9 May 2017 13:37:26 +0300 Subject: [PATCH 148/254] net/mlx5e: Fix warnings around parsing of TC pedit actions The sparse tool emits these correct complaints: drivers/net/ethernet/mellanox/mlx5/core//en_tc.c:1005:25: warning: cast to restricted __be32 drivers/net/ethernet/mellanox/mlx5/core//en_tc.c:1007:25: warning: cast to restricted __be16 The value is provided from user-space in network order, but there's no way for them to realize that, avoid the warnings by casting to the appropriate type. Fixes: d79b6df6b10a ('net/mlx5e: Add parsing of TC pedit actions to HW format') Signed-off-by: Or Gerlitz Reported-by: Leon Romanovsky Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 0387c321f0a2..ec63158ab643 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -928,9 +928,9 @@ static int offload_pedit_fields(struct pedit_headers *masks, struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; int i, action_size, nactions, max_actions, first, last, first_z; void *s_masks_p, *a_masks_p, *vals_p; - u32 s_mask, a_mask, val; struct mlx5_fields *f; u8 cmd, field_bsize; + u32 s_mask, a_mask; unsigned long mask; void *action; @@ -947,7 +947,8 @@ static int offload_pedit_fields(struct pedit_headers *masks, for (i = 0; i < ARRAY_SIZE(fields); i++) { f = &fields[i]; /* avoid seeing bits set from previous iterations */ - s_mask = a_mask = mask = val = 0; + s_mask = 0; + a_mask = 0; s_masks_p = (void *)set_masks + f->offset; a_masks_p = (void *)add_masks + f->offset; @@ -982,9 +983,8 @@ static int offload_pedit_fields(struct pedit_headers *masks, memset(a_masks_p, 0, f->size); } - memcpy(&val, vals_p, f->size); - field_bsize = f->size * BITS_PER_BYTE; + first_z = find_first_zero_bit(&mask, field_bsize); first = find_first_bit(&mask, field_bsize); last = find_last_bit(&mask, field_bsize); @@ -1004,11 +1004,11 @@ static int offload_pedit_fields(struct pedit_headers *masks, } if (field_bsize == 32) - MLX5_SET(set_action_in, action, data, ntohl(val)); + MLX5_SET(set_action_in, action, data, ntohl(*(__be32 *)vals_p)); else if (field_bsize == 16) - MLX5_SET(set_action_in, action, data, ntohs(val)); + MLX5_SET(set_action_in, action, data, ntohs(*(__be16 *)vals_p)); else if (field_bsize == 8) - MLX5_SET(set_action_in, action, data, val); + MLX5_SET(set_action_in, action, data, *(u8 *)vals_p); action += action_size; nactions++; From b57fe691961cc8f00541f9a435c70df45d41e514 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 27 Apr 2017 17:59:00 +0300 Subject: [PATCH 149/254] net/mlx5e: IPoIB, handle RX packet correctly IPoIB packet contains the pseudo header area, we need to pull it prior to reset_mac_header in order to let the GRO work well. In more details: GRO checks the mac address of the new coming packet, it does that by comparing the hard_header_len size of the current packet to the previous one in that session, the comparison is over hard_header_len size. Now, the driver prepares that area in the skb by allocating area from the reserved part and resetting the correct mac header to it. Fixes: 9d6bd752c63c ("net/mlx5e: IPoIB, RX handler") Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 7b1566f0ae58..66b5fec15313 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1041,6 +1041,8 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) #define MLX5_IB_GRH_BYTES 40 #define MLX5_IPOIB_ENCAP_LEN 4 #define MLX5_GID_SIZE 16 +#define MLX5_IPOIB_PSEUDO_LEN 20 +#define MLX5_IPOIB_HARD_LEN (MLX5_IPOIB_PSEUDO_LEN + MLX5_IPOIB_ENCAP_LEN) static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, @@ -1048,6 +1050,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, struct sk_buff *skb) { struct net_device *netdev = rq->netdev; + char *pseudo_header; u8 *dgid; u8 g; @@ -1076,8 +1079,11 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, if (likely(netdev->features & NETIF_F_RXHASH)) mlx5e_skb_set_hash(cqe, skb); + /* 20 bytes of ipoib header and 4 for encap existing */ + pseudo_header = skb_push(skb, MLX5_IPOIB_PSEUDO_LEN); + memset(pseudo_header, 0, MLX5_IPOIB_PSEUDO_LEN); skb_reset_mac_header(skb); - skb_pull(skb, MLX5_IPOIB_ENCAP_LEN); + skb_pull(skb, MLX5_IPOIB_HARD_LEN); skb->dev = netdev; From 73dd3a4839c1d27c36d4dcc92e1ff44225ecbeb7 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Thu, 23 Feb 2017 11:19:36 +0200 Subject: [PATCH 150/254] net/mlx5: Avoid using pending command interface slots Currently when firmware command gets stuck or it takes long time to complete, the driver command will get timeout and the command slot is freed and can be used for new commands, and if the firmware receive new command on the old busy slot its behavior is unexpected and this could be harmful. To fix this when the driver command gets timeout we return failure, but we don't free the command slot and we wait for the firmware to explicitly respond to that command. Once all the entries are busy we will stop processing new firmware commands. Fixes: 9cba4ebcf374 ('net/mlx5: Fix potential deadlock in command mode change') Signed-off-by: Mohamad Haj Yahia Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 41 ++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- .../net/ethernet/mellanox/mlx5/core/health.c | 2 +- include/linux/mlx5/driver.h | 7 +++- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 5bdaf3d545b2..10d282841f5b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -774,7 +774,7 @@ static void cb_timeout_handler(struct work_struct *work) mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } static void cmd_work_handler(struct work_struct *work) @@ -804,6 +804,7 @@ static void cmd_work_handler(struct work_struct *work) } cmd->ent_arr[ent->idx] = ent; + set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); lay = get_inst(cmd, ent->idx); ent->lay = lay; memset(lay, 0, sizeof(*lay)); @@ -825,6 +826,20 @@ static void cmd_work_handler(struct work_struct *work) if (ent->callback) schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); + /* Skip sending command to fw if internal error */ + if (pci_channel_offline(dev->pdev) || + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + u8 status = 0; + u32 drv_synd; + + ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status); + MLX5_SET(mbox_out, ent->out, status, status); + MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); + + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + return; + } + /* ring doorbell after the descriptor is valid */ mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); wmb(); @@ -835,7 +850,7 @@ static void cmd_work_handler(struct work_struct *work) poll_timeout(ent); /* make sure we read the descriptor after ownership is SW */ rmb(); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT)); } } @@ -879,7 +894,7 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) wait_for_completion(&ent->done); } else if (!wait_for_completion_timeout(&ent->done, timeout)) { ent->ret = -ETIMEDOUT; - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } err = ent->ret; @@ -1375,7 +1390,7 @@ static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) } } -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced) { struct mlx5_cmd *cmd = &dev->cmd; struct mlx5_cmd_work_ent *ent; @@ -1395,6 +1410,19 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) struct semaphore *sem; ent = cmd->ent_arr[i]; + + /* if we already completed the command, ignore it */ + if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, + &ent->state)) { + /* only real completion can free the cmd slot */ + if (!forced) { + mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", + ent->idx); + free_ent(cmd, ent->idx); + } + continue; + } + if (ent->callback) cancel_delayed_work(&ent->cb_timeout_work); if (ent->page_queue) @@ -1417,7 +1445,10 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n", ent->ret, deliv_status_to_str(ent->status), ent->status); } - free_ent(cmd, ent->idx); + + /* only real completion will free the entry slot */ + if (!forced) + free_ent(cmd, ent->idx); if (ent->callback) { ds = ent->ts2 - ent->ts1; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ea5d8d37a75c..33eae5ad2fb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -422,7 +422,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) break; case MLX5_EVENT_TYPE_CMD: - mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector)); + mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false); break; case MLX5_EVENT_TYPE_PORT_CHANGE: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index d0515391d33b..44f59b1d6f0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -90,7 +90,7 @@ static void trigger_cmd_completions(struct mlx5_core_dev *dev) spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); mlx5_core_dbg(dev, "vector 0x%llx\n", vector); - mlx5_cmd_comp_handler(dev, vector); + mlx5_cmd_comp_handler(dev, vector, true); return; no_trig: diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index bcdf739ee41a..93273d9ea4d1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -787,7 +787,12 @@ enum { typedef void (*mlx5_cmd_cbk_t)(int status, void *context); +enum { + MLX5_CMD_ENT_STATE_PENDING_COMP, +}; + struct mlx5_cmd_work_ent { + unsigned long state; struct mlx5_cmd_msg *in; struct mlx5_cmd_msg *out; void *uout; @@ -976,7 +981,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec); +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type); int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, int nent, u64 mask, const char *name, From b665d98edc9ab295169be2fc5bb4e89a46de0a1a Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 18 May 2017 13:34:43 +0300 Subject: [PATCH 151/254] net/mlx5: Tolerate irq_set_affinity_hint() failures Add tolerance to failures of irq_set_affinity_hint(). Its role is to give hints that optimizes performance, and should not block the driver load. In non-SMP systems, functionality is not available as there is a single core, and all these calls definitely fail. Hence, do not call the function and avoid the warning prints. Fixes: db058a186f98 ("net/mlx5_core: Set irq affinity hints") Signed-off-by: Tariq Toukan Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 0c123d571b4c..fe5546bb4153 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -612,7 +612,6 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) struct mlx5_priv *priv = &mdev->priv; struct msix_entry *msix = priv->msix_arr; int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; - int err; if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); @@ -622,18 +621,12 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), priv->irq_info[i].mask); - err = irq_set_affinity_hint(irq, priv->irq_info[i].mask); - if (err) { - mlx5_core_warn(mdev, "irq_set_affinity_hint failed,irq 0x%.4x", - irq); - goto err_clear_mask; - } +#ifdef CONFIG_SMP + if (irq_set_affinity_hint(irq, priv->irq_info[i].mask)) + mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); +#endif return 0; - -err_clear_mask: - free_cpumask_var(priv->irq_info[i].mask); - return err; } static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) From 7bd897cfce1eb373892d35d7f73201b0f9b221c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 May 2017 17:28:36 +0300 Subject: [PATCH 152/254] block: fix an error code in add_partition() We don't set an error code on this path. It means that we return NULL instead of an error pointer and the caller does a NULL dereference. Fixes: 6d1d8050b4bc ("block, partition: add partition_meta_info to hd_struct") Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- block/partition-generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index ff07b9143ca4..c5ec8246e25e 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -320,8 +320,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (info) { struct partition_meta_info *pinfo = alloc_part_info(disk); - if (!pinfo) + if (!pinfo) { + err = -ENOMEM; goto out_free_stats; + } memcpy(pinfo, info, sizeof(*info)); p->info = pinfo; } From 7f65b1f5adc5f8496ca8bec4947de66fefe36220 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 22 May 2017 14:50:30 +0200 Subject: [PATCH 153/254] cdc-ether: divorce initialisation with a filter reset and a generic method Some devices need their multicast filter reset but others are crashed by that. So the methods need to be separated. Signed-off-by: Oliver Neukum Reported-by: "Ridgway, Keith" Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 33 +++++++++++++++++++++++++-------- include/linux/usb/usbnet.h | 1 + 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index f3ae88fdf332..8ab281b478f2 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -310,13 +310,6 @@ skip: return -ENODEV; } - /* Some devices don't initialise properly. In particular - * the packet filter is not reset. There are devices that - * don't do reset all the way. So the packet filter should - * be set to a sane initial value. - */ - usbnet_cdc_update_filter(dev); - return 0; bad_desc: @@ -325,6 +318,30 @@ bad_desc: } EXPORT_SYMBOL_GPL(usbnet_generic_cdc_bind); + +/* like usbnet_generic_cdc_bind() but handles filter initialization + * correctly + */ +int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf) +{ + int rv; + + rv = usbnet_generic_cdc_bind(dev, intf); + if (rv < 0) + goto bail_out; + + /* Some devices don't initialise properly. In particular + * the packet filter is not reset. There are devices that + * don't do reset all the way. So the packet filter should + * be set to a sane initial value. + */ + usbnet_cdc_update_filter(dev); + +bail_out: + return rv; +} +EXPORT_SYMBOL_GPL(usbnet_ether_cdc_bind); + void usbnet_cdc_unbind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_state *info = (void *) &dev->data; @@ -417,7 +434,7 @@ int usbnet_cdc_bind(struct usbnet *dev, struct usb_interface *intf) BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) < sizeof(struct cdc_state))); - status = usbnet_generic_cdc_bind(dev, intf); + status = usbnet_ether_cdc_bind(dev, intf); if (status < 0) return status; diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 7dffa5624ea6..97116379db5f 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -206,6 +206,7 @@ struct cdc_state { }; extern int usbnet_generic_cdc_bind(struct usbnet *, struct usb_interface *); +extern int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf); extern int usbnet_cdc_bind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_unbind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_status(struct usbnet *, struct urb *); From 12e8b570e732eaa5eae3a2895ba3fbcf91bde2b4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 22 May 2017 20:13:07 +0200 Subject: [PATCH 154/254] mlx5: fix bug reading rss_hash_type from CQE Masks for extracting part of the Completion Queue Entry (CQE) field rss_hash_type was swapped, namely CQE_RSS_HTYPE_IP and CQE_RSS_HTYPE_L4. The bug resulted in setting skb->l4_hash, even-though the rss_hash_type indicated that hash was NOT computed over the L4 (UDP or TCP) part of the packet. Added comments from the datasheet, to make it more clear what these masks are selecting. Signed-off-by: Jesper Dangaard Brouer Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index dd9a263ed368..a940ec6a046c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -787,8 +787,14 @@ enum { }; enum { - CQE_RSS_HTYPE_IP = 0x3 << 6, - CQE_RSS_HTYPE_L4 = 0x3 << 2, + CQE_RSS_HTYPE_IP = 0x3 << 2, + /* cqe->rss_hash_type[3:2] - IP destination selected for hash + * (00 = none, 01 = IPv4, 10 = IPv6, 11 = Reserved) + */ + CQE_RSS_HTYPE_L4 = 0x3 << 6, + /* cqe->rss_hash_type[7:6] - L4 destination selected for hash + * (00 = none, 01 = TCP. 10 = UDP, 11 = IPSEC.SPI + */ }; enum { From 223220356d5ebc05ead9a8d697abb0c0a906fc81 Mon Sep 17 00:00:00 2001 From: Richard Date: Sun, 21 May 2017 12:27:00 -0700 Subject: [PATCH 155/254] partitions/msdos: FreeBSD UFS2 file systems are not recognized The code in block/partitions/msdos.c recognizes FreeBSD, OpenBSD and NetBSD partitions and does a reasonable job picking out OpenBSD and NetBSD UFS subpartitions. But for FreeBSD the subpartitions are always "bad". Kernel: Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 93e7c1b32edd..5610cd537da7 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -300,6 +300,8 @@ static void parse_bsd(struct parsed_partitions *state, continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); + if (memcmp(flavour, "bsd\0", 4) == 0) + bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ continue; From 6f4dbd149d2a151b89d1a5bbf7530ee5546c7908 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:33:16 +0200 Subject: [PATCH 156/254] libceph: use kbasename() and kill ceph_file_part() Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/ceph_debug.h | 6 +++--- net/ceph/ceph_common.c | 13 ------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h index aa2e19182d99..51c5bd64bd00 100644 --- a/include/linux/ceph/ceph_debug.h +++ b/include/linux/ceph/ceph_debug.h @@ -3,6 +3,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include + #ifdef CONFIG_CEPH_LIB_PRETTYDEBUG /* @@ -12,12 +14,10 @@ */ # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) -extern const char *ceph_file_part(const char *s, int len); # define dout(fmt, ...) \ pr_debug("%.*s %12.12s:%-4d : " fmt, \ 8 - (int)sizeof(KBUILD_MODNAME), " ", \ - ceph_file_part(__FILE__, sizeof(__FILE__)), \ - __LINE__, ##__VA_ARGS__) + kbasename(__FILE__), __LINE__, ##__VA_ARGS__) # else /* faux printk call just to see any compiler warnings. */ # define dout(fmt, ...) do { \ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4fd02831beed..47e94b560ba0 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -56,19 +56,6 @@ static const struct kernel_param_ops param_ops_supported_features = { module_param_cb(supported_features, ¶m_ops_supported_features, NULL, S_IRUGO); -/* - * find filename portion of a path (/foo/bar/baz -> baz) - */ -const char *ceph_file_part(const char *s, int len) -{ - const char *e = s + len; - - while (e != s && *(e-1) != '/') - e--; - return e; -} -EXPORT_SYMBOL(ceph_file_part); - const char *ceph_msg_type_name(int type) { switch (type) { From 1759f7b0e3fab1d1882d7c680af9d12c5c111b0e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:38:17 +0200 Subject: [PATCH 157/254] libceph: make ceph_msg_data_advance() return void Both callers ignore the returned bool. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/messenger.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5766a6c896c4..d7ab481b2508 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1174,8 +1174,8 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. */ -static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) { bool new_piece; @@ -1207,8 +1207,6 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, new_piece = true; } cursor->need_crc = new_piece; - - return new_piece; } static size_t sizeof_footer(struct ceph_connection *con) @@ -1577,7 +1575,6 @@ static int write_partial_message_data(struct ceph_connection *con) size_t page_offset; size_t length; bool last_piece; - bool need_crc; int ret; page = ceph_msg_data_next(cursor, &page_offset, &length, @@ -1592,7 +1589,7 @@ static int write_partial_message_data(struct ceph_connection *con) } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - need_crc = ceph_msg_data_advance(cursor, (size_t)ret); + ceph_msg_data_advance(cursor, (size_t)ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -2299,7 +2296,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); - (void) ceph_msg_data_advance(cursor, (size_t)ret); + ceph_msg_data_advance(cursor, (size_t)ret); } if (do_datacrc) con->in_data_crc = crc; From f3b4e55ded9b3c52831a7d2ab9e511293c99fc11 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:59:22 +0200 Subject: [PATCH 158/254] libceph: drop version variable from ceph_monmap_decode() It's set but not used: CEPH_FEATURE_MONNAMES feature bit isn't advertised, which guarantees a v1 MonMap. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/mon_client.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 29a0ef351c5e..250f11f78609 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -43,15 +43,13 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) int i, err = -EINVAL; struct ceph_fsid fsid; u32 epoch, num_mon; - u16 version; u32 len; ceph_decode_32_safe(&p, end, len, bad); ceph_decode_need(&p, end, len, bad); dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); - - ceph_decode_16_safe(&p, end, version, bad); + p += sizeof(u16); /* skip version */ ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); From d18a1247c4070390fc0c2d83d89a72afe921882e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 12:21:56 +0200 Subject: [PATCH 159/254] libceph: validate blob_struct_v in process_one_ticket() None of these are validated in userspace, but since we do validate reply_struct_v in ceph_x_proc_ticket_reply(), tkt_struct_v (first) and CephXServiceTicket struct_v (second) in process_one_ticket(), validate CephXTicketBlob struct_v as well. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/auth_x.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 2034fb926670..d0126df33f1f 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -215,6 +215,9 @@ static int process_one_ticket(struct ceph_auth_client *ac, dout(" ticket blob is %d bytes\n", dlen); ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad); blob_struct_v = ceph_decode_8(ptp); + if (blob_struct_v != 1) + goto bad; + new_secret_id = ceph_decode_64(ptp); ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend); if (ret) From b51456a6096ebf9f4ceb2cc7e176b471d4b70af0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 14:24:36 +0200 Subject: [PATCH 160/254] libceph: fix error handling in process_one_ticket() Don't leak key internals after new_session_key is populated. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/auth_x.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index d0126df33f1f..8757fb87dab8 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -151,7 +151,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, struct timespec validity; void *tp, *tpend; void **ptp; - struct ceph_crypto_key new_session_key; + struct ceph_crypto_key new_session_key = { 0 }; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; u64 new_secret_id; @@ -237,13 +237,13 @@ static int process_one_ticket(struct ceph_auth_client *ac, type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); xi->have_keys |= th->service; - -out: - return ret; + return 0; bad: ret = -EINVAL; - goto out; +out: + ceph_crypto_key_destroy(&new_session_key); + return ret; } static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, From 293dffaad8d500e1a5336eeb90d544cf40d4fbd8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 May 2017 17:25:10 +0300 Subject: [PATCH 161/254] libceph: NULL deref on crush_decode() error path If there is not enough space then ceph_decode_32_safe() does a goto bad. We need to return an error code in that situation. The current code returns ERR_PTR(0) which is NULL. The callers are not expecting that and it results in a NULL dereference. Fixes: f24e9980eb86 ("ceph: OSD client") Signed-off-by: Dan Carpenter Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index ffe9e904d4d1..55e3a477f92d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -317,6 +317,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) u32 yes; struct crush_rule *r; + err = -EINVAL; ceph_decode_32_safe(p, end, yes, bad); if (!yes) { dout("crush_decode NO rule %d off %x %p to %p\n", From 4d071c3238987325b9e50e33051a40d1cce311cc Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 23 May 2017 14:18:17 -0500 Subject: [PATCH 162/254] PCI/PM: Add needs_resume flag to avoid suspend complete optimization Some drivers - like i915 - may not support the system suspend direct complete optimization due to differences in their runtime and system suspend sequence. Add a flag that when set resumes the device before calling the driver's system suspend handlers which effectively disables the optimization. Needed by a future patch fixing suspend/resume on i915. Suggested by Rafael. Signed-off-by: Imre Deak Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki Cc: stable@vger.kernel.org --- drivers/pci/pci.c | 3 ++- include/linux/pci.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b01bd5bba8e6..563901cd9c06 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2144,7 +2144,8 @@ bool pci_dev_keep_suspended(struct pci_dev *pci_dev) if (!pm_runtime_suspended(dev) || pci_target_state(pci_dev) != pci_dev->current_state - || platform_pci_need_resume(pci_dev)) + || platform_pci_need_resume(pci_dev) + || (pci_dev->dev_flags & PCI_DEV_FLAGS_NEEDS_RESUME)) return false; /* diff --git a/include/linux/pci.h b/include/linux/pci.h index 33c2b0b77429..df7dd9021646 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -183,6 +183,11 @@ enum pci_dev_flags { PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT = (__force pci_dev_flags_t) (1 << 9), /* Do not use FLR even if device advertises PCI_AF_CAP */ PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), + /* + * Resume before calling the driver's system suspend hooks, disabling + * the direct_complete optimization. + */ + PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11), }; enum pci_irq_reroute_variant { From 82bc9a42cf854fdf63155759c0aa790bd1f361b0 Mon Sep 17 00:00:00 2001 From: Patrik Jakobsson Date: Tue, 18 Apr 2017 13:43:32 +0200 Subject: [PATCH 163/254] drm/gma500/psb: Actually use VBT mode when it is found With LVDS we were incorrectly picking the pre-programmed mode instead of the prefered mode provided by VBT. Make sure we pick the VBT mode if one is provided. It is likely that the mode read-out code is still wrong but this patch fixes the immediate problem on most machines. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78562 Cc: Signed-off-by: Patrik Jakobsson Link: http://patchwork.freedesktop.org/patch/msgid/20170418114332.12183-1-patrik.r.jakobsson@gmail.com --- drivers/gpu/drm/gma500/psb_intel_lvds.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/gma500/psb_intel_lvds.c b/drivers/gpu/drm/gma500/psb_intel_lvds.c index 0066fe7e622e..be3eefec5152 100644 --- a/drivers/gpu/drm/gma500/psb_intel_lvds.c +++ b/drivers/gpu/drm/gma500/psb_intel_lvds.c @@ -759,20 +759,23 @@ void psb_intel_lvds_init(struct drm_device *dev, if (scan->type & DRM_MODE_TYPE_PREFERRED) { mode_dev->panel_fixed_mode = drm_mode_duplicate(dev, scan); + DRM_DEBUG_KMS("Using mode from DDC\n"); goto out; /* FIXME: check for quirks */ } } /* Failed to get EDID, what about VBT? do we need this? */ - if (mode_dev->vbt_mode) + if (dev_priv->lfp_lvds_vbt_mode) { mode_dev->panel_fixed_mode = - drm_mode_duplicate(dev, mode_dev->vbt_mode); + drm_mode_duplicate(dev, dev_priv->lfp_lvds_vbt_mode); - if (!mode_dev->panel_fixed_mode) - if (dev_priv->lfp_lvds_vbt_mode) - mode_dev->panel_fixed_mode = - drm_mode_duplicate(dev, - dev_priv->lfp_lvds_vbt_mode); + if (mode_dev->panel_fixed_mode) { + mode_dev->panel_fixed_mode->type |= + DRM_MODE_TYPE_PREFERRED; + DRM_DEBUG_KMS("Using mode from VBT\n"); + goto out; + } + } /* * If we didn't get EDID, try checking if the panel is already turned @@ -789,6 +792,7 @@ void psb_intel_lvds_init(struct drm_device *dev, if (mode_dev->panel_fixed_mode) { mode_dev->panel_fixed_mode->type |= DRM_MODE_TYPE_PREFERRED; + DRM_DEBUG_KMS("Using pre-programmed mode\n"); goto out; /* FIXME: check for quirks */ } } From 43fe8b8eb81eee713400340716cf945f59d21496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 May 2017 23:27:38 +0200 Subject: [PATCH 164/254] posix-timers: Make signal printks conditional A recent commit added extra printks for CPU/RT limits. This can result in excessive spam in dmesg. Make the printks conditional on print_fatal_signals. Fixes: e7ea7c9806a2 ("rlimits: Print more information when CPU/RT limits are exceeded") Reported-by: Dave Jones Signed-off-by: Thomas Gleixner Cc: Arun Raghavan --- kernel/time/posix-cpu-timers.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1370f067fb51..d2a1e6dd0291 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -825,8 +825,10 @@ static void check_thread_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ - pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -838,8 +840,10 @@ static void check_thread_timers(struct task_struct *tsk, soft += USEC_PER_SEC; sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } - pr_info("RT Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } @@ -936,8 +940,10 @@ static void check_process_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ - pr_info("RT Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -945,8 +951,10 @@ static void check_process_timers(struct task_struct *tsk, /* * At the soft limit, send a SIGXCPU every second. */ - pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { soft++; From 0e774888675d7ec693379a228490ce611135cbc2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 26 Apr 2017 10:50:02 +0900 Subject: [PATCH 165/254] thermal: qoriq: remove useless call for of_thermal_get_trip_points() Building this driver with W=1 reports: warning: variable 'trip' set but not used [-Wunused-but-set-variable] The call for of_thermal_get_trip_points() is useless. Signed-off-by: Masahiro Yamada Signed-off-by: Eduardo Valentin --- drivers/thermal/qoriq_thermal.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c index 644ba526d9ea..4362a69ac88d 100644 --- a/drivers/thermal/qoriq_thermal.c +++ b/drivers/thermal/qoriq_thermal.c @@ -195,7 +195,6 @@ static struct thermal_zone_of_device_ops tmu_tz_ops = { static int qoriq_tmu_probe(struct platform_device *pdev) { int ret; - const struct thermal_trip *trip; struct qoriq_tmu_data *data; struct device_node *np = pdev->dev.of_node; u32 site = 0; @@ -243,8 +242,6 @@ static int qoriq_tmu_probe(struct platform_device *pdev) goto err_tmu; } - trip = of_thermal_get_trip_points(data->tz); - /* Enable monitoring */ site |= 0x1 << (15 - data->sensor_id); tmu_write(data, site | TMR_ME | TMR_ALPF, &data->regs->tmr); From c4b379d0640a35b6c87200c3bdac0df2e6137022 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 8 May 2017 11:36:43 +0100 Subject: [PATCH 166/254] thermal: core: make thermal_emergency_poweroff static Making thermal_emergency_poweroff static fixes sparse warning: drivers/thermal/thermal_core.c:6: warning: symbol 'thermal_emergency_poweroff' was not declared. Should it be static? Fixes: ef1d87e06ab4 ("thermal: core: Add a back up thermal shutdown mechanism") Acked-by: Keerthy Signed-off-by: Colin Ian King Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index b21b9cc2c8d6..5a51c740e372 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -359,7 +359,7 @@ static DECLARE_DELAYED_WORK(thermal_emergency_poweroff_work, * This may be called from any critical situation to trigger a system shutdown * after a known period of time. By default this is not scheduled. */ -void thermal_emergency_poweroff(void) +static void thermal_emergency_poweroff(void) { int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS; /* From 748c23d88610b5df378d86e7192e4bd1f58adb35 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 26 Apr 2017 16:45:25 +0200 Subject: [PATCH 167/254] ti-soc-thermal: Use devm_kcalloc() in ti_bandgap_build() A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus use the corresponding function "devm_kcalloc". This issue was detected by using the Coccinelle software. Acked-by: Keerthy Tested-by: Keerthy Signed-off-by: Markus Elfring Signed-off-by: Eduardo Valentin --- drivers/thermal/ti-soc-thermal/ti-bandgap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.c b/drivers/thermal/ti-soc-thermal/ti-bandgap.c index ba9c302454fb..f19cb7612a65 100644 --- a/drivers/thermal/ti-soc-thermal/ti-bandgap.c +++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.c @@ -1224,8 +1224,8 @@ static struct ti_bandgap *ti_bandgap_build(struct platform_device *pdev) bgp->conf = of_id->data; /* register shadow for context save and restore */ - bgp->regval = devm_kzalloc(&pdev->dev, sizeof(*bgp->regval) * - bgp->conf->sensor_count, GFP_KERNEL); + bgp->regval = devm_kcalloc(&pdev->dev, bgp->conf->sensor_count, + sizeof(*bgp->regval), GFP_KERNEL); if (!bgp->regval) { dev_err(&pdev->dev, "Unable to allocate mem for driver ref\n"); return ERR_PTR(-ENOMEM); From 57e521151b56ee0c5164c442b7928c66711bfbc4 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 26 Apr 2017 17:03:07 +0200 Subject: [PATCH 168/254] ti-soc-thermal: Delete error messages for failed memory allocations in ti_bandgap_build() The script "checkpatch.pl" pointed information out like the following. WARNING: Possible unnecessary 'out of memory' message Thus remove such statements here. Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Acked-by: Keerthy Tested-by: Keerthy Signed-off-by: Markus Elfring Signed-off-by: Eduardo Valentin --- drivers/thermal/ti-soc-thermal/ti-bandgap.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.c b/drivers/thermal/ti-soc-thermal/ti-bandgap.c index f19cb7612a65..109fb0a5f19f 100644 --- a/drivers/thermal/ti-soc-thermal/ti-bandgap.c +++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.c @@ -1214,10 +1214,8 @@ static struct ti_bandgap *ti_bandgap_build(struct platform_device *pdev) } bgp = devm_kzalloc(&pdev->dev, sizeof(*bgp), GFP_KERNEL); - if (!bgp) { - dev_err(&pdev->dev, "Unable to allocate mem for driver ref\n"); + if (!bgp) return ERR_PTR(-ENOMEM); - } of_id = of_match_device(of_ti_bandgap_match, &pdev->dev); if (of_id) @@ -1226,10 +1224,8 @@ static struct ti_bandgap *ti_bandgap_build(struct platform_device *pdev) /* register shadow for context save and restore */ bgp->regval = devm_kcalloc(&pdev->dev, bgp->conf->sensor_count, sizeof(*bgp->regval), GFP_KERNEL); - if (!bgp->regval) { - dev_err(&pdev->dev, "Unable to allocate mem for driver ref\n"); + if (!bgp->regval) return ERR_PTR(-ENOMEM); - } i = 0; do { From 8b8656d64c0cce9007e062273190d8e97096f0ac Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 26 Apr 2017 17:11:28 +0200 Subject: [PATCH 169/254] ti-soc-thermal: Fix a typo in a comment line Add a missing character in this description for a function. Acked-by: Keerthy Tested-by: Keerthy Signed-off-by: Markus Elfring Signed-off-by: Eduardo Valentin --- drivers/thermal/ti-soc-thermal/ti-bandgap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.c b/drivers/thermal/ti-soc-thermal/ti-bandgap.c index 109fb0a5f19f..696ab3046b87 100644 --- a/drivers/thermal/ti-soc-thermal/ti-bandgap.c +++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.c @@ -1010,7 +1010,7 @@ ti_bandgap_force_single_read(struct ti_bandgap *bgp, int id) } /** - * ti_bandgap_set_continous_mode() - One time enabling of continuous mode + * ti_bandgap_set_continuous_mode() - One time enabling of continuous mode * @bgp: pointer to struct ti_bandgap * * Call this function only if HAS(MODE_CONFIG) is set. As this driver may From a54c51863ed1294078a435151e625313b4365ac5 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Fri, 28 Apr 2017 16:11:30 -0400 Subject: [PATCH 170/254] thermal: broadcom: ns-thermal: default on iProc SoCs Tweak the Kconfig description to mention support for NSP and make the default on for iProc based platforms. Signed-off-by: Jon Mason Signed-off-by: Eduardo Valentin --- drivers/thermal/broadcom/Kconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/broadcom/Kconfig b/drivers/thermal/broadcom/Kconfig index ab08af4654ef..42c098e86f84 100644 --- a/drivers/thermal/broadcom/Kconfig +++ b/drivers/thermal/broadcom/Kconfig @@ -9,8 +9,9 @@ config BCM2835_THERMAL config BCM_NS_THERMAL tristate "Northstar thermal driver" depends on ARCH_BCM_IPROC || COMPILE_TEST + default y if ARCH_BCM_IPROC help - Northstar is a family of SoCs that includes e.g. BCM4708, BCM47081, - BCM4709 and BCM47094. It contains DMU (Device Management Unit) block - with a thermal sensor that allows checking CPU temperature. This - driver provides support for it. + Support for the Northstar and Northstar Plus family of SoCs (e.g. + BCM4708, BCM4709, BCM5301x, BCM95852X, etc). It contains DMU (Device + Management Unit) block with a thermal sensor that allows checking CPU + temperature. From 7d4df089d77306914426a604c890175f91a9a459 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Wed, 24 May 2017 15:21:23 +0900 Subject: [PATCH 171/254] perf report: Don't crash on invalid maps in `-g srcline` mode I just hit a segfault when doing `perf report -g srcline`. Valgrind pointed me at this code as the culprit: ==8359== Invalid read of size 8 ==8359== at 0x3096D9: map__rip_2objdump (map.c:430) ==8359== by 0x2FC1A3: match_chain_srcline (callchain.c:645) ==8359== by 0x2FC1A3: match_chain (callchain.c:700) ==8359== by 0x2FC1A3: append_chain (callchain.c:895) ==8359== by 0x2FC1A3: append_chain_children (callchain.c:846) ==8359== by 0x2FF719: callchain_append (callchain.c:944) ==8359== by 0x2FF719: hist_entry__append_callchain (callchain.c:1058) ==8359== by 0x32FA06: iter_add_single_cumulative_entry (hist.c:908) ==8359== by 0x33195C: hist_entry_iter__add (hist.c:1050) ==8359== by 0x258F65: process_sample_event (builtin-report.c:204) ==8359== by 0x30D60C: perf_session__deliver_event (session.c:1310) ==8359== by 0x30D60C: ordered_events__deliver_event (session.c:119) ==8359== by 0x310D12: __ordered_events__flush (ordered-events.c:210) ==8359== by 0x310D12: ordered_events__flush.part.3 (ordered-events.c:277) ==8359== by 0x30DD3C: perf_session__process_user_event (session.c:1349) ==8359== by 0x30DD3C: perf_session__process_event (session.c:1475) ==8359== by 0x30FC3C: __perf_session__process_events (session.c:1867) ==8359== by 0x30FC3C: perf_session__process_events (session.c:1921) ==8359== by 0x25A985: __cmd_report (builtin-report.c:575) ==8359== by 0x25A985: cmd_report (builtin-report.c:1054) ==8359== by 0x2B9A80: run_builtin (perf.c:296) ==8359== Address 0x70 is not stack'd, malloc'd or (recently) free'd This patch fixes the issue. Signed-off-by: Milian Wolff [ Remove dependency from another change ] Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yao Jin Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170524062129.32529-2-namhyung@kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/callchain.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 81fc29ac798f..b4204b43ed58 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -621,15 +621,20 @@ enum match_result { static enum match_result match_chain_srcline(struct callchain_cursor_node *node, struct callchain_list *cnode) { - char *left = get_srcline(cnode->ms.map->dso, - map__rip_2objdump(cnode->ms.map, cnode->ip), - cnode->ms.sym, true, false); - char *right = get_srcline(node->map->dso, - map__rip_2objdump(node->map, node->ip), - node->sym, true, false); + char *left = NULL; + char *right = NULL; enum match_result ret = MATCH_EQ; int cmp; + if (cnode->ms.map) + left = get_srcline(cnode->ms.map->dso, + map__rip_2objdump(cnode->ms.map, cnode->ip), + cnode->ms.sym, true, false); + if (node->map) + right = get_srcline(node->map->dso, + map__rip_2objdump(node->map, node->ip), + node->sym, true, false); + if (left && right) cmp = strcmp(left, right); else if (!left && right) From b21cc97810932a551f7aac46f0b89c469c828b3f Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Wed, 24 May 2017 15:21:24 +0900 Subject: [PATCH 172/254] perf report: Fix memory leak in addr2line when called by addr2inlines When a filename was found in addr2line it was duplicated via strdup() but never freed. Now we pass NULL and handle this gracefully in addr2line. Detected by Valgrind: ==16331== 1,680 bytes in 21 blocks are definitely lost in loss record 148 of 220 ==16331== at 0x4C2AF1F: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) ==16331== by 0x672FA69: strdup (in /usr/lib/libc-2.25.so) ==16331== by 0x52769F: addr2line (srcline.c:256) ==16331== by 0x52769F: addr2inlines (srcline.c:294) ==16331== by 0x52769F: dso__parse_addr_inlines (srcline.c:502) ==16331== by 0x574D7A: inline__fprintf (hist.c:41) ==16331== by 0x574D7A: ipchain__fprintf_graph (hist.c:147) ==16331== by 0x57518A: __callchain__fprintf_graph (hist.c:212) ==16331== by 0x5753CF: callchain__fprintf_graph.constprop.6 (hist.c:337) ==16331== by 0x57738E: hist_entry__fprintf (hist.c:628) ==16331== by 0x57738E: hists__fprintf (hist.c:882) ==16331== by 0x44A20F: perf_evlist__tty_browse_hists (builtin-report.c:399) ==16331== by 0x44A20F: report__browse_hists (builtin-report.c:491) ==16331== by 0x44A20F: __cmd_report (builtin-report.c:624) ==16331== by 0x44A20F: cmd_report (builtin-report.c:1054) ==16331== by 0x4A49CE: run_builtin (perf.c:296) ==16331== by 0x4A4CC0: handle_internal_command (perf.c:348) ==16331== by 0x434371: run_argv (perf.c:392) ==16331== by 0x434371: main (perf.c:530) Signed-off-by: Milian Wolff Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yao Jin Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170524062129.32529-3-namhyung@kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/srcline.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index df051a52393c..5e376d64d59e 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -230,7 +230,10 @@ static int addr2line(const char *dso_name, u64 addr, bfd_map_over_sections(a2l->abfd, find_address_in_section, a2l); - if (a2l->found && unwind_inlines) { + if (!a2l->found) + return 0; + + if (unwind_inlines) { int cnt = 0; while (bfd_find_inliner_info(a2l->abfd, &a2l->filename, @@ -243,6 +246,8 @@ static int addr2line(const char *dso_name, u64 addr, a2l->line, node, dso) != 0) return 0; + // found at least one inline frame + ret = 1; } } @@ -252,14 +257,14 @@ static int addr2line(const char *dso_name, u64 addr, } } - if (a2l->found && a2l->filename) { - *file = strdup(a2l->filename); - *line = a2l->line; - - if (*file) - ret = 1; + if (file) { + *file = a2l->filename ? strdup(a2l->filename) : NULL; + ret = *file ? 1 : 0; } + if (line) + *line = a2l->line; + return ret; } @@ -278,8 +283,6 @@ void dso__free_a2l(struct dso *dso) static struct inline_node *addr2inlines(const char *dso_name, u64 addr, struct dso *dso) { - char *file = NULL; - unsigned int line = 0; struct inline_node *node; node = zalloc(sizeof(*node)); @@ -291,7 +294,7 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, INIT_LIST_HEAD(&node->val); node->addr = addr; - if (!addr2line(dso_name, addr, &file, &line, dso, TRUE, node)) + if (!addr2line(dso_name, addr, NULL, NULL, dso, TRUE, node)) goto out_free_inline_node; if (list_empty(&node->val)) From 1982ad48fc82c284a5cc55697a012d3357e84d01 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Wed, 24 May 2017 15:21:25 +0900 Subject: [PATCH 173/254] perf report: Fix off-by-one for non-activation frames As the documentation for dwfl_frame_pc says, frames that are no activation frames need to have their program counter decremented by one to properly find the function of the caller. This fixes many cases where perf report currently attributes the cost to the next line. I.e. I have code like this: ~~~~~~~~~~~~~~~ #include #include using namespace std; int main() { this_thread::sleep_for(chrono::milliseconds(1000)); this_thread::sleep_for(chrono::milliseconds(100)); this_thread::sleep_for(chrono::milliseconds(10)); return 0; } ~~~~~~~~~~~~~~~ Now compile and record it: ~~~~~~~~~~~~~~~ g++ -std=c++11 -g -O2 test.cpp echo 1 | sudo tee /proc/sys/kernel/sched_schedstats perf record \ --event sched:sched_stat_sleep \ --event sched:sched_process_exit \ --event sched:sched_switch --call-graph=dwarf \ --output perf.data.raw \ ./a.out echo 0 | sudo tee /proc/sys/kernel/sched_schedstats perf inject --sched-stat --input perf.data.raw --output perf.data ~~~~~~~~~~~~~~~ Before this patch, the report clearly shows the off-by-one issue. Most notably, the last sleep invocation is incorrectly attributed to the "return 0;" line: ~~~~~~~~~~~~~~~ Overhead Source:Line ........ ........... 100.00% core.c:0 | ---__schedule core.c:0 schedule do_nanosleep hrtimer.c:0 hrtimer_nanosleep sys_nanosleep entry_SYSCALL_64_fastpath .tmp_entry_64.o:0 __nanosleep_nocancel .:0 std::this_thread::sleep_for > thread:323 | |--90.08%--main test.cpp:9 | __libc_start_main | _start | |--9.01%--main test.cpp:10 | __libc_start_main | _start | --0.91%--main test.cpp:13 __libc_start_main _start ~~~~~~~~~~~~~~~ With this patch here applied, the issue is fixed. The report becomes much more usable: ~~~~~~~~~~~~~~~ Overhead Source:Line ........ ........... 100.00% core.c:0 | ---__schedule core.c:0 schedule do_nanosleep hrtimer.c:0 hrtimer_nanosleep sys_nanosleep entry_SYSCALL_64_fastpath .tmp_entry_64.o:0 __nanosleep_nocancel .:0 std::this_thread::sleep_for > thread:323 | |--90.08%--main test.cpp:8 | __libc_start_main | _start | |--9.01%--main test.cpp:9 | __libc_start_main | _start | --0.91%--main test.cpp:10 __libc_start_main _start ~~~~~~~~~~~~~~~ Similarly it works for signal frames: ~~~~~~~~~~~~~~~ __noinline void bar(void) { volatile long cnt = 0; for (cnt = 0; cnt < 100000000; cnt++); } __noinline void foo(void) { bar(); } void sig_handler(int sig) { foo(); } int main(void) { signal(SIGUSR1, sig_handler); raise(SIGUSR1); foo(); return 0; } ~~~~~~~~~~~~~~~~ Before, the report wrongly points to `signal.c:29` after raise(): ~~~~~~~~~~~~~~~~ $ perf report --stdio --no-children -g srcline -s srcline ... 100.00% signal.c:11 | ---bar signal.c:11 | |--50.49%--main signal.c:29 | __libc_start_main | _start | --49.51%--0x33a8f raise .:0 main signal.c:29 __libc_start_main _start ~~~~~~~~~~~~~~~~ With this patch in, the issue is fixed and we instead get: ~~~~~~~~~~~~~~~~ 100.00% signal signal [.] bar | ---bar signal.c:11 | |--50.49%--main signal.c:29 | __libc_start_main | _start | --49.51%--0x33a8f raise .:0 main signal.c:27 __libc_start_main _start ~~~~~~~~~~~~~~~~ Note how this patch fixes this issue for both unwinding methods, i.e. both dwfl and libunwind. The former case is straight-forward thanks to dwfl_frame_pc(). For libunwind, we replace the functionality via unw_is_signal_frame() for any but the very first frame. Signed-off-by: Milian Wolff Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yao Jin Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170524062129.32529-4-namhyung@kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/unwind-libdw.c | 6 +++++- tools/perf/util/unwind-libunwind-local.c | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index f90e11a555b2..943a06291587 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -168,12 +168,16 @@ frame_callback(Dwfl_Frame *state, void *arg) { struct unwind_info *ui = arg; Dwarf_Addr pc; + bool isactivation; - if (!dwfl_frame_pc(state, &pc, NULL)) { + if (!dwfl_frame_pc(state, &pc, &isactivation)) { pr_err("%s", dwfl_errmsg(-1)); return DWARF_CB_ABORT; } + if (!isactivation) + --pc; + return entry(pc, ui) || !(--ui->max_stack) ? DWARF_CB_ABORT : DWARF_CB_OK; } diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index f8455bed6e65..672c2ada9357 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -692,6 +692,17 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, while (!ret && (unw_step(&c) > 0) && i < max_stack) { unw_get_reg(&c, UNW_REG_IP, &ips[i]); + + /* + * Decrement the IP for any non-activation frames. + * this is required to properly find the srcline + * for caller frames. + * See also the documentation for dwfl_frame_pc(), + * which this code tries to replicate. + */ + if (unw_is_signal_frame(&c) <= 0) + --ips[i]; + ++i; } From 325fbff51f961491adff4037d0e0a94d6132bd9b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 May 2017 15:21:26 +0900 Subject: [PATCH 174/254] perf script: Add --inline option for debugging The --inline option is to show inlined functions in callchains. For example: $ perf script a.out 5644 11611.467597: 309961 cycles:u: 790 main (/home/namhyung/tmp/perf/a.out) 20511 __libc_start_main (/usr/lib/libc-2.25.so) 8ba _start (/home/namhyung/tmp/perf/a.out) ... $ perf script --inline a.out 5644 11611.467597: 309961 cycles:u: 790 main (/home/namhyung/tmp/perf/a.out) std::__detail::_Adaptor, double>::operator() std::uniform_real_distribution::operator() > std::uniform_real_distribution::operator() > main 20511 __libc_start_main (/usr/lib/libc-2.25.so) 8ba _start (/home/namhyung/tmp/perf/a.out) ... Reviewed-and-tested-by: Milian Wolff Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jin Yao Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Milian Wolff Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170524062129.32529-5-namhyung@kernel.org Signed-off-by: Ingo Molnar --- tools/perf/Documentation/perf-script.txt | 4 +++ tools/perf/builtin-script.c | 2 ++ tools/perf/util/evsel_fprintf.c | 33 ++++++++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index cb0eda3925e6..3517e204a2b3 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -311,6 +311,10 @@ include::itrace.txt[] Set the maximum number of program blocks to print with brstackasm for each sample. +--inline:: + If a callgraph address belongs to an inlined function, the inline stack + will be printed. Each entry has function name and file/line. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index d05aec491cff..4761b0d7fcb5 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2494,6 +2494,8 @@ int cmd_script(int argc, const char **argv) "Enable kernel symbol demangling"), OPT_STRING(0, "time", &script.time_str, "str", "Time span of interest (start,stop)"), + OPT_BOOLEAN(0, "inline", &symbol_conf.inline_name, + "Show inline function"), OPT_END() }; const char * const script_subcommands[] = { "record", "report", NULL }; diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index e415aee6a245..583f3a602506 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -7,6 +7,7 @@ #include "map.h" #include "strlist.h" #include "symbol.h" +#include "srcline.h" static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) { @@ -168,6 +169,38 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, if (!print_oneline) printed += fprintf(fp, "\n"); + if (symbol_conf.inline_name && node->map) { + struct inline_node *inode; + + addr = map__rip_2objdump(node->map, node->ip), + inode = dso__parse_addr_inlines(node->map->dso, addr); + + if (inode) { + struct inline_list *ilist; + + list_for_each_entry(ilist, &inode->val, list) { + if (print_arrow) + printed += fprintf(fp, " <-"); + + /* IP is same, just skip it */ + if (print_ip) + printed += fprintf(fp, "%c%16s", + s, ""); + if (print_sym) + printed += fprintf(fp, " %s", + ilist->funcname); + if (print_srcline) + printed += fprintf(fp, "\n %s:%d", + ilist->filename, + ilist->line_nr); + if (!print_oneline) + printed += fprintf(fp, "\n"); + } + + inline_node__delete(inode); + } + } + if (symbol_conf.bt_stop_list && node->sym && strlist__has_entry(symbol_conf.bt_stop_list, From 28071f51839e393f697d0d1df0b223a4bc373606 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Wed, 24 May 2017 15:21:27 +0900 Subject: [PATCH 175/254] perf report: Always honor callchain order for inlined nodes So far, the inlined nodes where only reversed when we built perf against libbfd. If that was not available, the addr2line fallback code path was missing the inline_list__reverse call. Now we always add the nodes in the correct order within inline_list__append. This removes the need to reverse the list and also ensures that all callers construct the list in the right order. Signed-off-by: Milian Wolff Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yao Jin Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170524062129.32529-6-namhyung@kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/srcline.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 5e376d64d59e..6af0364cad06 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -56,7 +56,10 @@ static int inline_list__append(char *filename, char *funcname, int line_nr, } } - list_add_tail(&ilist->list, &node->val); + if (callchain_param.order == ORDER_CALLEE) + list_add_tail(&ilist->list, &node->val); + else + list_add(&ilist->list, &node->val); return 0; } @@ -200,14 +203,6 @@ static void addr2line_cleanup(struct a2l_data *a2l) #define MAX_INLINE_NEST 1024 -static void inline_list__reverse(struct inline_node *node) -{ - struct inline_list *ilist, *n; - - list_for_each_entry_safe_reverse(ilist, n, &node->val, list) - list_move_tail(&ilist->list, &node->val); -} - static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *line, struct dso *dso, bool unwind_inlines, struct inline_node *node) @@ -250,11 +245,6 @@ static int addr2line(const char *dso_name, u64 addr, ret = 1; } } - - if ((node != NULL) && - (callchain_param.order != ORDER_CALLEE)) { - inline_list__reverse(node); - } } if (file) { From 4d53b9d546f9f4505e6e3d58c8eed894d6f684e7 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Wed, 24 May 2017 15:21:28 +0900 Subject: [PATCH 176/254] perf report: Do not drop last inlined frame The very last inlined frame, i.e. the one furthest away from the non-inlined frame, was silently dropped. This is apparent when comparing the output of `perf script` and `addr2line`: ~~~~~~ $ perf script --inline ... a.out 26722 80836.309329: 72425 cycles: 21561 __hypot_finite (/usr/lib/libm-2.25.so) ace3 hypot (/usr/lib/libm-2.25.so) a4a main (a.out) std::abs std::_Norm_helper::_S_do_it std::norm main 20510 __libc_start_main (/usr/lib/libc-2.25.so) bd9 _start (a.out) $ addr2line -a -f -i -e /tmp/a.out a4a | c++filt 0x0000000000000a4a std::__complex_abs(doublecomplex ) /usr/include/c++/6.3.1/complex:589 double std::abs(std::complex const&) /usr/include/c++/6.3.1/complex:597 double std::_Norm_helper::_S_do_it(std::complex const&) /usr/include/c++/6.3.1/complex:654 double std::norm(std::complex const&) /usr/include/c++/6.3.1/complex:664 main /tmp/inlining.cpp:14 ~~~~~ Note how `std::__complex_abs` is missing from the `perf script` output. This is similarly showing up in `perf report`. The patch here fixes this issue, and the output becomes: ~~~~~ a.out 26722 80836.309329: 72425 cycles: 21561 __hypot_finite (/usr/lib/libm-2.25.so) ace3 hypot (/usr/lib/libm-2.25.so) a4a main (a.out) std::__complex_abs std::abs std::_Norm_helper::_S_do_it std::norm main 20510 __libc_start_main (/usr/lib/libc-2.25.so) bd9 _start (a.out) ~~~~~ Signed-off-by: Milian Wolff Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yao Jin Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170524062129.32529-7-namhyung@kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/srcline.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 6af0364cad06..ebc88a74e67b 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -203,6 +203,16 @@ static void addr2line_cleanup(struct a2l_data *a2l) #define MAX_INLINE_NEST 1024 +static int inline_list__append_dso_a2l(struct dso *dso, + struct inline_node *node) +{ + struct a2l_data *a2l = dso->a2l; + char *funcname = a2l->funcname ? strdup(a2l->funcname) : NULL; + char *filename = a2l->filename ? strdup(a2l->filename) : NULL; + + return inline_list__append(filename, funcname, a2l->line, node, dso); +} + static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *line, struct dso *dso, bool unwind_inlines, struct inline_node *node) @@ -231,15 +241,15 @@ static int addr2line(const char *dso_name, u64 addr, if (unwind_inlines) { int cnt = 0; + if (node && inline_list__append_dso_a2l(dso, node)) + return 0; + while (bfd_find_inliner_info(a2l->abfd, &a2l->filename, &a2l->funcname, &a2l->line) && cnt++ < MAX_INLINE_NEST) { if (node != NULL) { - if (inline_list__append(strdup(a2l->filename), - strdup(a2l->funcname), - a2l->line, node, - dso) != 0) + if (inline_list__append_dso_a2l(dso, node)) return 0; // found at least one inline frame ret = 1; From 7111ffff60a68f55d864200cd6c7677319e5c242 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 May 2017 15:21:29 +0900 Subject: [PATCH 177/254] perf tools: Put caller above callee in --children mode The __hpp__sort_acc() sorts entries using callchain depth in order to put callers above in children mode. But it assumed the callchain order was callee-first. Now default (for children) is caller-first so the order of entries is reverted. For example, consider following case: $ perf report --no-children ..l # Overhead Command Shared Object Symbol # ........ ....... ................... .......................... # 99.44% a.out a.out [.] main | ---main __libc_start_main _start Then children mode should show 'start' above '__libc_start_main' since it's the caller (parent) of the __libc_start_main. But it's reversed: # Children Self Command Shared Object Symbol # ........ ........ ....... ............... ..................... # 99.61% 0.00% a.out libc-2.25.so [.] __libc_start_main 99.61% 0.00% a.out a.out [.] _start 99.54% 99.44% a.out a.out [.] main This patch fixes it. # Children Self Command Shared Object Symbol # ........ ........ ....... ............... ..................... # 99.61% 0.00% a.out a.out [.] _start 99.61% 0.00% a.out libc-2.25.so [.] __libc_start_main 99.54% 99.44% a.out a.out [.] main Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Linus Torvalds Cc: Milian Wolff Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yao Jin Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170524062129.32529-8-namhyung@kernel.org Signed-off-by: Ingo Molnar --- tools/perf/ui/hist.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 59addd52d9cd..ddb2c6fbdf91 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -210,6 +210,8 @@ static int __hpp__sort_acc(struct hist_entry *a, struct hist_entry *b, return 0; ret = b->callchain->max_depth - a->callchain->max_depth; + if (callchain_param.order == ORDER_CALLER) + ret = -ret; } return ret; } From 6e30437bd42c4d4e9cfc4c40efda00eb83a11cde Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 May 2017 08:57:21 +0200 Subject: [PATCH 178/254] tools/include: Sync kernel ABI headers with tooling headers Sync (copy) the following v4.12 kernel headers to the tooling headers: arch/x86/include/asm/disabled-features.h: arch/x86/include/uapi/asm/kvm.h: arch/powerpc/include/uapi/asm/kvm.h: arch/s390/include/uapi/asm/kvm.h: arch/arm/include/uapi/asm/kvm.h: arch/arm64/include/uapi/asm/kvm.h: - 'struct kvm_sync_regs' got changed in an ABI-incompatible way, fortunately none of the (in-kernel) tooling relied on it - new KVM_DEV calls added arch/x86/include/asm/required-features.h: - 5-level paging hardware ABI detail added arch/x86/include/asm/cpufeatures.h: - new CPU feature added arch/x86/include/uapi/asm/vmx.h: - new VMX exit conditions None of the changes requires fixes in the tooling source code. This addresses the following warnings: Warning: include/uapi/linux/stat.h differs from kernel Warning: arch/x86/include/asm/disabled-features.h differs from kernel Warning: arch/x86/include/asm/required-features.h differs from kernel Warning: arch/x86/include/asm/cpufeatures.h differs from kernel Warning: arch/x86/include/uapi/asm/kvm.h differs from kernel Warning: arch/x86/include/uapi/asm/vmx.h differs from kernel Warning: arch/powerpc/include/uapi/asm/kvm.h differs from kernel Warning: arch/s390/include/uapi/asm/kvm.h differs from kernel Warning: arch/arm/include/uapi/asm/kvm.h differs from kernel Warning: arch/arm64/include/uapi/asm/kvm.h differs from kernel Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yao Jin Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170524065721.j2mlch6bgk5klgbc@gmail.com Signed-off-by: Ingo Molnar --- tools/arch/arm/include/uapi/asm/kvm.h | 10 ++++++- tools/arch/arm64/include/uapi/asm/kvm.h | 10 ++++++- tools/arch/powerpc/include/uapi/asm/kvm.h | 3 +++ tools/arch/s390/include/uapi/asm/kvm.h | 26 ++++++++++++++++-- tools/arch/x86/include/asm/cpufeatures.h | 2 ++ .../arch/x86/include/asm/disabled-features.h | 8 +++++- .../arch/x86/include/asm/required-features.h | 8 +++++- tools/arch/x86/include/uapi/asm/kvm.h | 3 +++ tools/arch/x86/include/uapi/asm/vmx.h | 27 +++++++++++++------ tools/include/uapi/linux/stat.h | 8 ++---- 10 files changed, 85 insertions(+), 20 deletions(-) diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h index 6ebd3e6a1fd1..5e3c673fa3f4 100644 --- a/tools/arch/arm/include/uapi/asm/kvm.h +++ b/tools/arch/arm/include/uapi/asm/kvm.h @@ -27,6 +27,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) @@ -114,6 +116,8 @@ struct kvm_debug_exit_arch { }; struct kvm_sync_regs { + /* Used with KVM_CAP_ARM_USER_IRQ */ + __u64 device_irq_level; }; struct kvm_arch_memory_slot { @@ -192,13 +196,17 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 +#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff #define VGIC_LEVEL_INFO_LINE_LEVEL 0 -#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 +#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 +#define KVM_DEV_ARM_ITS_SAVE_TABLES 1 +#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 +#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index c2860358ae3e..70eea2ecc663 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -39,6 +39,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) @@ -143,6 +145,8 @@ struct kvm_debug_exit_arch { #define KVM_GUESTDBG_USE_HW (1 << 17) struct kvm_sync_regs { + /* Used with KVM_CAP_ARM_USER_IRQ */ + __u64 device_irq_level; }; struct kvm_arch_memory_slot { @@ -212,13 +216,17 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 +#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff #define VGIC_LEVEL_INFO_LINE_LEVEL 0 -#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 +#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 +#define KVM_DEV_ARM_ITS_SAVE_TABLES 1 +#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 +#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 /* Device Control API on vcpu fd */ #define KVM_ARM_VCPU_PMU_V3_CTRL 0 diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 4edbe4bb0e8b..07fbeb927834 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -29,6 +29,9 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_GUEST_DEBUG +/* Not always available, but if it is, this is the correct offset. */ +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + struct kvm_regs { __u64 pc; __u64 cr; diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 7f4fd65e9208..3dd2a1d308dd 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -26,6 +26,8 @@ #define KVM_DEV_FLIC_ADAPTER_REGISTER 6 #define KVM_DEV_FLIC_ADAPTER_MODIFY 7 #define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 +#define KVM_DEV_FLIC_AISM 9 +#define KVM_DEV_FLIC_AIRQ_INJECT 10 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. @@ -41,7 +43,14 @@ struct kvm_s390_io_adapter { __u8 isc; __u8 maskable; __u8 swap; - __u8 pad; + __u8 flags; +}; + +#define KVM_S390_ADAPTER_SUPPRESSIBLE 0x01 + +struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; }; #define KVM_S390_IO_ADAPTER_MASK 1 @@ -110,6 +119,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_CMMA 10 #define KVM_S390_VM_CPU_FEAT_PFMFI 11 #define KVM_S390_VM_CPU_FEAT_SIGPIF 12 +#define KVM_S390_VM_CPU_FEAT_KSS 13 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; @@ -198,6 +208,10 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_VRS (1UL << 6) #define KVM_SYNC_RICCB (1UL << 7) #define KVM_SYNC_FPRS (1UL << 8) +#define KVM_SYNC_GSCB (1UL << 9) +/* length and alignment of the sdnx as a power of two */ +#define SDNXC 8 +#define SDNXL (1UL << SDNXC) /* definition of registers in kvm_run */ struct kvm_sync_regs { __u64 prefix; /* prefix register */ @@ -218,8 +232,16 @@ struct kvm_sync_regs { }; __u8 reserved[512]; /* for future vector expansion */ __u32 fpc; /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */ - __u8 padding[52]; /* riccb needs to be 64byte aligned */ + __u8 padding1[52]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ + __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + union { + __u8 sdnx[SDNXL]; /* state description annex */ + struct { + __u64 reserved1[2]; + __u64 gscb[4]; + }; + }; }; #define KVM_REG_S390_TODPR (KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 0fe00446f9ca..2701e5f8145b 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -202,6 +202,8 @@ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 85599ad4d024..5dff775af7cd 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -36,6 +36,12 @@ # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ +#ifdef CONFIG_X86_5LEVEL +# define DISABLE_LA57 0 +#else +# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -55,7 +61,7 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57) #define DISABLED_MASK17 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index fac9a5c0abe9..d91ba04dd007 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -53,6 +53,12 @@ # define NEED_MOVBE 0 #endif +#ifdef CONFIG_X86_5LEVEL +# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31)) +#else +# define NEED_LA57 0 +#endif + #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT /* Paravirtualized systems may not have PSE or PGE available */ @@ -98,7 +104,7 @@ #define REQUIRED_MASK13 0 #define REQUIRED_MASK14 0 #define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 +#define REQUIRED_MASK16 (NEED_LA57) #define REQUIRED_MASK17 0 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 739c0c594022..c2824d02ba37 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,9 @@ #include #include +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + #define DE_VECTOR 0 #define DB_VECTOR 1 #define BP_VECTOR 3 diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index 14458658e988..690a2dcf4078 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -76,7 +76,11 @@ #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 @@ -90,6 +94,7 @@ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVLPG, "INVLPG" }, \ { EXIT_REASON_RDPMC, "RDPMC" }, \ { EXIT_REASON_RDTSC, "RDTSC" }, \ @@ -108,6 +113,8 @@ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ @@ -115,20 +122,24 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ - { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ + { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_RDTSCP, "RDTSCP" }, \ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ - { EXIT_REASON_WBINVD, "WBINVD" }, \ - { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ - { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ - { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ - { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ - { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVVPID, "INVVPID" }, \ + { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_XSETBV, "XSETBV" }, \ + { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ + { EXIT_REASON_RDRAND, "RDRAND" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_VMFUNC, "VMFUNC" }, \ + { EXIT_REASON_ENCLS, "ENCLS" }, \ + { EXIT_REASON_RDSEED, "RDSEED" }, \ + { EXIT_REASON_PML_FULL, "PML_FULL" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index d538897b8e08..17b10304c393 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -48,17 +48,13 @@ * tv_sec holds the number of seconds before (negative) or after (positive) * 00:00:00 1st January 1970 UTC. * - * tv_nsec holds a number of nanoseconds before (0..-999,999,999 if tv_sec is - * negative) or after (0..999,999,999 if tv_sec is positive) the tv_sec time. - * - * Note that if both tv_sec and tv_nsec are non-zero, then the two values must - * either be both positive or both negative. + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. * * __reserved is held in case we need a yet finer resolution. */ struct statx_timestamp { __s64 tv_sec; - __s32 tv_nsec; + __u32 tv_nsec; __s32 __reserved; }; From ebd574994c63164d538a197172157318f58ac647 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 23 May 2017 10:37:29 -0500 Subject: [PATCH 179/254] Revert "x86/entry: Fix the end of the stack for newly forked tasks" Petr Mladek reported the following warning when loading the livepatch sample module: WARNING: CPU: 1 PID: 3699 at arch/x86/kernel/stacktrace.c:132 save_stack_trace_tsk_reliable+0x133/0x1a0 ... Call Trace: __schedule+0x273/0x820 schedule+0x36/0x80 kthreadd+0x305/0x310 ? kthread_create_on_cpu+0x80/0x80 ? icmp_echo.part.32+0x50/0x50 ret_from_fork+0x2c/0x40 That warning means the end of the stack is no longer recognized as such for newly forked tasks. The problem was introduced with the following commit: ff3f7e2475bb ("x86/entry: Fix the end of the stack for newly forked tasks") ... which was completely misguided. It only partially fixed the reported issue, and it introduced another bug in the process. None of the other entry code saves the frame pointer before calling into C code, so it doesn't make sense for ret_from_fork to do so either. Contrary to what I originally thought, the original issue wasn't related to newly forked tasks. It was actually related to ftrace. When entry code calls into a function which then calls into an ftrace handler, the stack frame looks different than normal. The original issue will be fixed in the unwinder, in a subsequent patch. Reported-by: Petr Mladek Signed-off-by: Josh Poimboeuf Acked-by: Thomas Gleixner Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: live-patching@vger.kernel.org Fixes: ff3f7e2475bb ("x86/entry: Fix the end of the stack for newly forked tasks") Link: http://lkml.kernel.org/r/f350760f7e82f0750c8d1dd093456eb212751caa.1495553739.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 30 +++++++++++++++++++----------- arch/x86/entry/entry_64.S | 11 ++++------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 50bc26949e9e..48ef7bb32c42 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -251,6 +251,23 @@ ENTRY(__switch_to_asm) jmp __switch_to END(__switch_to_asm) +/* + * The unwinder expects the last frame on the stack to always be at the same + * offset from the end of the page, which allows it to validate the stack. + * Calling schedule_tail() directly would break that convention because its an + * asmlinkage function so its argument has to be pushed on the stack. This + * wrapper creates a proper "end of stack" frame header before the call. + */ +ENTRY(schedule_tail_wrapper) + FRAME_BEGIN + + pushl %eax + call schedule_tail + popl %eax + + FRAME_END + ret +ENDPROC(schedule_tail_wrapper) /* * A newly forked process directly context switches into this address. * @@ -259,24 +276,15 @@ END(__switch_to_asm) * edi: kernel thread arg */ ENTRY(ret_from_fork) - FRAME_BEGIN /* help unwinder find end of stack */ - - /* - * schedule_tail() is asmlinkage so we have to put its 'prev' argument - * on the stack. - */ - pushl %eax - call schedule_tail - popl %eax + call schedule_tail_wrapper testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ 2: /* When we fork, we trace the syscall return in the child, too. */ - leal FRAME_OFFSET(%esp), %eax + movl %esp, %eax call syscall_return_slowpath - FRAME_END jmp restore_all /* kernel thread */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 607d72c4a485..4a4c0834f965 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,7 +36,6 @@ #include #include #include -#include #include .code64 @@ -406,19 +405,17 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) - FRAME_BEGIN /* help unwinder find end of stack */ movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ + call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ 2: - leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */ + movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS - FRAME_END jmp restore_regs_and_iret 1: From 519fb5c3350d1b5225b27b1cac55144f79351718 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 23 May 2017 10:37:30 -0500 Subject: [PATCH 180/254] x86/unwind: Add end-of-stack check for ftrace handlers Dave Jones and Steven Rostedt reported unwinder warnings like the following: WARNING: kernel stack frame pointer at ffff8800bda0ff30 in sshd:1090 has bad value 000055b32abf1fa8 In both cases, the unwinder was attempting to unwind from an ftrace handler into entry code. The callchain was something like: syscall entry code C function ftrace handler save_stack_trace() The problem is that the unwinder's end-of-stack logic gets confused by the way ftrace lays out the stack frame (with fentry enabled). I was able to recreate this warning with: echo call_usermodehelper_exec_async:stacktrace > /sys/kernel/debug/tracing/set_ftrace_filter (exit login session) I considered fixing this by changing the ftrace code to rewrite the stack to make the unwinder happy. But that seemed too intrusive after I implemented it. Instead, just add another check to the unwinder's end-of-stack logic to detect this special case. Side note: We could probably get rid of these end-of-stack checks by encoding the frame pointer for syscall entry just like we do for interrupt entry. That would be simpler, but it would also be a lot more intrusive since it would slightly affect the performance of every syscall. Reported-by: Dave Jones Reported-by: Steven Rostedt Signed-off-by: Josh Poimboeuf Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Petr Mladek Cc: live-patching@vger.kernel.org Fixes: c32c47c68a0a ("x86/unwind: Warn on bad frame pointer") Link: http://lkml.kernel.org/r/671ba22fbc0156b8f7e0cfa5ab2a795e08bc37e1.1495553739.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 49 +++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 82c6d7f1fd73..b9389d72b2f7 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -104,6 +104,11 @@ static inline unsigned long *last_frame(struct unwind_state *state) return (unsigned long *)task_pt_regs(state->task) - 2; } +static bool is_last_frame(struct unwind_state *state) +{ + return state->bp == last_frame(state); +} + #ifdef CONFIG_X86_32 #define GCC_REALIGN_WORDS 3 #else @@ -115,16 +120,15 @@ static inline unsigned long *last_aligned_frame(struct unwind_state *state) return last_frame(state) - GCC_REALIGN_WORDS; } -static bool is_last_task_frame(struct unwind_state *state) +static bool is_last_aligned_frame(struct unwind_state *state) { unsigned long *last_bp = last_frame(state); unsigned long *aligned_bp = last_aligned_frame(state); /* - * We have to check for the last task frame at two different locations - * because gcc can occasionally decide to realign the stack pointer and - * change the offset of the stack frame in the prologue of a function - * called by head/entry code. Examples: + * GCC can occasionally decide to realign the stack pointer and change + * the offset of the stack frame in the prologue of a function called + * by head/entry code. Examples: * * : * push %edi @@ -141,11 +145,38 @@ static bool is_last_task_frame(struct unwind_state *state) * push %rbp * mov %rsp,%rbp * - * Note that after aligning the stack, it pushes a duplicate copy of - * the return address before pushing the frame pointer. + * After aligning the stack, it pushes a duplicate copy of the return + * address before pushing the frame pointer. */ - return (state->bp == last_bp || - (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1))); + return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1)); +} + +static bool is_last_ftrace_frame(struct unwind_state *state) +{ + unsigned long *last_bp = last_frame(state); + unsigned long *last_ftrace_bp = last_bp - 3; + + /* + * When unwinding from an ftrace handler of a function called by entry + * code, the stack layout of the last frame is: + * + * bp + * parent ret addr + * bp + * function ret addr + * parent ret addr + * pt_regs + * ----------------- + */ + return (state->bp == last_ftrace_bp && + *state->bp == *(state->bp + 2) && + *(state->bp + 1) == *(state->bp + 4)); +} + +static bool is_last_task_frame(struct unwind_state *state) +{ + return is_last_frame(state) || is_last_aligned_frame(state) || + is_last_ftrace_frame(state); } /* From 7e6091209f7f73e2a81943020793b5ad26d645c6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 23 May 2017 18:27:54 +0200 Subject: [PATCH 181/254] x86/build: Permit building with old make versions At least Make 3.82 dislikes the tab in front of the $(warning) function: arch/x86/Makefile:162: *** recipe commences before first target. Stop. Let's be gentle. Signed-off-by: Jan Kiszka Acked-by: Thomas Gleixner Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1944fcd8-e3df-d1f7-c0e4-60aeb1917a24@siemens.com Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5851411e60fb..bf240b920473 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -159,7 +159,7 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER # If '-Os' is enabled, disable it and print a warning. ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE undefine CONFIG_CC_OPTIMIZE_FOR_SIZE - $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.) + $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.) endif endif From c9525a3fab63fbe091007494f8b7a06438eea6a7 Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Sat, 20 May 2017 17:20:16 -0700 Subject: [PATCH 182/254] x86/watchdog: Fix Kconfig help text file path reference to lockup watchdog documentation Signed-off-by: Benjamin Peterson Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: 9919cba7ff71147803c988521cc1ceb80e7f0f6d ("watchdog: Update documentation") Link: http://lkml.kernel.org/r/20170521002016.13258-1-bp@benjamin.pe Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd18994a9555..4ccfacc7232a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -360,7 +360,7 @@ config SMP Management" code will be disabled if you say Y here. See also , - and the SMP-HOWTO available at + and the SMP-HOWTO available at . If you don't know what to do here, say N. From cbed27cdf0e3f7ea3b2259e86b9e34df02be3fe4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2017 15:07:11 -0400 Subject: [PATCH 183/254] x86/PAT: Fix Xorg regression on CPUs that don't support PAT In the file arch/x86/mm/pat.c, there's a '__pat_enabled' variable. The variable is set to 1 by default and the function pat_init() sets __pat_enabled to 0 if the CPU doesn't support PAT. However, on AMD K6-3 CPUs, the processor initialization code never calls pat_init() and so __pat_enabled stays 1 and the function pat_enabled() returns true, even though the K6-3 CPU doesn't support PAT. The result of this bug is that a kernel warning is produced when attempting to start the Xserver and the Xserver doesn't start (fork() returns ENOMEM). Another symptom of this bug is that the framebuffer driver doesn't set the K6-3 MTRR registers: x86/PAT: Xorg:3891 map pfn expected mapping type uncached-minus for [mem 0xe4000000-0xe5ffffff], got write-combining ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3891 at arch/x86/mm/pat.c:1020 untrack_pfn+0x5c/0x9f ... x86/PAT: Xorg:3891 map pfn expected mapping type uncached-minus for [mem 0xe4000000-0xe5ffffff], got write-combining To fix the bug change pat_enabled() so that it returns true only if PAT initialization was actually done. Also, I changed boot_cpu_has(X86_FEATURE_PAT) to this_cpu_has(X86_FEATURE_PAT) in pat_ap_init(), so that we check the PAT feature on the processor that is being initialized. Signed-off-by: Mikulas Patocka Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: stable@vger.kernel.org # v4.2+ Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1704181501450.26399@file01.intranet.prod.int.rdu2.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 9b78685b66e6..83a59a67757a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -65,9 +65,11 @@ static int __init nopat(char *str) } early_param("nopat", nopat); +static bool __read_mostly __pat_initialized = false; + bool pat_enabled(void) { - return !!__pat_enabled; + return __pat_initialized; } EXPORT_SYMBOL_GPL(pat_enabled); @@ -225,13 +227,14 @@ static void pat_bsp_init(u64 pat) } wrmsrl(MSR_IA32_CR_PAT, pat); + __pat_initialized = true; __init_cache_modes(pat); } static void pat_ap_init(u64 pat) { - if (!boot_cpu_has(X86_FEATURE_PAT)) { + if (!this_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. @@ -306,7 +309,7 @@ void pat_init(void) u64 pat; struct cpuinfo_x86 *c = &boot_cpu_data; - if (!pat_enabled()) { + if (!__pat_enabled) { init_cache_modes(); return; } From fc152d22d6e9fac95a9a990e6c29510bdf1b9425 Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 24 May 2017 15:55:00 +0200 Subject: [PATCH 184/254] x86/alternatives: Prevent uninitialized stack byte read in apply_alternatives() In the current form of the code, if a->replacementlen is 0, the reference to *insnbuf for comparison touches potentially garbage memory. While it doesn't affect the execution flow due to the subsequent a->replacementlen comparison, it is (rightly) detected as use of uninitialized memory by a runtime instrumentation currently under my development, and could be detected as such by other tools in the future, too (e.g. KMSAN). Fix the "false-positive" by reordering the conditions to first check the replacement instruction length before referencing specific opcode bytes. Signed-off-by: Mateusz Jurczyk Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/20170524135500.27223-1-mjurczyk@google.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/alternative.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c5b8f760473c..32e14d137416 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -409,8 +409,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, memcpy(insnbuf, replacement, a->replacementlen); insnbuf_sz = a->replacementlen; - /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) { + /* + * 0xe8 is a relative jump; fix the offset. + * + * Instruction length is checked before the opcode to avoid + * accessing uninitialized bytes for zero-length replacements. + */ + if (a->replacementlen == 5 && *insnbuf == 0xe8) { *(s32 *)(insnbuf + 1) += replacement - instr; DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", *(s32 *)(insnbuf + 1), From 0a2ad541071f99eaf4589c3551176fca191c1ee2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 5 May 2017 18:47:37 +0800 Subject: [PATCH 185/254] libceph: cleanup old messages according to reconnect seq when reopen a connection, use 'reconnect seq' to clean up messages that have already been received by peer. Link: http://tracker.ceph.com/issues/18690 Signed-off-by: "Yan, Zheng" Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d7ab481b2508..588a91930051 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2228,10 +2228,18 @@ static void process_ack(struct ceph_connection *con) struct ceph_msg *m; u64 ack = le64_to_cpu(con->in_temp_ack); u64 seq; + bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ); + struct list_head *list = reconnect ? &con->out_queue : &con->out_sent; - while (!list_empty(&con->out_sent)) { - m = list_first_entry(&con->out_sent, struct ceph_msg, - list_head); + /* + * In the reconnect case, con_fault() has requeued messages + * in out_sent. We should cleanup old messages according to + * the reconnect seq. + */ + while (!list_empty(list)) { + m = list_first_entry(list, struct ceph_msg, list_head); + if (reconnect && m->needs_out_seq) + break; seq = le64_to_cpu(m->hdr.seq); if (seq > ack) break; @@ -2240,6 +2248,7 @@ static void process_ack(struct ceph_connection *con) m->ack_stamp = jiffies; ceph_msg_remove(m); } + prepare_read_tag(con); } From 42c99fc4c7069371da7b04b9099319dd1c633ee2 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 5 May 2017 18:28:44 +0100 Subject: [PATCH 186/254] ceph: check that the new inode size is within limits in ceph_fallocate() Currently the ceph client doesn't respect the rlimit in fallocate. This means that a user can allocate a file with size > RLIMIT_FSIZE. This patch adds the call to inode_newsize_ok() to verify filesystem limits and ulimits. This should make ceph successfully run xfstest generic/228. Signed-off-by: Luis Henriques Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3fdde0b283c9..29308a80d66f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1671,8 +1671,12 @@ static long ceph_fallocate(struct file *file, int mode, } size = i_size_read(inode); - if (!(mode & FALLOC_FL_KEEP_SIZE)) + if (!(mode & FALLOC_FL_KEEP_SIZE)) { endoff = offset + length; + ret = inode_newsize_ok(inode, endoff); + if (ret) + goto unlock; + } if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; From 46e3813d72abb018f0cd6e72389004db8728c738 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 18 May 2017 18:34:35 -0300 Subject: [PATCH 187/254] MAINTAINERS/serial: Change maintainer of jsm driver Gabriel will no longer maintain this driver, so I'm adding myself as maintainer. Thanks for all your work on jsm driver Gabriel. Signed-off-by: Guilherme G. Piccoli Acked-by: Gabriel Krisman Bertazi Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f7d568b8f133..a3030d04ef41 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7136,7 +7136,7 @@ S: Maintained F: drivers/media/platform/rcar_jpu.c JSM Neo PCI based serial card -M: Gabriel Krisman Bertazi +M: Guilherme G. Piccoli L: linux-serial@vger.kernel.org S: Maintained F: drivers/tty/serial/jsm/ From 3ab2137915aea0ce7b3ec02e0f260ecc0f1c289d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 23 May 2017 13:28:54 +0800 Subject: [PATCH 188/254] sctp: fix stream update when processing dupcookie Since commit 3dbcc105d556 ("sctp: alloc stream info when initializing asoc"), stream and stream.out info are always alloced when creating an asoc. So it's not correct to check !asoc->stream before updating stream info when processing dupcookie, but would be better to check asoc state instead. Fixes: 3dbcc105d556 ("sctp: alloc stream info when initializing asoc") Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/associola.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index a9708da28eb5..95238284c422 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1176,7 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - if (!asoc->stream) { + + if (sctp_state(asoc, COOKIE_WAIT)) { + sctp_stream_free(asoc->stream); asoc->stream = new->stream; new->stream = NULL; } From 7e06297768886337707f5833942b3bd524a6d3d5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 23 May 2017 13:28:55 +0800 Subject: [PATCH 189/254] sctp: set new_asoc temp when processing dupcookie After sctp changed to use transport hashtable, a transport would be added into global hashtable when adding the peer to an asoc, then the asoc can be got by searching the transport in the hashtbale. The problem is when processing dupcookie in sctp_sf_do_5_2_4_dupcook, a new asoc would be created. A peer with the same addr and port as the one in the old asoc might be added into the new asoc, but fail to be added into the hashtable, as they also belong to the same sk. It causes that sctp's dupcookie processing can not really work. Since the new asoc will be freed after copying it's information to the old asoc, it's more like a temp asoc. So this patch is to fix it by setting it as a temp asoc to avoid adding it's any transport into the hashtable and also avoid allocing assoc_id. An extra thing it has to do is to also alloc stream info for any temp asoc, as sctp dupcookie process needs it to update old asoc. But I don't think it would hurt something, as a temp asoc would always be freed after finishing processing cookie echo packet. Reported-by: Jianwen Ji Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 13 ++++--------- net/sctp/sm_statefuns.c | 3 +++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8a08f13469c4..92e332e17391 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2454,16 +2454,11 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * stream sequence number shall be set to 0. */ - /* Allocate storage for the negotiated streams if it is not a temporary - * association. - */ - if (!asoc->temp) { - if (sctp_stream_init(asoc, gfp)) - goto clean_up; + if (sctp_stream_init(asoc, gfp)) + goto clean_up; - if (sctp_assoc_set_id(asoc, gfp)) - goto clean_up; - } + if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) + goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4f5e6cfc7f60..f863b5573e42 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2088,6 +2088,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, } } + /* Set temp so that it won't be added into hashtable */ + new_asoc->temp = 1; + /* Compare the tie_tag in cookie with the verification tag of * current association. */ From 159a07604a99bd01e7db112de08d53dc4fcad109 Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Tue, 23 May 2017 11:48:08 +0200 Subject: [PATCH 190/254] net: fec: add post PHY reset delay DT property Some PHY require to wait for a bit after the reset GPIO has been toggled. This adds support for the DT property `phy-reset-post-delay` which gives the delay in milliseconds to wait after reset. If the DT property is not given, no delay is observed. Post reset delay greater than 1000ms are invalid. Signed-off-by: Quentin Schulz Reviewed-by: Andrew Lunn Acked-by: Fugang Duan Signed-off-by: David S. Miller --- .../devicetree/bindings/net/fsl-fec.txt | 4 ++++ drivers/net/ethernet/freescale/fec_main.c | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/fsl-fec.txt b/Documentation/devicetree/bindings/net/fsl-fec.txt index a1e3693cca16..6f55bdd52f8a 100644 --- a/Documentation/devicetree/bindings/net/fsl-fec.txt +++ b/Documentation/devicetree/bindings/net/fsl-fec.txt @@ -15,6 +15,10 @@ Optional properties: - phy-reset-active-high : If present then the reset sequence using the GPIO specified in the "phy-reset-gpios" property is reversed (H=reset state, L=operation state). +- phy-reset-post-delay : Post reset delay in milliseconds. If present then + a delay of phy-reset-post-delay milliseconds will be observed after the + phy-reset-gpios has been toggled. Can be omitted thus no delay is + observed. Delay is in range of 1ms to 1000ms. Other delays are invalid. - phy-supply : regulator that powers the Ethernet PHY. - phy-handle : phandle to the PHY device connected to this device. - fixed-link : Assume a fixed link. See fixed-link.txt in the same directory. diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 56a563f90b0b..f7c8649fd28f 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3192,7 +3192,7 @@ static int fec_reset_phy(struct platform_device *pdev) { int err, phy_reset; bool active_high = false; - int msec = 1; + int msec = 1, phy_post_delay = 0; struct device_node *np = pdev->dev.of_node; if (!np) @@ -3209,6 +3209,11 @@ static int fec_reset_phy(struct platform_device *pdev) else if (!gpio_is_valid(phy_reset)) return 0; + err = of_property_read_u32(np, "phy-reset-post-delay", &phy_post_delay); + /* valid reset duration should be less than 1s */ + if (!err && phy_post_delay > 1000) + return -EINVAL; + active_high = of_property_read_bool(np, "phy-reset-active-high"); err = devm_gpio_request_one(&pdev->dev, phy_reset, @@ -3226,6 +3231,15 @@ static int fec_reset_phy(struct platform_device *pdev) gpio_set_value_cansleep(phy_reset, !active_high); + if (!phy_post_delay) + return 0; + + if (phy_post_delay > 20) + msleep(phy_post_delay); + else + usleep_range(phy_post_delay * 1000, + phy_post_delay * 1000 + 1000); + return 0; } #else /* CONFIG_OF */ From 0ff50e83b5122e836ca492fefb11656b225ac29c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 23 May 2017 13:20:28 +0200 Subject: [PATCH 191/254] net: rtnetlink: bail out from rtnl_fdb_dump() on parse error rtnl_fdb_dump() failed to check the result of nlmsg_parse(), which led to contents of |ifm| being uninitialized because nlh->nlmsglen was too small to accommodate |ifm|. The uninitialized data may affect some branches and result in unwanted effects, although kernel data doesn't seem to leak to the userspace directly. The bug has been detected with KMSAN and syzkaller. For the record, here is the KMSAN report: ================================================================== BUG: KMSAN: use of unitialized memory in rtnl_fdb_dump+0x5dc/0x1000 CPU: 0 PID: 1039 Comm: probe Not tainted 4.11.0-rc5+ #2727 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x143/0x1b0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:1007 __kmsan_warning_32+0x66/0xb0 mm/kmsan/kmsan_instr.c:491 rtnl_fdb_dump+0x5dc/0x1000 net/core/rtnetlink.c:3230 netlink_dump+0x84f/0x1190 net/netlink/af_netlink.c:2168 __netlink_dump_start+0xc97/0xe50 net/netlink/af_netlink.c:2258 netlink_dump_start ./include/linux/netlink.h:165 rtnetlink_rcv_msg+0xae9/0xb40 net/core/rtnetlink.c:4094 netlink_rcv_skb+0x339/0x5a0 net/netlink/af_netlink.c:2339 rtnetlink_rcv+0x83/0xa0 net/core/rtnetlink.c:4110 netlink_unicast_kernel net/netlink/af_netlink.c:1272 netlink_unicast+0x13b7/0x1480 net/netlink/af_netlink.c:1298 netlink_sendmsg+0x10b8/0x10f0 net/netlink/af_netlink.c:1844 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 ___sys_sendmsg+0xd4b/0x10f0 net/socket.c:1997 __sys_sendmsg net/socket.c:2031 SYSC_sendmsg+0x2c6/0x3f0 net/socket.c:2042 SyS_sendmsg+0x87/0xb0 net/socket.c:2038 do_syscall_64+0x102/0x150 arch/x86/entry/common.c:285 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:246 RIP: 0033:0x401300 RSP: 002b:00007ffc3b0e6d58 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000401300 RDX: 0000000000000000 RSI: 00007ffc3b0e6d80 RDI: 0000000000000003 RBP: 00007ffc3b0e6e00 R08: 000000000000000b R09: 0000000000000004 R10: 000000000000000d R11: 0000000000000246 R12: 0000000000000000 R13: 00000000004065a0 R14: 0000000000406630 R15: 0000000000000000 origin: 000000008fe00056 save_stack_trace+0x59/0x60 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:352 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:247 kmsan_poison_shadow+0x6d/0xc0 mm/kmsan/kmsan.c:260 slab_alloc_node mm/slub.c:2743 __kmalloc_node_track_caller+0x1f4/0x390 mm/slub.c:4349 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x2cd/0x740 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:933 netlink_alloc_large_skb net/netlink/af_netlink.c:1144 netlink_sendmsg+0x934/0x10f0 net/netlink/af_netlink.c:1819 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 ___sys_sendmsg+0xd4b/0x10f0 net/socket.c:1997 __sys_sendmsg net/socket.c:2031 SYSC_sendmsg+0x2c6/0x3f0 net/socket.c:2042 SyS_sendmsg+0x87/0xb0 net/socket.c:2038 do_syscall_64+0x102/0x150 arch/x86/entry/common.c:285 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:246 ================================================================== and the reproducer: ================================================================== #include #include #include #include int main() { int sock = socket(PF_NETLINK, SOCK_DGRAM | SOCK_NONBLOCK, 0); struct msghdr msg; memset(&msg, 0, sizeof(msg)); char nlmsg_buf[32]; memset(nlmsg_buf, 0, sizeof(nlmsg_buf)); struct nlmsghdr *nlmsg = nlmsg_buf; nlmsg->nlmsg_len = 0x11; nlmsg->nlmsg_type = 0x1e; // RTM_NEWROUTE = RTM_BASE + 0x0e // type = 0x0e = 1110b // kind = 2 nlmsg->nlmsg_flags = 0x101; // NLM_F_ROOT | NLM_F_REQUEST nlmsg->nlmsg_seq = 0; nlmsg->nlmsg_pid = 0; nlmsg_buf[16] = (char)7; struct iovec iov; iov.iov_base = nlmsg_buf; iov.iov_len = 17; msg.msg_iov = &iov; msg.msg_iovlen = 1; sendmsg(sock, &msg, 0); return 0; } ================================================================== Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 49a279a7cc15..9e2c0a7cb325 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3231,8 +3231,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL) == 0) { + err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { if (tb[IFLA_MASTER]) br_idx = nla_get_u32(tb[IFLA_MASTER]); } From cd47512e51190efc34a6b90d5c6b54de036ea421 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 23 May 2017 08:19:49 -0700 Subject: [PATCH 192/254] net/phy: fix mdio-octeon dependency and build Fix build errors by making this driver depend on OF_MDIO, like several other similar drivers do. drivers/built-in.o: In function `octeon_mdiobus_remove': mdio-octeon.c:(.text+0x196ee0): undefined reference to `mdiobus_unregister' mdio-octeon.c:(.text+0x196ee8): undefined reference to `mdiobus_free' drivers/built-in.o: In function `octeon_mdiobus_probe': mdio-octeon.c:(.text+0x196f1d): undefined reference to `devm_mdiobus_alloc_size' mdio-octeon.c:(.text+0x196ffe): undefined reference to `of_mdiobus_register' mdio-octeon.c:(.text+0x197010): undefined reference to `mdiobus_free' Signed-off-by: Randy Dunlap Cc: Andrew Lunn Cc: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 60ffc9da6a28..c360dd6ead22 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -108,7 +108,7 @@ config MDIO_MOXART config MDIO_OCTEON tristate "Octeon and some ThunderX SOCs MDIO buses" depends on 64BIT - depends on HAS_IOMEM + depends on HAS_IOMEM && OF_MDIO select MDIO_CAVIUM help This module provides a driver for the Octeon and ThunderX MDIO From f2899788353c13891412b273fdff5f02d49aa40f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 23 May 2017 17:49:13 +0200 Subject: [PATCH 193/254] net: phy: marvell: Limit errata to 88m1101 The 88m1101 has an errata when configuring autoneg. However, it was being applied to many other Marvell PHYs as well. Limit its scope to just the 88m1101. Fixes: 76884679c644 ("phylib: Add support for Marvell 88e1111S and 88e1145") Reported-by: Daniel Walker Signed-off-by: Andrew Lunn Acked-by: Harini Katakam Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 66 ++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 272b051a0199..9097e42bec2e 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -255,34 +255,6 @@ static int marvell_config_aneg(struct phy_device *phydev) { int err; - /* The Marvell PHY has an errata which requires - * that certain registers get written in order - * to restart autonegotiation */ - err = phy_write(phydev, MII_BMCR, BMCR_RESET); - - if (err < 0) - return err; - - err = phy_write(phydev, 0x1d, 0x1f); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0x200c); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1d, 0x5); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0x100); - if (err < 0) - return err; - err = marvell_set_polarity(phydev, phydev->mdix_ctrl); if (err < 0) return err; @@ -316,6 +288,42 @@ static int marvell_config_aneg(struct phy_device *phydev) return 0; } +static int m88e1101_config_aneg(struct phy_device *phydev) +{ + int err; + + /* This Marvell PHY has an errata which requires + * that certain registers get written in order + * to restart autonegotiation + */ + err = phy_write(phydev, MII_BMCR, BMCR_RESET); + + if (err < 0) + return err; + + err = phy_write(phydev, 0x1d, 0x1f); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0x200c); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1d, 0x5); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0x100); + if (err < 0) + return err; + + return marvell_config_aneg(phydev); +} + static int m88e1111_config_aneg(struct phy_device *phydev) { int err; @@ -1892,7 +1900,7 @@ static struct phy_driver marvell_drivers[] = { .flags = PHY_HAS_INTERRUPT, .probe = marvell_probe, .config_init = &marvell_config_init, - .config_aneg = &marvell_config_aneg, + .config_aneg = &m88e1101_config_aneg, .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, From b3c85a0fb2c79f2c945fa1305b39974d0acf3105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 10 May 2017 20:06:58 +0200 Subject: [PATCH 194/254] drm/amdgpu: fix fundamental suspend/resume issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reinitializing the VM manager during suspend/resume is a very very bad idea since all the VMs are still active and kicking. This can lead to random VM faults after resume when new processes become the same client ID assigned. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 22 +++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 + drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 15 ++------------- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 15 ++------------- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 15 ++------------- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 16 ++-------------- 6 files changed, 30 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 07ff3b1514f1..1bf36c3542c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -672,6 +672,7 @@ void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, struct amdgpu_vm_id_manager *id_mgr = &adev->vm_manager.id_mgr[vmhub]; struct amdgpu_vm_id *id = &id_mgr->ids[vmid]; + atomic64_set(&id->owner, 0); id->gds_base = 0; id->gds_size = 0; id->gws_base = 0; @@ -680,6 +681,26 @@ void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, id->oa_size = 0; } +/** + * amdgpu_vm_reset_all_id - reset VMID to zero + * + * @adev: amdgpu device structure + * + * Reset VMID to force flush on next use + */ +void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev) +{ + unsigned i, j; + + for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) { + struct amdgpu_vm_id_manager *id_mgr = + &adev->vm_manager.id_mgr[i]; + + for (j = 1; j < id_mgr->num_ids; ++j) + amdgpu_vm_reset_id(adev, i, j); + } +} + /** * amdgpu_vm_bo_find - find the bo_va for a specific vm & bo * @@ -2270,7 +2291,6 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) for (i = 0; i < AMDGPU_MAX_RINGS; ++i) adev->vm_manager.seqno[i] = 0; - atomic_set(&adev->vm_manager.vm_pte_next_ring, 0); atomic64_set(&adev->vm_manager.client_counter, 0); spin_lock_init(&adev->vm_manager.prt_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index d97e28b4bdc4..e1d951ece433 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -204,6 +204,7 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring, int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job); void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, unsigned vmid); +void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev); int amdgpu_vm_update_directories(struct amdgpu_device *adev, struct amdgpu_vm *vm); int amdgpu_vm_clear_freed(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index a572979f186c..d860939152df 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -950,10 +950,6 @@ static int gmc_v6_0_suspend(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->vm_manager.enabled) { - gmc_v6_0_vm_fini(adev); - adev->vm_manager.enabled = false; - } gmc_v6_0_hw_fini(adev); return 0; @@ -968,16 +964,9 @@ static int gmc_v6_0_resume(void *handle) if (r) return r; - if (!adev->vm_manager.enabled) { - r = gmc_v6_0_vm_init(adev); - if (r) { - dev_err(adev->dev, "vm manager initialization failed (%d).\n", r); - return r; - } - adev->vm_manager.enabled = true; - } + amdgpu_vm_reset_all_ids(adev); - return r; + return 0; } static bool gmc_v6_0_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index a9083a16a250..2750e5c23813 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -1117,10 +1117,6 @@ static int gmc_v7_0_suspend(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->vm_manager.enabled) { - gmc_v7_0_vm_fini(adev); - adev->vm_manager.enabled = false; - } gmc_v7_0_hw_fini(adev); return 0; @@ -1135,16 +1131,9 @@ static int gmc_v7_0_resume(void *handle) if (r) return r; - if (!adev->vm_manager.enabled) { - r = gmc_v7_0_vm_init(adev); - if (r) { - dev_err(adev->dev, "vm manager initialization failed (%d).\n", r); - return r; - } - adev->vm_manager.enabled = true; - } + amdgpu_vm_reset_all_ids(adev); - return r; + return 0; } static bool gmc_v7_0_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 4ac99784160a..f56b4089ee9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1209,10 +1209,6 @@ static int gmc_v8_0_suspend(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->vm_manager.enabled) { - gmc_v8_0_vm_fini(adev); - adev->vm_manager.enabled = false; - } gmc_v8_0_hw_fini(adev); return 0; @@ -1227,16 +1223,9 @@ static int gmc_v8_0_resume(void *handle) if (r) return r; - if (!adev->vm_manager.enabled) { - r = gmc_v8_0_vm_init(adev); - if (r) { - dev_err(adev->dev, "vm manager initialization failed (%d).\n", r); - return r; - } - adev->vm_manager.enabled = true; - } + amdgpu_vm_reset_all_ids(adev); - return r; + return 0; } static bool gmc_v8_0_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index dc1e1c1d6b24..f936332a069d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -791,10 +791,6 @@ static int gmc_v9_0_suspend(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->vm_manager.enabled) { - gmc_v9_0_vm_fini(adev); - adev->vm_manager.enabled = false; - } gmc_v9_0_hw_fini(adev); return 0; @@ -809,17 +805,9 @@ static int gmc_v9_0_resume(void *handle) if (r) return r; - if (!adev->vm_manager.enabled) { - r = gmc_v9_0_vm_init(adev); - if (r) { - dev_err(adev->dev, - "vm manager initialization failed (%d).\n", r); - return r; - } - adev->vm_manager.enabled = true; - } + amdgpu_vm_reset_all_ids(adev); - return r; + return 0; } static bool gmc_v9_0_is_idle(void *handle) From 35d2f80b07bbe03fb358afb0bdeff7437a7d67ff Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:41 -0400 Subject: [PATCH 195/254] vlan: Fix tcp checksum offloads in Q-in-Q vlans It appears that TCP checksum offloading has been broken for Q-in-Q vlans. The behavior was execerbated by the series commit afb0bc972b52 ("Merge branch 'stacked_vlan_tso'") that that enabled accleleration features on stacked vlans. However, event without that series, it is possible to trigger this issue. It just requires a lot more specialized configuration. The root cause is the interaction between how netdev_intersect_features() works, the features actually set on the vlan devices and HW having the ability to run checksum with longer headers. The issue starts when netdev_interesect_features() replaces NETIF_F_HW_CSUM with a combination of NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM, if the HW advertises IP|IPV6 specific checksums. This happens for tagged and multi-tagged packets. However, HW that enables IP|IPV6 checksum offloading doesn't gurantee that packets with arbitrarily long headers can be checksummed. This patch disables IP|IPV6 checksums on the packet for multi-tagged packets. CC: Toshiaki Makita CC: Michal Kubecek Signed-off-by: Vladislav Yasevich Acked-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 8d5fcd6284ce..283dc2f5364d 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -614,14 +614,16 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) static inline netdev_features_t vlan_features_check(const struct sk_buff *skb, netdev_features_t features) { - if (skb_vlan_tagged_multi(skb)) - features = netdev_intersect_features(features, - NETIF_F_SG | - NETIF_F_HIGHDMA | - NETIF_F_FRAGLIST | - NETIF_F_HW_CSUM | - NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + if (skb_vlan_tagged_multi(skb)) { + /* In the case of multi-tagged packets, use a direct mask + * instead of using netdev_interesect_features(), to make + * sure that only devices supporting NETIF_F_HW_CSUM will + * have checksum offloading support. + */ + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | + NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + } return features; } From cc6e9de62a7f84c9293a2ea41bc412b55bb46e85 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:42 -0400 Subject: [PATCH 196/254] be2net: Fix offload features for Q-in-Q packets At least some of the be2net cards do not seem to be capabled of performing checksum offload computions on Q-in-Q packets. In these case, the recevied checksum on the remote is invalid and TCP syn packets are dropped. This patch adds a call to check disbled acceleration features on Q-in-Q tagged traffic. CC: Sathya Perla CC: Ajit Khaparde CC: Sriharsha Basavapatna CC: Somnath Kotur Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index f3a09ab55900..4eee18ce9be4 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5078,9 +5078,11 @@ static netdev_features_t be_features_check(struct sk_buff *skb, struct be_adapter *adapter = netdev_priv(dev); u8 l4_hdr = 0; - /* The code below restricts offload features for some tunneled packets. + /* The code below restricts offload features for some tunneled and + * Q-in-Q packets. * Offload features for normal (non tunnel) packets are unchanged. */ + features = vlan_features_check(skb, features); if (!skb->encapsulation || !(adapter->flags & BE_FLAGS_VXLAN_OFFLOADS)) return features; From 2836b4f224d4fd7d1a2b23c3eecaf0f0ae199a74 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:43 -0400 Subject: [PATCH 197/254] virtio-net: enable TSO/checksum offloads for Q-in-Q vlans Since virtio does not provide it's own ndo_features_check handler, TSO, and now checksum offload, are disabled for stacked vlans. Re-enable the support and let the host take care of it. This restores/improves Guest-to-Guest performance over Q-in-Q vlans. Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 9320d96a1632..3e9246cc49c3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1989,6 +1989,7 @@ static const struct net_device_ops virtnet_netdev = { .ndo_poll_controller = virtnet_netpoll, #endif .ndo_xdp = virtnet_xdp, + .ndo_features_check = passthru_features_check, }; static void virtnet_config_changed_work(struct work_struct *work) From 0a646f331db0eb9efc8d3a95a44872036d441d58 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:10:02 -0400 Subject: [PATCH 198/254] drm/amdgpu/ci: disable mclk switching for high refresh rates (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if the vblank period would allow it, it still seems to be problematic on some cards. v2: fix logic inversion (Nils) bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Cc: stable@vger.kernel.org Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/ci_dpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/ci_dpm.c b/drivers/gpu/drm/amd/amdgpu/ci_dpm.c index 6dc1410b380f..ec93714e4524 100644 --- a/drivers/gpu/drm/amd/amdgpu/ci_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/ci_dpm.c @@ -906,6 +906,12 @@ static bool ci_dpm_vblank_too_short(struct amdgpu_device *adev) u32 vblank_time = amdgpu_dpm_get_vblank_time(adev); u32 switch_limit = adev->mc.vram_type == AMDGPU_VRAM_TYPE_GDDR5 ? 450 : 300; + /* disable mclk switching if the refresh is >120Hz, even if the + * blanking period would allow it + */ + if (amdgpu_dpm_get_vrefresh(adev) > 120) + return true; + if (vblank_time < switch_limit) return true; else From 58d7e3e427db1bd68f33025519a9468140280a75 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:14:14 -0400 Subject: [PATCH 199/254] drm/radeon/ci: disable mclk switching for high refresh rates (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if the vblank period would allow it, it still seems to be problematic on some cards. v2: fix logic inversion (Nils) bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Cc: stable@vger.kernel.org Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/ci_dpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/radeon/ci_dpm.c b/drivers/gpu/drm/radeon/ci_dpm.c index 7ba450832e6b..ea36dc4dd5d2 100644 --- a/drivers/gpu/drm/radeon/ci_dpm.c +++ b/drivers/gpu/drm/radeon/ci_dpm.c @@ -776,6 +776,12 @@ bool ci_dpm_vblank_too_short(struct radeon_device *rdev) u32 vblank_time = r600_dpm_get_vblank_time(rdev); u32 switch_limit = pi->mem_gddr5 ? 450 : 300; + /* disable mclk switching if the refresh is >120Hz, even if the + * blanking period would allow it + */ + if (r600_dpm_get_vrefresh(rdev) > 120) + return true; + if (vblank_time < switch_limit) return true; else From 09be4a5219610a6fae3215d4f51f948d6f5d2609 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:46:12 -0400 Subject: [PATCH 200/254] drm/amd/powerplay/smu7: add vblank check for mclk switching (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check to make sure the vblank period is long enough to support mclk switching. v2: drop needless initial assignment (Nils) bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Cc: stable@vger.kernel.org Acked-by: Christian König Reviewed-by: Rex Zhu Signed-off-by: Alex Deucher --- .../gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c index a74a3db3056c..1445c51b6d05 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c @@ -2655,6 +2655,28 @@ static int smu7_get_power_state_size(struct pp_hwmgr *hwmgr) return sizeof(struct smu7_power_state); } +static int smu7_vblank_too_short(struct pp_hwmgr *hwmgr, + uint32_t vblank_time_us) +{ + struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend); + uint32_t switch_limit_us; + + switch (hwmgr->chip_id) { + case CHIP_POLARIS10: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + switch_limit_us = data->is_memory_gddr5 ? 190 : 150; + break; + default: + switch_limit_us = data->is_memory_gddr5 ? 450 : 150; + break; + } + + if (vblank_time_us < switch_limit_us) + return true; + else + return false; +} static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, struct pp_power_state *request_ps, @@ -2669,6 +2691,7 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, bool disable_mclk_switching; bool disable_mclk_switching_for_frame_lock; struct cgs_display_info info = {0}; + struct cgs_mode_info mode_info = {0}; const struct phm_clock_and_voltage_limits *max_limits; uint32_t i; struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend); @@ -2677,6 +2700,7 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, int32_t count; int32_t stable_pstate_sclk = 0, stable_pstate_mclk = 0; + info.mode_info = &mode_info; data->battery_state = (PP_StateUILabel_Battery == request_ps->classification.ui_label); @@ -2703,8 +2727,6 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, cgs_get_active_displays_info(hwmgr->device, &info); - /*TO DO result = PHM_CheckVBlankTime(hwmgr, &vblankTooShort);*/ - minimum_clocks.engineClock = hwmgr->display_config.min_core_set_clock; minimum_clocks.memoryClock = hwmgr->display_config.min_mem_set_clock; @@ -2769,8 +2791,9 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, PHM_PlatformCaps_DisableMclkSwitchingForFrameLock); - disable_mclk_switching = (1 < info.display_count) || - disable_mclk_switching_for_frame_lock; + disable_mclk_switching = ((1 < info.display_count) || + disable_mclk_switching_for_frame_lock || + smu7_vblank_too_short(hwmgr, mode_info.vblank_time_us)); sclk = smu7_ps->performance_levels[0].engine_clock; mclk = smu7_ps->performance_levels[0].memory_clock; From 2275a3a2fe9914ba6d76c8ea490da3c08342bd19 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:57:41 -0400 Subject: [PATCH 201/254] drm/amd/powerplay/smu7: disable mclk switching for high refresh rates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if the vblank period would allow it, it still seems to be problematic on some cards. bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Cc: stable@vger.kernel.org Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c index 1445c51b6d05..102eb6d029fa 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c @@ -2793,7 +2793,8 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, disable_mclk_switching = ((1 < info.display_count) || disable_mclk_switching_for_frame_lock || - smu7_vblank_too_short(hwmgr, mode_info.vblank_time_us)); + smu7_vblank_too_short(hwmgr, mode_info.vblank_time_us) || + (mode_info.refresh_rate > 120)); sclk = smu7_ps->performance_levels[0].engine_clock; mclk = smu7_ps->performance_levels[0].memory_clock; From 3d18e33735a02b1a90aecf14410bf3edbfd4d3dc Mon Sep 17 00:00:00 2001 From: Lyude Date: Thu, 11 May 2017 19:31:12 -0400 Subject: [PATCH 202/254] drm/radeon: Unbreak HPD handling for r600+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We end up reading the interrupt register for HPD5, and then writing it to HPD6 which on systems without anything using HPD5 results in permanently disabling hotplug on one of the display outputs after the first time we acknowledge a hotplug interrupt from the GPU. This code is really bad. But for now, let's just fix this. I will hopefully have a large patch series to refactor all of this soon. Reviewed-by: Christian König Signed-off-by: Lyude Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/cik.c | 4 ++-- drivers/gpu/drm/radeon/evergreen.c | 4 ++-- drivers/gpu/drm/radeon/r600.c | 2 +- drivers/gpu/drm/radeon/si.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index ccebe0f8d2e1..008c145b7f29 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -7401,7 +7401,7 @@ static inline void cik_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.cik.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -7431,7 +7431,7 @@ static inline void cik_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.cik.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index f130ec41ee4b..0bf103536404 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -4927,7 +4927,7 @@ static void evergreen_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -4958,7 +4958,7 @@ static void evergreen_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index 0a085176e79b..e06e2d8feab3 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -3988,7 +3988,7 @@ static void r600_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.r600.disp_int_cont2 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index ceee87f029d9..76d1888528e6 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -6317,7 +6317,7 @@ static inline void si_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -6348,7 +6348,7 @@ static inline void si_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } From 7c4378f4523d4af05b5941ea906e7032631eb753 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Thu, 11 May 2017 18:22:17 +0800 Subject: [PATCH 203/254] drm/amdgpu: fix NULL pointer panic of emit_gds_switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ 338.384770] BUG: unable to handle kernel NULL pointer dereference at (null) [ 338.384817] IP: [< (null)>] (null) [ 338.385505] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 338.385950] Call Trace: [ 338.385993] [] ? amdgpu_vm_flush+0x283/0x400 [amdgpu] [ 338.386025] [] ? printk+0x4d/0x4f [ 338.386074] [] amdgpu_ib_schedule+0x4a6/0x4d0 [amdgpu] [ 338.386140] [] amdgpu_job_run+0x64/0x180 [amdgpu] [ 338.386203] [] amd_sched_main+0x2e9/0x4a0 [amdgpu] [ 338.386232] [] ? prepare_to_wait_event+0x110/0x110 [ 338.386295] [] ? amd_sched_select_entity+0xe0/0xe0 [amdgpu] [ 338.386327] [] kthread+0xd3/0xf0 [ 338.386349] [] ? kthread_park+0x60/0x60 [ 338.386376] [] ret_from_fork+0x25/0x30 [ 338.386401] Code: Bad RIP value. [ 338.386420] RIP [< (null)>] (null) [ 338.386443] RSP [ 338.386458] CR2: 0000000000000000 [ 338.398508] ---[ end trace 4c66fcdc74b9a0a2 ]--- Signed-off-by: Chunming Zhou Reviewed-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1bf36c3542c1..8ecf82c5fe74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -634,7 +634,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job) mutex_unlock(&id_mgr->lock); } - if (gds_switch_needed) { + if (ring->funcs->emit_gds_switch && gds_switch_needed) { id->gds_base = job->gds_base; id->gds_size = job->gds_size; id->gws_base = job->gws_base; From 3083696a1ee68f4845f8e9a21b91e343ff25eff3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 May 2017 13:13:45 +0800 Subject: [PATCH 204/254] drm/amd/powerplay: fix a signedness bugs Smatch complains about a signedness bug here: vega10_hwmgr.c:4202 vega10_force_clock_level() warn: always true condition '(i >= 0) => (0-u32max >= 0)' Fixes: 7b52db39a4c2 ("drm/amd/powerplay: fix bug sclk/mclk level can't be set on vega10.") Signed-off-by: Dan Carpenter Reviewed-by: Eric Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c index ad30f5d3a10d..2614af2f553f 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c @@ -4186,7 +4186,7 @@ static int vega10_force_clock_level(struct pp_hwmgr *hwmgr, enum pp_clock_type type, uint32_t mask) { struct vega10_hwmgr *data = (struct vega10_hwmgr *)(hwmgr->backend); - uint32_t i; + int i; if (hwmgr->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL) return -EINVAL; From 6df765dca378bddf994cfd2044acafa501bd800f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 24 May 2017 21:38:46 +0200 Subject: [PATCH 205/254] serial: imx: ensure UCR3 and UFCR are setup correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e61c38d85b73 ("serial: imx: setup DCEDTE early and ensure DCD and RI irqs to be off") has a flaw: While UCR3 and UFCR were modified using read-modify-write before it switched to write register values independent of the previous state. That's a good idea in principle (and that's why I did it) but needs more care. This patch reinstates read-modify-write for UFCR and for UCR3 ensures that RXDMUXSEL and ADNIMP are set for post imx1. Fixes: e61c38d85b73 ("serial: imx: setup DCEDTE early and ensure DCD and RI irqs to be off") Signed-off-by: Uwe Kleine-König Acked-by: Mika Penttilä Tested-by: Mika Penttilä Acked-by: Steve Twiss Tested-by: Steve Twiss Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 33509b4beaec..bbefddd92bfe 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -2184,7 +2184,9 @@ static int serial_imx_probe(struct platform_device *pdev) * and DCD (when they are outputs) or enables the respective * irqs. So set this bit early, i.e. before requesting irqs. */ - writel(UFCR_DCEDTE, sport->port.membase + UFCR); + reg = readl(sport->port.membase + UFCR); + if (!(reg & UFCR_DCEDTE)) + writel(reg | UFCR_DCEDTE, sport->port.membase + UFCR); /* * Disable UCR3_RI and UCR3_DCD irqs. They are also not @@ -2195,7 +2197,15 @@ static int serial_imx_probe(struct platform_device *pdev) sport->port.membase + UCR3); } else { - writel(0, sport->port.membase + UFCR); + unsigned long ucr3 = UCR3_DSR; + + reg = readl(sport->port.membase + UFCR); + if (reg & UFCR_DCEDTE) + writel(reg & ~UFCR_DCEDTE, sport->port.membase + UFCR); + + if (!is_imx1_uart(sport)) + ucr3 |= IMX21_UCR3_RXDMUXSEL | UCR3_ADNIMP; + writel(ucr3, sport->port.membase + UCR3); } clk_disable_unprepare(sport->clk_ipg); From 59fe2cc8b1c32dd139da80fc8222b5b3290b7a09 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 19 May 2017 13:59:20 +0200 Subject: [PATCH 206/254] serial: altera_uart: call iounmap() at driver remove The driver calls ioremap() in the probe function but doesn't call iounmap() in the remove function correspondingly. Do so now. Follow commit 5c9d6abed9e0 ("serial: altera_jtaguart: adding iounmap()") Cc: Alexey Khoroshilov Signed-off-by: Tobias Klauser Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/altera_uart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/altera_uart.c b/drivers/tty/serial/altera_uart.c index 46d3438a0d27..3e4b717670d7 100644 --- a/drivers/tty/serial/altera_uart.c +++ b/drivers/tty/serial/altera_uart.c @@ -615,6 +615,7 @@ static int altera_uart_remove(struct platform_device *pdev) if (port) { uart_remove_one_port(&altera_uart_driver, port); port->mapbase = 0; + iounmap(port->membase); } return 0; From 415ba3c157b9de7edc59468d19d48c1d15972a75 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Sat, 20 May 2017 06:56:35 +1000 Subject: [PATCH 207/254] powerpc/powernv/npu-dma.c: Fix opal_npu_destroy_context() call opal_npu_destroy_context() should be called with the NPU PHB, not the PCIe PHB. Fixes: 1ab66d1fbada ("powerpc/powernv: Introduce address translation services for Nvlink2") Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 067defeea691..78fa9395b8c5 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -714,7 +714,7 @@ static void pnv_npu2_release_context(struct kref *kref) void pnv_npu2_destroy_context(struct npu_context *npu_context, struct pci_dev *gpdev) { - struct pnv_phb *nphb, *phb; + struct pnv_phb *nphb; struct npu *npu; struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); struct device_node *nvlink_dn; @@ -728,13 +728,12 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context, nphb = pci_bus_to_host(npdev->bus)->private_data; npu = &nphb->npu; - phb = pci_bus_to_host(gpdev->bus)->private_data; nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", &nvlink_index))) return; npu_context->npdev[npu->index][nvlink_index] = NULL; - opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id, + opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id, PCI_DEVID(gpdev->bus->number, gpdev->devfn)); kref_put(&npu_context->kref, pnv_npu2_release_context); } From d957fb4d173647640a2b83e7c7e56a580e7fc7e7 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 24 May 2017 17:03:26 +1000 Subject: [PATCH 208/254] powerpc: Fix booting P9 hash with CONFIG_PPC_RADIX_MMU=N Currently if you disable CONFIG_PPC_RADIX_MMU you'll crash on boot on a P9. This is because we still set MMU_FTR_TYPE_RADIX via ibm,pa-features and MMU_FTR_TYPE_RADIX is what's used for code patching in much of the asm code (ie. slb_miss_realmode) This patch fixes the problem by stopping MMU_FTR_TYPE_RADIX from being set from ibm.pa-features. We may eventually end up removing the CONFIG_PPC_RADIX_MMU option completely but until then this fixes the issue. Fixes: 17a3dd2f5fc7 ("powerpc/mm/radix: Use firmware feature to enable Radix MMU") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 40c4887c27b6..f83056297441 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -161,7 +161,9 @@ static struct ibm_pa_feature { { .pabyte = 0, .pabit = 3, .cpu_features = CPU_FTR_CTRL }, { .pabyte = 0, .pabit = 6, .cpu_features = CPU_FTR_NOEXECUTE }, { .pabyte = 1, .pabit = 2, .mmu_features = MMU_FTR_CI_LARGE_PAGE }, +#ifdef CONFIG_PPC_RADIX_MMU { .pabyte = 40, .pabit = 0, .mmu_features = MMU_FTR_TYPE_RADIX }, +#endif { .pabyte = 1, .pabit = 1, .invert = 1, .cpu_features = CPU_FTR_NODSISRALIGN }, { .pabyte = 5, .pabit = 0, .cpu_features = CPU_FTR_REAL_LE, .cpu_user_ftrs = PPC_FEATURE_TRUE_LE }, From d75e4919cc0b6fbcbc8d6654ef66d87a9dbf1526 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 24 May 2017 16:49:59 +1000 Subject: [PATCH 209/254] powerpc/spufs: Fix hash faults for kernel regions Commit ac29c64089b7 ("powerpc/mm: Replace _PAGE_USER with _PAGE_PRIVILEGED") swapped _PAGE_USER for _PAGE_PRIVILEGED, and introduced check_pte_access() which denied kernel access to non-_PAGE_PRIVILEGED pages. However, it didn't add _PAGE_PRIVILEGED to the hash fault handler for spufs' kernel accesses, so the DMAs required to establish SPE memory no longer work. This change adds _PAGE_PRIVILEGED to the hash fault handler for kernel accesses. Fixes: ac29c64089b7 ("powerpc/mm: Replace _PAGE_USER with _PAGE_PRIVILEGED") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Jeremy Kerr Reported-by: Sombat Tragolgosol Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/cell/spu_base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 96c2b8a40630..0c45cdbac4cf 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -197,7 +197,9 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr) (REGION_ID(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); - ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr); + ret = hash_page(ea, + _PAGE_PRESENT | _PAGE_READ | _PAGE_PRIVILEGED, + 0x300, dsisr); spin_lock(&spu->register_lock); if (!ret) { From a4700a26107241cc7b9ac8528b2c6714ff99983d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 20 May 2017 14:29:49 +1000 Subject: [PATCH 210/254] powerpc: Add PPC_FEATURE userspace bits for SCV and DARN instructions Providing "scv" support to userspace requires kernel support, so it must be advertised as independently to the base ISA 3 instruction set. The darn instruction relies on firmware enablement, so it has been decided to split this out from the core ISA 3 feature as well. Signed-off-by: Nicholas Piggin Acked-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/include/uapi/asm/cputable.h | 2 ++ arch/powerpc/kernel/cputable.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h index 3e7ce86d5c13..4d877144f377 100644 --- a/arch/powerpc/include/uapi/asm/cputable.h +++ b/arch/powerpc/include/uapi/asm/cputable.h @@ -46,6 +46,8 @@ #define PPC_FEATURE2_HTM_NOSC 0x01000000 #define PPC_FEATURE2_ARCH_3_00 0x00800000 /* ISA 3.00 */ #define PPC_FEATURE2_HAS_IEEE128 0x00400000 /* VSX IEEE Binary Float 128-bit */ +#define PPC_FEATURE2_DARN 0x00200000 /* darn random number insn */ +#define PPC_FEATURE2_SCV 0x00100000 /* scv syscall */ /* * IMPORTANT! diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 9b3e88b1a9c8..6f849832a669 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -124,7 +124,8 @@ extern void __restore_cpu_e6500(void); #define COMMON_USER_POWER9 COMMON_USER_POWER8 #define COMMON_USER2_POWER9 (COMMON_USER2_POWER8 | \ PPC_FEATURE2_ARCH_3_00 | \ - PPC_FEATURE2_HAS_IEEE128) + PPC_FEATURE2_HAS_IEEE128 | \ + PPC_FEATURE2_DARN ) #ifdef CONFIG_PPC_BOOK3E_64 #define COMMON_USER_BOOKE (COMMON_USER_PPC64 | PPC_FEATURE_BOOKE) From a4d768e702de224cc85e0c8eac9311763403b368 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 22 May 2017 19:54:10 -0700 Subject: [PATCH 211/254] xfs: fix unaligned access in xfs_btree_visit_blocks This structure copy was throwing unaligned access warnings on sparc64: Kernel unaligned access at TPC[1043c088] xfs_btree_visit_blocks+0x88/0xe0 [xfs] xfs_btree_copy_ptrs does a memcpy, which avoids it. Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5392674bf893..3a673ba201aa 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4395,7 +4395,7 @@ xfs_btree_visit_blocks( xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ - lptr = *ptr; + xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); } /* for each buffer in the level */ From 8affebe16d79ebefb1d9d6d56a46dc89716f9453 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 23 May 2017 08:30:46 -0700 Subject: [PATCH 212/254] xfs: fix off-by-one on max nr_pages in xfs_find_get_desired_pgoff() xfs_find_get_desired_pgoff() is used to search for offset of hole or data in page range [index, end] (both inclusive), and the max number of pages to search should be at least one, if end == index. Otherwise the only page is missed and no hole or data is found, which is not correct. When block size is smaller than page size, this can be demonstrated by preallocating a file with size smaller than page size and writing data to the last block. E.g. run this xfs_io command on a 1k block size XFS on x86_64 host. # xfs_io -fc "falloc 0 3k" -c "pwrite 2k 1k" \ -c "seek -d 0" /mnt/xfs/testfile wrote 1024/1024 bytes at offset 2048 1 KiB, 1 ops; 0.0000 sec (33.675 MiB/sec and 34482.7586 ops/sec) Whence Result DATA EOF Data at offset 2k was missed, and lseek(2) returned ENXIO. This is uncovered by generic/285 subtest 07 and 08 on ppc64 host, where pagesize is 64k. Because a recent change to generic/285 reduced the preallocated file size to smaller than 64k. Cc: stable@vger.kernel.org # v3.7+ Signed-off-by: Eryu Guan Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 35703a801372..aefa2134a8cb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1049,7 +1049,7 @@ xfs_find_get_desired_pgoff( unsigned nr_pages; unsigned int i; - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); /* From 5375023ae1266553a7baa0845e82917d8803f48c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:22 -0700 Subject: [PATCH 213/254] xfs: Fix missed holes in SEEK_HOLE implementation XFS SEEK_HOLE implementation could miss a hole in an unwritten extent as can be seen by the following command: xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "pwrite 128k 8k" -c "seek -h 0" file wrote 57344/57344 bytes at offset 0 56 KiB, 14 ops; 0.0000 sec (49.312 MiB/sec and 12623.9856 ops/sec) wrote 8192/8192 bytes at offset 131072 8 KiB, 2 ops; 0.0000 sec (70.383 MiB/sec and 18018.0180 ops/sec) Whence Result HOLE 139264 Where we can see that hole at offset 56k was just ignored by SEEK_HOLE implementation. The bug is in xfs_find_get_desired_pgoff() which does not properly detect the case when pages are not contiguous. Fix the problem by properly detecting when found page has larger offset than expected. CC: stable@vger.kernel.org Fixes: d126d43f631f996daeee5006714fed914be32368 Signed-off-by: Jan Kara Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aefa2134a8cb..f1517e9928c7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1076,17 +1076,6 @@ xfs_find_get_desired_pgoff( break; } - /* - * At lease we found one page. If this is the first time we - * step into the loop, and if the first page index offset is - * greater than the given search offset, a hole was found. - */ - if (type == HOLE_OFF && lastoff == startoff && - lastoff < page_offset(pvec.pages[0])) { - found = true; - break; - } - for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; loff_t b_offset; @@ -1098,18 +1087,18 @@ xfs_find_get_desired_pgoff( * file mapping. However, page->index will not change * because we have a reference on the page. * - * Searching done if the page index is out of range. - * If the current offset is not reaches the end of - * the specified search range, there should be a hole - * between them. + * If current page offset is beyond where we've ended, + * we've found a hole. */ - if (page->index > end) { - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } + if (type == HOLE_OFF && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { + found = true; + *offset = lastoff; goto out; } + /* Searching done if the page index is out of range. */ + if (page->index > end) + goto out; lock_page(page); /* From d7fd24257aa60316bf81093f7f909dc9475ae974 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:23 -0700 Subject: [PATCH 214/254] xfs: Fix off-by-in in loop termination in xfs_find_get_desired_pgoff() There is an off-by-one error in loop termination conditions in xfs_find_get_desired_pgoff() since 'end' may index a page beyond end of desired range if 'endoff' is page aligned. It doesn't have any visible effects but still it is good to fix it. Signed-off-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f1517e9928c7..dc0e4cb7029b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1043,7 +1043,7 @@ xfs_find_get_desired_pgoff( index = startoff >> PAGE_SHIFT; endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; do { int want; unsigned nr_pages; From a54fba8f5a0dc36161cacdf2aa90f007f702ec1a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:24 -0700 Subject: [PATCH 215/254] xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff() Currently several places in xfs_find_get_desired_pgoff() handle the case of a missing page. Make them all handled in one place after the loop has terminated. Signed-off-by: Jan Kara Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 38 ++++++++------------------------------ 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index dc0e4cb7029b..5fb5a0958a14 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1052,29 +1052,8 @@ xfs_find_get_desired_pgoff( want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); - /* - * No page mapped into given range. If we are searching holes - * and if this is the first time we got into the loop, it means - * that the given offset is landed in a hole, return it. - * - * If we have already stepped through some block buffers to find - * holes but they all contains data. In this case, the last - * offset is already updated and pointed to the end of the last - * mapped page, if it does not reach the endpoint to search, - * that means there should be a hole between them. - */ - if (nr_pages == 0) { - /* Data search found nothing */ - if (type == DATA_OFF) - break; - - ASSERT(type == HOLE_OFF); - if (lastoff == startoff || lastoff < endoff) { - found = true; - *offset = lastoff; - } + if (nr_pages == 0) break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1140,21 +1119,20 @@ xfs_find_get_desired_pgoff( /* * The number of returned pages less than our desired, search - * done. In this case, nothing was found for searching data, - * but we found a hole behind the last offset. + * done. */ - if (nr_pages < want) { - if (type == HOLE_OFF) { - *offset = lastoff; - found = true; - } + if (nr_pages < want) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* No page at lastoff and we are not done - we found a hole. */ + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } out: pagevec_release(&pvec); return found; From 11387fe4a98f75d1f4cdb3efe3b42b19205c9df5 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Tue, 23 May 2017 18:37:27 -0400 Subject: [PATCH 216/254] geneve: fix fill_info when using collect_metadata Since 9b4437a5b870 ("geneve: Unify LWT and netdev handling.") fill_info does not return UDP_ZERO_CSUM6_RX when using COLLECT_METADATA. This is because it uses ip_tunnel_info_af() with the device level info, which is not valid for COLLECT_METADATA. Fix by checking for the presence of the actual sockets. Fixes: 9b4437a5b870 ("geneve: Unify LWT and netdev handling.") Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/geneve.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index dec5d563ab19..959fd12d2e67 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1293,7 +1293,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u32(skb, IFLA_GENEVE_ID, vni)) goto nla_put_failure; - if (ip_tunnel_info_af(info) == AF_INET) { + if (rtnl_dereference(geneve->sock4)) { if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, info->key.u.ipv4.dst)) goto nla_put_failure; @@ -1302,8 +1302,10 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(info->key.tun_flags & TUNNEL_CSUM))) goto nla_put_failure; + } + #if IS_ENABLED(CONFIG_IPV6) - } else { + if (rtnl_dereference(geneve->sock6)) { if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6, &info->key.u.ipv6.dst)) goto nla_put_failure; @@ -1315,8 +1317,8 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, !geneve->use_udp6_rx_checksums)) goto nla_put_failure; -#endif } +#endif if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) || nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) || From b62ce397675502325d4282924bf70cfb6a005c3a Mon Sep 17 00:00:00 2001 From: Rex Zhu Date: Mon, 22 May 2017 13:11:41 +0800 Subject: [PATCH 217/254] drm/amdgpu: fix null point error when rmmod amdgpu. this bug happened when amdgpu load failed. [ 75.740951] BUG: unable to handle kernel paging request at 00000000000031c0 [ 75.748167] IP: [] amdgpu_fbdev_restore_mode+0x20/0x60 [amdgpu] [ 75.755774] PGD 0 [ 75.759185] Oops: 0000 [#1] SMP [ 75.762408] Modules linked in: amdgpu(OE-) ttm(OE) drm_kms_helper(OE) drm(OE) i2c_algo_bit(E) fb_sys_fops(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) rpcsec_gss_krb5(E) nfsv4(E) nfs(E) fscache(E) eeepc_wmi(E) asus_wmi(E) sparse_keymap(E) intel_rapl(E) snd_hda_codec_hdmi(E) snd_hda_codec_realtek(E) snd_hda_codec_generic(E) snd_hda_intel(E) snd_hda_codec(E) snd_hda_core(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) snd_hwdep(E) snd_pcm(E) snd_seq_midi(E) coretemp(E) kvm_intel(E) snd_seq_midi_event(E) snd_rawmidi(E) kvm(E) snd_seq(E) joydev(E) snd_seq_device(E) snd_timer(E) irqbypass(E) crct10dif_pclmul(E) crc32_pclmul(E) mei_me(E) ghash_clmulni_intel(E) snd(E) aesni_intel(E) mei(E) soundcore(E) aes_x86_64(E) shpchp(E) serio_raw(E) lrw(E) acpi_pad(E) gf128mul(E) glue_helper(E) ablk_helper(E) mac_hid(E) [ 75.835574] cryptd(E) parport_pc(E) ppdev(E) lp(E) nfsd(E) parport(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) sunrpc(E) autofs4(E) hid_generic(E) usbhid(E) mxm_wmi(E) psmouse(E) e1000e(E) ptp(E) pps_core(E) ahci(E) libahci(E) wmi(E) video(E) i2c_hid(E) hid(E) [ 75.858489] CPU: 5 PID: 1603 Comm: rmmod Tainted: G OE 4.9.0-custom #2 [ 75.866183] Hardware name: System manufacturer System Product Name/Z170-A, BIOS 0901 08/31/2015 [ 75.875050] task: ffff88045d1bbb80 task.stack: ffffc90002de4000 [ 75.881094] RIP: 0010:[] [] amdgpu_fbdev_restore_mode+0x20/0x60 [amdgpu] [ 75.891238] RSP: 0018:ffffc90002de7d48 EFLAGS: 00010286 [ 75.896648] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001 [ 75.903933] RDX: 0000000000000000 RSI: ffff88045d1bbb80 RDI: 0000000000000286 [ 75.911183] RBP: ffffc90002de7d50 R08: 0000000000000502 R09: 0000000000000004 [ 75.918449] R10: 0000000000000000 R11: 0000000000000001 R12: ffff880464bf0000 [ 75.925675] R13: ffffffffa0853000 R14: 0000000000000000 R15: 0000564e44f88210 [ 75.932980] FS: 00007f13d5400700(0000) GS:ffff880476540000(0000) knlGS:0000000000000000 [ 75.941238] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 75.947088] CR2: 00000000000031c0 CR3: 000000045fd0b000 CR4: 00000000003406e0 [ 75.954332] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 75.961566] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 75.968834] Stack: [ 75.970881] ffff880464bf0000 ffffc90002de7d60 ffffffffa0636592 ffffc90002de7d80 [ 75.978454] ffffffffa059015f ffff880464bf0000 ffff880464bf0000 ffffc90002de7da8 [ 75.986076] ffffffffa0595216 ffff880464bf0000 ffff880460f4d000 ffffffffa0853000 [ 75.993692] Call Trace: [ 75.996177] [] amdgpu_driver_lastclose_kms+0x12/0x20 [amdgpu] [ 76.003700] [] drm_lastclose+0x2f/0xd0 [drm] [ 76.009777] [] drm_dev_unregister+0x16/0xd0 [drm] [ 76.016255] [] drm_put_dev+0x34/0x70 [drm] [ 76.022139] [] amdgpu_pci_remove+0x15/0x20 [amdgpu] [ 76.028800] [] pci_device_remove+0x39/0xc0 [ 76.034661] [] __device_release_driver+0x9a/0x140 [ 76.041121] [] driver_detach+0xb8/0xc0 [ 76.046575] [] bus_remove_driver+0x55/0xd0 [ 76.052401] [] driver_unregister+0x2c/0x50 [ 76.058244] [] pci_unregister_driver+0x29/0x90 [ 76.064466] [] drm_pci_exit+0x9e/0xb0 [drm] [ 76.070507] [] amdgpu_exit+0x1c/0x32 [amdgpu] [ 76.076609] [] SyS_delete_module+0x1a0/0x200 [ 76.082627] [] ? rcu_eqs_enter.isra.36+0x4a/0x50 [ 76.089001] [] do_syscall_64+0x6e/0x180 [ 76.094583] [] entry_SYSCALL64_slow_path+0x25/0x25 [ 76.101114] Code: 94 c0 c3 31 c0 5d c3 0f 1f 40 00 0f 1f 44 00 00 55 31 c0 48 89 e5 53 48 89 fb 48 c7 c7 1d 21 84 a0 e8 ab 77 b3 e0 e8 fc 8b d7 e0 <48> 8b bb c0 31 00 00 48 85 ff 74 09 e8 ff eb fc ff 85 c0 75 03 [ 76.121432] RIP [] amdgpu_fbdev_restore_mode+0x20/0x60 [amdgpu] Signed-off-by: Rex Zhu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c index 236d9950221b..c0d8c6ff6380 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c @@ -425,10 +425,15 @@ bool amdgpu_fbdev_robj_is_fb(struct amdgpu_device *adev, struct amdgpu_bo *robj) void amdgpu_fbdev_restore_mode(struct amdgpu_device *adev) { - struct amdgpu_fbdev *afbdev = adev->mode_info.rfbdev; + struct amdgpu_fbdev *afbdev; struct drm_fb_helper *fb_helper; int ret; + if (!adev) + return; + + afbdev = adev->mode_info.rfbdev; + if (!afbdev) return; From 65d786c21bf8140dac83563306f46fe0b13a9aaa Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 23 May 2017 18:18:37 -0500 Subject: [PATCH 218/254] net: fix potential null pointer dereference Add null check to avoid a potential null pointer dereference. Addresses-Coverity-ID: 1408831 Signed-off-by: Gustavo A. R. Silva Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- drivers/net/gtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 4fea1b3dfbb4..7b652bb7ebe4 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -873,7 +873,7 @@ static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[]) /* Check if there's an existing gtpX device to configure */ dev = dev_get_by_index_rcu(net, nla_get_u32(nla[GTPA_LINK])); - if (dev->netdev_ops == >p_netdev_ops) + if (dev && dev->netdev_ops == >p_netdev_ops) gtp = netdev_priv(dev); put_net(net); From 4b3c7dbbfff0673e8a89575414b864d8b001d3bb Mon Sep 17 00:00:00 2001 From: KT Liao Date: Thu, 25 May 2017 10:06:21 -0700 Subject: [PATCH 219/254] Input: elan_i2c - clear INT before resetting controller Some old touchpad FWs need to have interrupt cleared before issuing reset command after updating firmware. We clear interrupt by attempting to read full report from the controller, and discarding any data read. Signed-off-by: KT Liao Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_i2c.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/input/mouse/elan_i2c_i2c.c b/drivers/input/mouse/elan_i2c_i2c.c index a679e56c44cd..765879dcaf85 100644 --- a/drivers/input/mouse/elan_i2c_i2c.c +++ b/drivers/input/mouse/elan_i2c_i2c.c @@ -557,7 +557,14 @@ static int elan_i2c_finish_fw_update(struct i2c_client *client, long ret; int error; int len; - u8 buffer[ETP_I2C_INF_LENGTH]; + u8 buffer[ETP_I2C_REPORT_LEN]; + + len = i2c_master_recv(client, buffer, ETP_I2C_REPORT_LEN); + if (len != ETP_I2C_REPORT_LEN) { + error = len < 0 ? len : -EIO; + dev_warn(dev, "failed to read I2C data after FW WDT reset: %d (%d)\n", + error, len); + } reinit_completion(completion); enable_irq(client->irq); From a04f144059ac09f2c3da50b5707df589044aad66 Mon Sep 17 00:00:00 2001 From: KT Liao Date: Tue, 23 May 2017 13:41:47 -0700 Subject: [PATCH 220/254] Input: elan_i2c - ignore signals when finishing updating firmware Use wait_for_completion_timeout() instead of wait_for_completion_interruptible_timeout() to avoid stray signals ruining firmware update. Our timeout is only 300 msec so we are fine simply letting it expire in case device misbehaves. Signed-off-by: KT Liao Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_i2c.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/input/mouse/elan_i2c_i2c.c b/drivers/input/mouse/elan_i2c_i2c.c index 765879dcaf85..f431da07f861 100644 --- a/drivers/input/mouse/elan_i2c_i2c.c +++ b/drivers/input/mouse/elan_i2c_i2c.c @@ -554,7 +554,6 @@ static int elan_i2c_finish_fw_update(struct i2c_client *client, struct completion *completion) { struct device *dev = &client->dev; - long ret; int error; int len; u8 buffer[ETP_I2C_REPORT_LEN]; @@ -570,23 +569,19 @@ static int elan_i2c_finish_fw_update(struct i2c_client *client, enable_irq(client->irq); error = elan_i2c_write_cmd(client, ETP_I2C_STAND_CMD, ETP_I2C_RESET); - if (!error) - ret = wait_for_completion_interruptible_timeout(completion, - msecs_to_jiffies(300)); - disable_irq(client->irq); - if (error) { dev_err(dev, "device reset failed: %d\n", error); - return error; - } else if (ret == 0) { + } else if (!wait_for_completion_timeout(completion, + msecs_to_jiffies(300))) { dev_err(dev, "timeout waiting for device reset\n"); - return -ETIMEDOUT; - } else if (ret < 0) { - error = ret; - dev_err(dev, "error waiting for device reset: %d\n", error); - return error; + error = -ETIMEDOUT; } + disable_irq(client->irq); + + if (error) + return error; + len = i2c_master_recv(client, buffer, ETP_I2C_INF_LENGTH); if (len != ETP_I2C_INF_LENGTH) { error = len < 0 ? len : -EIO; From 7c3f1875c66fbc19762760097cabc91849ea0bbb Mon Sep 17 00:00:00 2001 From: Roman Kapl Date: Wed, 24 May 2017 10:22:22 +0200 Subject: [PATCH 221/254] net: move somaxconn init from sysctl code The default value for somaxconn is set in sysctl_core_net_init(), but this function is not called when kernel is configured without CONFIG_SYSCTL. This results in the kernel not being able to accept TCP connections, because the backlog has zero size. Usually, the user ends up with: "TCP: request_sock_TCP: Possible SYN flooding on port 7. Dropping request. Check SNMP counters." If SYN cookies are not enabled the connection is rejected. Before ef547f2ac16 (tcp: remove max_qlen_log), the effects were less severe, because the backlog was always at least eight slots long. Signed-off-by: Roman Kapl Signed-off-by: David S. Miller --- net/core/net_namespace.c | 19 +++++++++++++++++++ net/core/sysctl_net_core.c | 2 -- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1934efd4a9d4..26bbfababff2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -315,6 +315,25 @@ out_undo: goto out; } +static int __net_init net_defaults_init_net(struct net *net) +{ + net->core.sysctl_somaxconn = SOMAXCONN; + return 0; +} + +static struct pernet_operations net_defaults_ops = { + .init = net_defaults_init_net, +}; + +static __init int net_defaults_init(void) +{ + if (register_pernet_subsys(&net_defaults_ops)) + panic("Cannot initialize net default settings"); + + return 0; +} + +core_initcall(net_defaults_init); #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index ea23254b2457..b7cd9aafe99e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -479,8 +479,6 @@ static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl; - net->core.sysctl_somaxconn = SOMAXCONN; - tbl = netns_core_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); From ba615f675281d76fd19aa03558777f81fb6b6084 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 24 May 2017 09:59:31 -0700 Subject: [PATCH 222/254] tcp: avoid fastopen API to be used on AF_UNSPEC Fastopen API should be used to perform fastopen operations on the TCP socket. It does not make sense to use fastopen API to perform disconnect by calling it with AF_UNSPEC. The fastopen data path is also prone to race conditions and bugs when using with AF_UNSPEC. One issue reported and analyzed by Vegard Nossum is as follows: +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Thread A: Thread B: ------------------------------------------------------------------------ sendto() - tcp_sendmsg() - sk_stream_memory_free() = 0 - goto wait_for_sndbuf - sk_stream_wait_memory() - sk_wait_event() // sleep | sendto(flags=MSG_FASTOPEN, dest_addr=AF_UNSPEC) | - tcp_sendmsg() | - tcp_sendmsg_fastopen() | - __inet_stream_connect() | - tcp_disconnect() //because of AF_UNSPEC | - tcp_transmit_skb()// send RST | - return 0; // no reconnect! | - sk_stream_wait_connect() | - sock_error() | - xchg(&sk->sk_err, 0) | - return -ECONNRESET - ... // wake up, see sk->sk_err == 0 - skb_entail() on TCP_CLOSE socket If the connection is reopened then we will send a brand new SYN packet after thread A has already queued a buffer. At this point I think the socket internal state (sequence numbers etc.) becomes messed up. When the new connection is closed, the FIN-ACK is rejected because the sequence number is outside the window. The other side tries to retransmit, but __tcp_retransmit_skb() calls tcp_trim_head() on an empty skb which corrupts the skb data length and hits a BUG() in copy_and_csum_bits(). +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Hence, this patch adds a check for AF_UNSPEC in the fastopen data path and return EOPNOTSUPP to user if such case happens. Fixes: cf60af03ca4e7 ("tcp: Fast Open client - sendmsg(MSG_FASTOPEN)") Reported-by: Vegard Nossum Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 842b575f8fdd..59792d283ff8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1084,9 +1084,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && + uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ @@ -1108,7 +1111,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst From 5990baaa6d7b437dfcf58b7021ca56b1d6b35869 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Wed, 24 May 2017 15:19:35 -0700 Subject: [PATCH 223/254] arp: fixed -Wuninitialized compiler warning Commit 7d472a59c0e5ec117220a05de6b370447fb6cb66 ("arp: always override existing neigh entries with gratuitous ARP") introduced a compiler warning: net/ipv4/arp.c:880:35: warning: 'addr_type' may be used uninitialized in this function [-Wmaybe-uninitialized] While the code logic seems to be correct and doesn't allow the variable to be used uninitialized, and the warning is not consistently reproducible, it's still worth fixing it for other people not to waste time looking at the warning in case it pops up in the build environment. Yes, compiler is probably at fault, but we will need to accommodate. Fixes: 7d472a59c0e5 ("arp: always override existing neigh entries with gratuitous ARP") Signed-off-by: Ihar Hrachyshka Signed-off-by: David S. Miller --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ae96e6f3e0cb..e9f3386a528b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -863,8 +863,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + addr_type = -1; if (n || IN_DEV_ARP_ACCEPT(in_dev)) { - addr_type = -1; is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } From 1ad2f5838d345e1c102bd1cd27c4f4c1349b0dc8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:05 +0200 Subject: [PATCH 224/254] bpf: fix incorrect pruning decision when alignment must be tracked Currently, when we enforce alignment tracking on direct packet access, the verifier lets the following program pass despite doing a packet write with unaligned access: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (bf) r0 = r2 4: (07) r0 += 14 5: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 6: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 7: (63) *(u32 *)(r0 -4) = r0 8: (b7) r0 = 0 9: (95) exit from 6 to 8: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 8: (b7) r0 = 0 9: (95) exit from 5 to 10: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R10=fp 10: (07) r0 += 1 11: (05) goto pc-6 6: safe <----- here, wrongly found safe processed 15 insns However, if we enforce a pruning mismatch by adding state into r8 which is then being mismatched in states_equal(), we find that for the otherwise same program, the verifier detects a misaligned packet access when actually walking that path: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (b7) r8 = 1 4: (bf) r0 = r2 5: (07) r0 += 14 6: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 9: (b7) r0 = 0 10: (95) exit from 7 to 9: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 9: (b7) r0 = 0 10: (95) exit from 6 to 11: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 11: (07) r0 += 1 12: (b7) r8 = 0 13: (05) goto pc-7 <----- mismatch due to r8 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=15,r=15) R1=ctx R2=pkt(id=0,off=0,r=15) R3=pkt_end R7=inv,min_value=2 R8=imm0,min_value=0,max_value=0,min_align=2147483648 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 misaligned packet access off 2+15+-4 size 4 The reason why we fail to see it in states_equal() is that the third test in compare_ptrs_to_packet() ... if (old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; ... will let the above pass. The situation we run into is that old->off <= cur->off (14 <= 15), meaning that prior walked paths went with smaller offset, which was later used in the packet access after successful packet range check and found to be safe already. For example: Given is R0=pkt(id=0,off=0,r=0). Adding offset 14 as in above program to it, results in R0=pkt(id=0,off=14,r=0) before the packet range test. Now, testing this against R3=pkt_end with 'if r0 > r3 goto out' will transform R0 into R0=pkt(id=0,off=14,r=14) for the case when we're within bounds. A write into the packet at offset *(u32 *)(r0 -4), that is, 2 + 14 -4, is valid and aligned (2 is for NET_IP_ALIGN). After processing this with all fall-through paths, we later on check paths from branches. When the above skb->mark test is true, then we jump near the end of the program, perform r0 += 1, and jump back to the 'if r0 > r3 goto out' test we've visited earlier already. This time, R0 is of type R0=pkt(id=0,off=15,r=0), and we'll prune that part because this time we'll have a larger safe packet range, and we already found that with off=14 all further insn were already safe, so it's safe as well with a larger off. However, the problem is that the subsequent write into the packet with 2 + 15 -4 is then unaligned, and not caught by the alignment tracking. Note that min_align, aux_off, and aux_off_align were all 0 in this example. Since we cannot tell at this time what kind of packet access was performed in the prior walk and what minimal requirements it has (we might do so in the future, but that requires more complexity), fix it to disable this pruning case for strict alignment for now, and let the verifier do check such paths instead. With that applied, the test cases pass and reject the program due to misalignment. Fixes: d1174416747d ("bpf: Track alignment of register values in the verifier.") Reference: http://patchwork.ozlabs.org/patch/761909/ Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c72cd41f5b8b..e37e06b1229d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -843,9 +843,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, { bool strict = env->strict_alignment; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - strict = true; - switch (reg->type) { case PTR_TO_PACKET: return check_pkt_ptr_alignment(reg, off, size, strict); @@ -2696,7 +2693,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct bpf_reg_state *old, +static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, + struct bpf_reg_state *old, struct bpf_reg_state *cur) { if (old->id != cur->id) @@ -2739,7 +2737,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * 'if (R4 > data_end)' and all further insn were already good with r=20, * so they will be good with r=30 and we can prune the search. */ - if (old->off <= cur->off && + if (!env->strict_alignment && old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; @@ -2810,7 +2808,7 @@ static bool states_equal(struct bpf_verifier_env *env, continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(rold, rcur)) + compare_ptrs_to_packet(env, rold, rcur)) continue; return false; @@ -3588,10 +3586,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) } else { log_level = 0; } - if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) + + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - else - env->strict_alignment = false; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) @@ -3697,7 +3695,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, mutex_lock(&bpf_verifier_lock); log_level = 0; + env->strict_alignment = false; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), From a9789ef9afcb4fb0193f8dd94f2665ba3ad71e79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:06 +0200 Subject: [PATCH 225/254] bpf: properly reset caller saved regs after helper call and ld_abs/ind Currently, after performing helper calls, we clear all caller saved registers, that is r0 - r5 and fill r0 depending on struct bpf_func_proto specification. The way we reset these regs can affect pruning decisions in later paths, since we only reset register's imm to 0 and type to NOT_INIT. However, we leave out clearing of other variables such as id, min_value, max_value, etc, which can later on lead to pruning mismatches due to stale data. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e37e06b1229d..339c8a1371de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -463,19 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + + memset(®s[regno], 0, sizeof(regs[regno])); + regs[regno].type = NOT_INIT; + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) { - regs[i].type = NOT_INIT; - regs[i].imm = 0; - regs[i].min_value = BPF_REGISTER_MIN_RANGE; - regs[i].max_value = BPF_REGISTER_MAX_RANGE; - regs[i].min_align = 0; - regs[i].aux_off = 0; - regs[i].aux_off_align = 0; - } + for (i = 0; i < MAX_BPF_REG; i++) + mark_reg_not_init(regs, i); /* frame pointer */ regs[BPF_REG_FP].type = FRAME_PTR; @@ -1346,7 +1349,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs = state->regs; - struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1413,11 +1415,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* update return register */ if (fn->ret_type == RET_INTEGER) { @@ -2445,7 +2444,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -2478,11 +2476,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* mark destination R0 register as readable, since it contains * the value fetched from the packet From 41703a731066fde79c3e5ccf3391cf77a98aeda5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:07 +0200 Subject: [PATCH 226/254] bpf: add bpf_clone_redirect to bpf_helper_changes_pkt_data The bpf_clone_redirect() still needs to be listed in bpf_helper_changes_pkt_data() since we call into bpf_try_make_head_writable() from there, thus we need to invalidate prior pkt regs as well. Fixes: 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index a253a6197e6b..a6bb95fa87b2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2281,6 +2281,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || + func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head) From a316338cb71a3260201490e615f2f6d5c0d8fb2c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:08 +0200 Subject: [PATCH 227/254] bpf: fix wrong exposure of map_flags into fdinfo for lpm trie_alloc() always needs to have BPF_F_NO_PREALLOC passed in via attr->map_flags, since it does not support preallocation yet. We check the flag, but we never copy the flag into trie->map.map_flags, which is later on exposed into fdinfo and used by loaders such as iproute2. Latter uses this in bpf_map_selfcheck_pinned() to test whether a pinned map has the same spec as the one from the BPF obj file and if not, bails out, which is currently the case for lpm since it exposes always 0 as flags. Also copy over flags in array_map_alloc() and stack_map_alloc(). They always have to be 0 right now, but we should make sure to not miss to copy them over at a later point in time when we add actual flags for them to use. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Reported-by: Jarno Rajahalme Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 1 + kernel/bpf/lpm_trie.c | 1 + kernel/bpf/stackmap.c | 1 + 3 files changed, 3 insertions(+) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e00b2333c26..172dc8ee0e3b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; + array->map.map_flags = attr->map_flags; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 39cfafd895b8..b09185f0f17d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.key_size = attr->key_size; trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; + trie->map.map_flags = attr->map_flags; trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4dfd6f2ec2f9..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.key_size = attr->key_size; smap->map.value_size = value_size; smap->map.max_entries = attr->max_entries; + smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; From 614d0d77b49a9b131e58b77473698ab5b2c525b7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:09 +0200 Subject: [PATCH 228/254] bpf: add various verifier test cases This patch adds various verifier test cases: 1) A test case for the pruning issue when tracking alignment is used. 2) Various PTR_TO_MAP_VALUE_OR_NULL tests to make sure pointer arithmetic turns such register into UNKNOWN_VALUE type. 3) Test cases for the special treatment of LD_ABS/LD_IND to make sure verifier doesn't break calling convention here. Latter is needed, since f.e. arm64 JIT uses r1 - r5 for storing temporary data, so they really must be marked as NOT_INIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 10 + tools/include/linux/filter.h | 10 + tools/testing/selftests/bpf/test_verifier.c | 239 +++++++++++++++++++- 3 files changed, 255 insertions(+), 4 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 56197f82af45..62d948f80730 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -272,6 +272,16 @@ struct bpf_prog_aux; .off = OFF, \ .imm = IMM }) +/* Unconditional jumps, goto pc + off16 */ + +#define BPF_JMP_A(OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = 0 }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 390d7c9685fd..4ce25d43e8e3 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -208,6 +208,16 @@ .off = OFF, \ .imm = IMM }) +/* Unconditional jumps, goto pc + off16 */ + +#define BPF_JMP_A(OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = 0 }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 3773562056da..cabb19b1e371 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -49,6 +49,7 @@ #define MAX_NR_MAPS 4 #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) +#define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) struct bpf_test { const char *descr; @@ -2614,6 +2615,30 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "direct packet access: test17 (pruning, alignment)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 14), + BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 1, 4), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, -4), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + BPF_JMP_A(-6), + }, + .errstr = "misaligned packet access off 2+15+-4 size 4", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, + }, { "helper access to packet: test1, valid packet_ptr range", .insns = { @@ -3340,6 +3365,70 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, + { + "alu ops on ptr_to_map_value_or_null, 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "alu ops on ptr_to_map_value_or_null, 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_4, -1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "alu ops on ptr_to_map_value_or_null, 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, { "invalid memory access with multiple map_lookup_elem calls", .insns = { @@ -4937,7 +5026,149 @@ static struct bpf_test tests[] = { .fixup_map_in_map = { 3 }, .errstr = "R1 type=map_value_or_null expected=map_ptr", .result = REJECT, - } + }, + { + "ld_abs: check calling conv, r1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R1 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r3", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .errstr = "R3 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r4", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_EXIT_INSN(), + }, + .errstr = "R4 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r5", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .errstr = "R5 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r7", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_7, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "ld_ind: check calling conv, r1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_LD_IND(BPF_W, BPF_REG_1, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R1 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_LD_IND(BPF_W, BPF_REG_2, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r3", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_LD_IND(BPF_W, BPF_REG_3, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .errstr = "R3 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r4", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_4, 1), + BPF_LD_IND(BPF_W, BPF_REG_4, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_EXIT_INSN(), + }, + .errstr = "R4 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r5", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_5, 1), + BPF_LD_IND(BPF_W, BPF_REG_5, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .errstr = "R5 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r7", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_7, 1), + BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) @@ -5059,9 +5290,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv, do_test_fixup(test, prog, map_fds); - fd_prog = bpf_load_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, - prog, prog_len, "GPL", 0, bpf_vlog, - sizeof(bpf_vlog)); + fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, + prog, prog_len, test->flags & F_LOAD_WITH_STRICT_ALIGNMENT, + "GPL", 0, bpf_vlog, sizeof(bpf_vlog)); expected_ret = unpriv && test->result_unpriv != UNDEF ? test->result_unpriv : test->result; From 791caeb084c57e3a4d648cf1ee799d1f70c0ef4e Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 24 May 2017 16:35:49 -0700 Subject: [PATCH 229/254] test_bpf: Add a couple of tests for BPF_JSGE. Some JITs can optimize comparisons with zero. Add a couple of BPF_JSGE tests against immediate zero. Signed-off-by: David Daney Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- lib/test_bpf.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 889bc31785be..be88cbaadde3 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4504,6 +4504,44 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + { + "JMP_JSGE_K: Signed jump: value walk 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -3), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 6), + BPF_ALU64_IMM(BPF_ADD, R1, 1), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 4), + BPF_ALU64_IMM(BPF_ADD, R1, 1), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 2), + BPF_ALU64_IMM(BPF_ADD, R1, 1), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSGE_K: Signed jump: value walk 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -3), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 4), + BPF_ALU64_IMM(BPF_ADD, R1, 2), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 2), + BPF_ALU64_IMM(BPF_ADD, R1, 2), + BPF_JMP_IMM(BPF_JSGE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_K */ { "JMP_JGT_K: if (3 > 2) return 1", From 797a93647a48d6cb8a20641a86a71713a947f786 Mon Sep 17 00:00:00 2001 From: Nithin Sujir Date: Wed, 24 May 2017 19:45:17 -0700 Subject: [PATCH 230/254] bonding: Don't update slave->link until ready to commit In the loadbalance arp monitoring scheme, when a slave link change is detected, the slave->link is immediately updated and slave_state_changed is set. Later down the function, the rtnl_lock is acquired and the changes are committed, updating the bond link state. However, the acquisition of the rtnl_lock can fail. The next time the monitor runs, since slave->link is already updated, it determines that link is unchanged. This results in the bond link state permanently out of sync with the slave link. This patch modifies bond_loadbalance_arp_mon() to handle link changes identical to bond_ab_arp_{inspect/commit}(). The new link state is maintained in slave->new_link until we're ready to commit at which point it's copied into slave->link. NOTE: miimon_{inspect/commit}() has a more complex state machine requiring the use of the bond_{propose,commit}_link_state() functions which maintains the intermediate state in slave->link_new_state. The arp monitors don't require that. Testing: This bug is very easy to reproduce with the following steps. 1. In a loop, toggle a slave link of a bond slave interface. 2. In a separate loop, do ifconfig up/down of an unrelated interface to create contention for rtnl_lock. Within a few iterations, the bond link goes out of sync with the slave link. Signed-off-by: Nithin Nayak Sujir Cc: Mahesh Bandewar Cc: Jay Vosburgh Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 73313318399c..2359478b977f 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2612,11 +2612,13 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) bond_for_each_slave_rcu(bond, slave, iter) { unsigned long trans_start = dev_trans_start(slave->dev); + slave->new_link = BOND_LINK_NOCHANGE; + if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, trans_start, 1) && bond_time_in_interval(bond, slave->last_rx, 1)) { - slave->link = BOND_LINK_UP; + slave->new_link = BOND_LINK_UP; slave_state_changed = 1; /* primary_slave has no meaning in round-robin @@ -2643,7 +2645,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) if (!bond_time_in_interval(bond, trans_start, 2) || !bond_time_in_interval(bond, slave->last_rx, 2)) { - slave->link = BOND_LINK_DOWN; + slave->new_link = BOND_LINK_DOWN; slave_state_changed = 1; if (slave->link_failure_count < UINT_MAX) @@ -2674,6 +2676,11 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) if (!rtnl_trylock()) goto re_arm; + bond_for_each_slave(bond, slave, iter) { + if (slave->new_link != BOND_LINK_NOCHANGE) + slave->link = slave->new_link; + } + if (slave_state_changed) { bond_slave_state_change(bond); if (BOND_MODE(bond) == BOND_MODE_XOR) From 9bdcfb10f221e796c9619fe48655e0f1272f1d92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 15:14:43 +0200 Subject: [PATCH 231/254] nvme-pci: consistencly use ctrl->device for logging This is what most of the code already does and gives much more useful prefixes than the device embedded in the pci_dev. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4c2ff2bb26bc..bf8bec39c017 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -263,7 +263,7 @@ static void nvme_dbbuf_set(struct nvme_dev *dev) c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr); if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) { - dev_warn(dev->dev, "unable to set dbbuf\n"); + dev_warn(dev->ctrl.device, "unable to set dbbuf\n"); /* Free memory and continue on */ nvme_dbbuf_dma_free(dev); } @@ -1394,11 +1394,11 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, &pci_status); if (result == PCIBIOS_SUCCESSFUL) - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", csts, pci_status); else - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", csts, result); } @@ -1740,8 +1740,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) */ if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { dev->q_depth = 2; - dev_warn(dev->dev, "detected Apple NVMe controller, set " - "queue depth=%u to work around controller resets\n", + dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " + "set queue depth=%u to work around controller resets\n", dev->q_depth); } @@ -1759,7 +1759,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) if (dev->cmbsz) { if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, &dev_attr_cmb.attr, NULL)) - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "failed to add sysfs attribute for CMB\n"); } } From d3d5b87ddde09bade512526f6df90e8c06c28230 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 15:14:44 +0200 Subject: [PATCH 232/254] nvme: replace is_flags field in nvme_ctrl_ops with a flags field So that we can have more flags for transport-specific behavior. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 3 ++- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/target/loop.c | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 04e115834702..228f7c73e2f1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1605,7 +1605,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); - if (ctrl->ops->is_fabrics) { + if (ctrl->ops->flags & NVME_F_FABRICS) { ctrl->icdoff = le16_to_cpu(id->icdoff); ctrl->ioccsz = le32_to_cpu(id->ioccsz); ctrl->iorcsz = le32_to_cpu(id->iorcsz); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 14a009e43aa5..5b14cbefb724 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2647,7 +2647,7 @@ nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl) static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .name = "fc", .module = THIS_MODULE, - .is_fabrics = true, + .flags = NVME_F_FABRICS, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 29c708ca9621..7c4b0f6636c5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -208,7 +208,8 @@ struct nvme_ns { struct nvme_ctrl_ops { const char *name; struct module *module; - bool is_fabrics; + unsigned int flags; +#define NVME_F_FABRICS (1 << 0) int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e2c18f3d9dcf..28bd255c144d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1792,7 +1792,7 @@ static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl) static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .name = "rdma", .module = THIS_MODULE, - .is_fabrics = true, + .flags = NVME_F_FABRICS, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index feb497134aee..e503cfff0337 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -558,7 +558,7 @@ static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .name = "loop", .module = THIS_MODULE, - .is_fabrics = true, + .flags = NVME_F_FABRICS, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, From c81bfba9983fc44210d3eb5971e0faac597bf50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 15:14:45 +0200 Subject: [PATCH 233/254] nvme: only setup block integrity if supported by the driver Currently only the PCIe driver supports metadata, so we should not claim integrity support for the other drivers. This prevents nasty crashes with targets that advertise metadata support on fabrics. Also use the opportunity to factor out some code into a separate helper that isn't even compiled if CONFIG_BLK_DEV_INTEGRITY is disabled. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 50 +++++++++++++++++++++++++--------------- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 1 + 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 228f7c73e2f1..a60926410438 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -925,6 +925,29 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, + u16 bs) +{ + struct nvme_ns *ns = disk->private_data; + u16 old_ms = ns->ms; + u8 pi_type = 0; + + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + + /* PI implementation requires metadata equal t10 pi tuple size */ + if (ns->ms == sizeof(struct t10_pi_tuple)) + pi_type = id->dps & NVME_NS_DPS_PI_MASK; + + if (blk_get_integrity(disk) && + (ns->pi_type != pi_type || ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && ns->ext))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; +} + static void nvme_init_integrity(struct nvme_ns *ns) { struct blk_integrity integrity; @@ -951,6 +974,10 @@ static void nvme_init_integrity(struct nvme_ns *ns) blk_queue_max_integrity_segments(ns->queue, 1); } #else +static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, + u16 bs) +{ +} static void nvme_init_integrity(struct nvme_ns *ns) { } @@ -997,37 +1024,22 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - old_ms = ns->ms; - lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + u16 bs; /* * If identify namespace failed, use default 512 byte block size so * block layer can use before failing read/write for 0 capacity. */ + ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; bs = 1 << ns->lba_shift; - /* XXX: PI implementation requires metadata equal t10 pi tuple size */ - pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? - id->dps & NVME_NS_DPS_PI_MASK : 0; blk_mq_freeze_queue(disk->queue); - if (blk_get_integrity(disk) && (ns->pi_type != pi_type || - ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - ns->pi_type = pi_type; + if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) + nvme_prep_integrity(disk, id, bs); blk_queue_logical_block_size(ns->queue, bs); - if (ns->ms && !blk_get_integrity(disk) && !ns->ext) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7c4b0f6636c5..9d6a070d4391 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -210,6 +210,7 @@ struct nvme_ctrl_ops { struct module *module; unsigned int flags; #define NVME_F_FABRICS (1 << 0) +#define NVME_F_METADATA_SUPPORTED (1 << 1) int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bf8bec39c017..6103b178e43a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2047,6 +2047,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, + .flags = NVME_F_METADATA_SUPPORTED, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, From 50af47d04ca530544b27affffb0722f158e2bb9c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 24 May 2017 15:06:31 -0700 Subject: [PATCH 234/254] nvme: Quirk APST on Intel 600P/P3100 devices They have known firmware bugs. A fix is apparently in the works -- once fixed firmware is available, someone from Intel (Hi, Keith!) can adjust the quirk accordingly. Cc: stable@vger.kernel.org # v4.11 Cc: Kai-Heng Feng Cc: Mario Limonciello Signed-off-by: Andy Lutomirski Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6103b178e43a..d52701df7245 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2294,6 +2294,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0x0a54), .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ From 702644ec1cab10ffefcebb4060d4da46d4ef2c7f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 24 May 2017 20:04:41 +0200 Subject: [PATCH 235/254] x86/timers: Move simple_udelay_calibration past init_hypervisor_platform This ensures that adjustments to x86_platform done by the hypervisor setup is already respected by this simple calibration. The current user of this, introduced by 1b5aeebf3a92 ("x86/earlyprintk: Add support for earlyprintk via USB3 debug port"), comes much later into play. Fixes: dd759d93f4dd ("x86/timers: Add simple udelay calibration") Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Acked-by: Lu Baolu Link: http://lkml.kernel.org/r/5e89fe60-aab3-2c1c-aba8-32f8ad376189@siemens.com --- arch/x86/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0b4d3c686b1e..f81823695014 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -980,8 +980,6 @@ void __init setup_arch(char **cmdline_p) */ x86_configure_nx(); - simple_udelay_calibration(); - parse_early_param(); #ifdef CONFIG_MEMORY_HOTPLUG @@ -1041,6 +1039,8 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + simple_udelay_calibration(); + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ From a8ecdd7117ee68fe27009acc8021423870c1dcd7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 25 May 2017 16:38:06 -0700 Subject: [PATCH 236/254] blk-mq: Only register debugfs attributes for blk-mq queues The code in blk-mq-debugfs.c assumes that it is working on a blk-mq queue and is not intended to work on a blk-sq queue. Hence only register blk-mq debugfs attributes for blk-mq queues. Fixes: commit 9c1051aacde8 ("blk-mq: untangle debugfs and sysfs") Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Ming Lei Reviewed-by: Omar Sandoval Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 504fee940052..712b018e9f54 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -887,10 +887,10 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } - if (q->mq_ops) + if (q->mq_ops) { __blk_mq_register_dev(dev, q); - - blk_mq_debugfs_register(q); + blk_mq_debugfs_register(q); + } kobject_uevent(&q->kobj, KOBJ_ADD); From 83b4605b0c16cde5b00c8cf192408d51eab75402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 18:59:54 +0200 Subject: [PATCH 237/254] PCI/msi: fix the pci_alloc_irq_vectors_affinity stub We need to return an error for any call that asks for MSI / MSI-X vectors only, so that non-trivial fallback logic can work properly. Also valid dev->irq and use the "correct" errno value based on feedback from Linus. Signed-off-by: Christoph Hellwig Reported-by: Steven Rostedt Fixes: aff17164 ("PCI: Provide sensible IRQ vector alloc/free routines") Signed-off-by: Linus Torvalds --- include/linux/pci.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index 33c2b0b77429..fc2e832d7b9c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1342,9 +1342,9 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags, const struct irq_affinity *aff_desc) { - if (min_vecs > 1) - return -EINVAL; - return 1; + if ((flags & PCI_IRQ_LEGACY) && min_vecs == 1 && dev->irq) + return 1; + return -ENOSPC; } static inline void pci_free_irq_vectors(struct pci_dev *dev) From 0908cf4dfef35fc6ac12329007052ebe93ff1081 Mon Sep 17 00:00:00 2001 From: linzhang Date: Thu, 25 May 2017 14:07:18 +0800 Subject: [PATCH 238/254] net: llc: add lock_sock in llc_ui_bind to avoid a race condition There is a race condition in llc_ui_bind if two or more processes/threads try to bind a same socket. If more processes/threads bind a same socket success that will lead to two problems, one is this action is not what we expected, another is will lead to kernel in unstable status or oops(in my simple test case, cause llc2.ko can't unload). The current code is test SOCK_ZAPPED bit to avoid a process to bind a same socket twice but that is can't avoid more processes/threads try to bind a same socket at the same time. So, add lock_sock in llc_ui_bind like others, such as llc_ui_connect. Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- net/llc/af_llc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8364fe5b59e4..c38d16f22d2a 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -311,6 +311,8 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) int rc = -EINVAL; dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); + + lock_sock(sk); if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; @@ -382,6 +384,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) out_put: llc_sap_put(sap); out: + release_sock(sk); return rc; } From 804ec7ebe8ea003999ca8d1bfc499edc6a9e07df Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 25 May 2017 19:14:56 +0200 Subject: [PATCH 239/254] sctp: fix ICMP processing if skb is non-linear sometimes ICMP replies to INIT chunks are ignored by the client, even if the encapsulated SCTP headers match an open socket. This happens when the ICMP packet is carried by a paged skb: use skb_header_pointer() to read packet contents beyond the SCTP header, so that chunk header and initiate tag are validated correctly. v2: - don't use skb_header_pointer() to read the transport header, since icmp_socket_deliver() already puts these 8 bytes in the linear area. - change commit message to make specific reference to INIT chunks. Signed-off-by: Davide Caratti Acked-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/input.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 0e06a278d2a9..ba9ad32fc447 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -473,15 +473,14 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctp_association **app, struct sctp_transport **tpp) { + struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; - struct sctp_init_chunk *chunkhdr; __u32 vtag = ntohl(sctphdr->vtag); - int len = skb->len - ((void *)sctphdr - (void *)skb->data); *app = NULL; *tpp = NULL; @@ -516,13 +515,16 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, * discard the packet. */ if (vtag == 0) { - chunkhdr = (void *)sctphdr + sizeof(struct sctphdr); - if (len < sizeof(struct sctphdr) + sizeof(sctp_chunkhdr_t) - + sizeof(__be32) || + /* chunk header + first 4 octects of init header */ + chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr), + sizeof(struct sctp_chunkhdr) + + sizeof(__be32), &_chunkhdr); + if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || - ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) { + ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; - } + } else if (vtag != asoc->c.peer_vtag) { goto out; } From 0e9a709560dbcfbace8bf4019dc5298619235891 Mon Sep 17 00:00:00 2001 From: Peter Dawson Date: Fri, 26 May 2017 06:35:18 +1000 Subject: [PATCH 240/254] ip6_tunnel, ip6_gre: fix setting of DSCP on encapsulated packets This fix addresses two problems in the way the DSCP field is formulated on the encapsulating header of IPv6 tunnels. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195661 1) The IPv6 tunneling code was manipulating the DSCP field of the encapsulating packet using the 32b flowlabel. Since the flowlabel is only the lower 20b it was incorrect to assume that the upper 12b containing the DSCP and ECN fields would remain intact when formulating the encapsulating header. This fix handles the 'inherit' and 'fixed-value' DSCP cases explicitly using the extant dsfield u8 variable. 2) The use of INET_ECN_encapsulate(0, dsfield) in ip6_tnl_xmit was incorrect and resulted in the DSCP value always being set to 0. Commit 90427ef5d2a4 ("ipv6: fix flow labels when the traffic class is non-0") caused the regression by masking out the flowlabel which exposed the incorrect handling of the DSCP portion of the flowlabel in ip6_tunnel and ip6_gre. Fixes: 90427ef5d2a4 ("ipv6: fix flow labels when the traffic class is non-0") Signed-off-by: Peter Dawson Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 13 +++++++------ net/ipv6/ip6_tunnel.c | 21 +++++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 8d128ba79b66..0c5b4caa1949 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -537,11 +537,10 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - dsfield = ipv4_get_dsfield(iph); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; + dsfield = ipv4_get_dsfield(iph); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else @@ -598,9 +597,11 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + dsfield = ipv6_get_dsfield(ipv6h); + else + dsfield = ip6_tclass(t->parms.flowinfo); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 6eb2ae507500..7ae6c503f1ca 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1196,7 +1196,7 @@ route_lookup: skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; @@ -1231,8 +1231,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (tproto != IPPROTO_IPIP && tproto != 0) return -1; - dsfield = ipv4_get_dsfield(iph); - if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; @@ -1246,6 +1244,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPIP; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; + dsfield = ip6_tclass(key->label); } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; @@ -1254,8 +1253,9 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPIP; if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; + dsfield = ipv4_get_dsfield(iph); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else @@ -1267,6 +1267,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph)); + skb_set_inner_ipproto(skb, IPPROTO_IPIP); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, @@ -1300,8 +1302,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) ip6_tnl_addr_conflict(t, ipv6h)) return -1; - dsfield = ipv6_get_dsfield(ipv6h); - if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; @@ -1315,6 +1315,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPV6; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; + dsfield = ip6_tclass(key->label); } else { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ @@ -1337,7 +1338,9 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowi6_proto = IPPROTO_IPV6; if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK); + dsfield = ipv6_get_dsfield(ipv6h); + else + dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) @@ -1351,6 +1354,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h)); + skb_set_inner_ipproto(skb, IPPROTO_IPV6); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, From 82533ad9a1ce3a7a6863849a552c2cc041b55e0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 25 May 2017 22:54:53 +0200 Subject: [PATCH 241/254] net: ethernet: ax88796: don't call free_irq without request_irq first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function ax_init_dev (which is called only from the driver's .probe function) calls free_irq in the error path without having requested the irq in the first place. So drop the free_irq call in the error path. Fixes: 825a2ff1896e ("AX88796 network driver") Signed-off-by: Uwe Kleine-König Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/ax88796.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c index b0a3b85fc6f8..db02bc2fb4b2 100644 --- a/drivers/net/ethernet/8390/ax88796.c +++ b/drivers/net/ethernet/8390/ax88796.c @@ -748,13 +748,13 @@ static int ax_init_dev(struct net_device *dev) ret = ax_mii_init(dev); if (ret) - goto out_irq; + goto err_out; ax_NS8390_init(dev, 0); ret = register_netdev(dev); if (ret) - goto out_irq; + goto err_out; netdev_info(dev, "%dbit, irq %d, %lx, MAC: %pM\n", ei_local->word16 ? 16 : 8, dev->irq, dev->base_addr, @@ -762,9 +762,6 @@ static int ax_init_dev(struct net_device *dev) return 0; - out_irq: - /* cleanup irq */ - free_irq(dev->irq, dev); err_out: return ret; } From 3fb07daff8e99243366a081e5129560734de4ada Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 May 2017 14:27:35 -0700 Subject: [PATCH 242/254] ipv4: add reference counting to metrics Andrey Konovalov reported crashes in ipv4_mtu() I could reproduce the issue with KASAN kernels, between 10.246.7.151 and 10.246.7.152 : 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 & 2) At the same time run following loop : while : do ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 done Cong Wang attempted to add back rt->fi in commit 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting") but this proved to add some issues that were complex to solve. Instead, I suggested to add a refcount to the metrics themselves, being a standalone object (in particular, no reference to other objects) I tried to make this patch as small as possible to ease its backport, instead of being super clean. Note that we believe that only ipv4 dst need to take care of the metric refcount. But if this is wrong, this patch adds the basic infrastructure to extend this to other families. Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang for his efforts on this problem. Fixes: 2860583fe840 ("ipv4: Kill rt->fi") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Reviewed-by: Julian Anastasov Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/dst.h | 8 +++++++- include/net/ip_fib.h | 10 +++++----- net/core/dst.c | 23 ++++++++++++++--------- net/ipv4/fib_semantics.c | 17 ++++++++++------- net/ipv4/route.c | 10 +++++++++- 5 files changed, 45 insertions(+), 23 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 049af33da3b6..cfc043784166 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -107,10 +107,16 @@ struct dst_entry { }; }; +struct dst_metrics { + u32 metrics[RTAX_MAX]; + atomic_t refcnt; +}; +extern const struct dst_metrics dst_default_metrics; + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); -extern const u32 dst_default_metrics[]; #define DST_METRICS_READ_ONLY 0x1UL +#define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6692c5758b33..f7f6aa789c61 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -114,11 +114,11 @@ struct fib_info { __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; - u32 *fib_metrics; -#define fib_mtu fib_metrics[RTAX_MTU-1] -#define fib_window fib_metrics[RTAX_WINDOW-1] -#define fib_rtt fib_metrics[RTAX_RTT-1] -#define fib_advmss fib_metrics[RTAX_ADVMSS-1] + struct dst_metrics *fib_metrics; +#define fib_mtu fib_metrics->metrics[RTAX_MTU-1] +#define fib_window fib_metrics->metrics[RTAX_WINDOW-1] +#define fib_rtt fib_metrics->metrics[RTAX_RTT-1] +#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_weight; diff --git a/net/core/dst.c b/net/core/dst.c index 960e503b5a52..6192f11beec9 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ - [RTAX_MAX] = 0xdeadbeef, + .refcnt = ATOMIC_INIT(1), }; void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, if (dev) dev_hold(dev); dst->ops = ops; - dst_init_metrics(dst, dst_default_metrics, true); + dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; dst->path = dst; dst->from = NULL; @@ -314,25 +314,30 @@ EXPORT_SYMBOL(dst_release); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { - u32 *old_p = __DST_METRICS_PTR(old); + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + atomic_set(&p->refcnt, 1); + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); - p = __DST_METRICS_PTR(prev); + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; + } else if (prev & DST_METRICS_REFCOUNTED) { + if (atomic_dec_and_test(&old_p->refcnt)) + kfree(old_p); } } - return p; + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); + return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -341,7 +346,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da449ddb8cc1..ad9ad4aab5da 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -203,6 +203,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -213,8 +214,9 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); + m = fi->fib_metrics; + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + kfree(m); kfree(fi); } @@ -971,11 +973,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; - fi->fib_metrics[type - 1] = val; + fi->fib_metrics->metrics[type - 1] = val; } if (ecn_ca) - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } @@ -1033,11 +1035,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; fib_info_cnt++; if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); if (!fi->fib_metrics) goto failure; + atomic_set(&fi->fib_metrics->refcnt, 1); } else - fi->fib_metrics = (u32 *) dst_default_metrics; + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1238,7 +1241,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 655d9eebe43e..6883b3d4ba8f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1385,8 +1385,12 @@ static void rt_add_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + kfree(p); + if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1438,7 +1442,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); + if (fi->fib_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + atomic_inc(&fi->fib_metrics->refcnt); + } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif From f9797c2f20c0160edd718aa467101f3301e57e59 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Thu, 25 May 2017 16:20:38 +0100 Subject: [PATCH 243/254] ftrace: Fix memory leak in ftrace_graph_release() ftrace_hash is being kfree'ed in ftrace_graph_release(), however the ->buckets field is not. This results in a memory leak that is easily captured by kmemleak: unreferenced object 0xffff880038afe000 (size 8192): comm "trace-cmd", pid 238, jiffies 4294916898 (age 9.736s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] __kmalloc+0x12d/0x1a0 [] alloc_ftrace_hash+0x51/0x80 [] __ftrace_graph_open.isra.39.constprop.46+0xa3/0x100 [] ftrace_graph_open+0x68/0xa0 [] do_dentry_open.isra.1+0x1bd/0x2d0 [] vfs_open+0x47/0x60 [] path_openat+0x2a5/0x1020 [] do_filp_open+0x8a/0xf0 [] do_sys_open+0x12f/0x200 [] SyS_open+0x1e/0x20 [] entry_SYSCALL_64_fastpath+0x13/0x94 [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/20170525152038.7661-1-lhenriques@suse.com Cc: stable@vger.kernel.org Fixes: b9b0c831bed2 ("ftrace: Convert graph filter to use hash tables") Signed-off-by: Luis Henriques Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 74fdfe9ed3db..9e5841dc14b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5063,7 +5063,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) } out: - kfree(fgd->new_hash); + free_ftrace_hash(fgd->new_hash); kfree(fgd); return ret; From c93f5cf571e7795f97d49ef51b766cf25e328545 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 May 2017 19:38:17 +0900 Subject: [PATCH 244/254] kprobes/x86: Fix to set RWX bits correctly before releasing trampoline Fix kprobes to set(recover) RWX bits correctly on trampoline buffer before releasing it. Releasing readonly page to module_memfree() crash the kernel. Without this fix, if kprobes user register a bunch of kprobes in function body (since kprobes on function entry usually use ftrace) and unregister it, kernel hits a BUG and crash. Link: http://lkml.kernel.org/r/149570868652.3518.14120169373590420503.stgit@devbox Signed-off-by: Masami Hiramatsu Fixes: d0381c81c2f7 ("kprobes/x86: Set kprobes pages read-only") Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/kprobes/core.c | 9 +++++++++ kernel/kprobes.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 5b2bbfbb3712..6b877807598b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -417,6 +418,14 @@ static void prepare_boost(struct kprobe *p, struct insn *insn) } } +/* Recover page to RW mode before releasing it */ +void free_insn_page(void *page) +{ + set_memory_nx((unsigned long)page & PAGE_MASK, 1); + set_memory_rw((unsigned long)page & PAGE_MASK, 1); + module_memfree(page); +} + static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d2d3a568e4e..adfe3b4cfe05 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -122,7 +122,7 @@ static void *alloc_insn_page(void) return module_alloc(PAGE_SIZE); } -static void free_insn_page(void *page) +void __weak free_insn_page(void *page) { module_memfree(page); } From bdd7e3d68492bfb7ade574f8c64b87bea499ca2e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 26 May 2017 13:44:54 +0900 Subject: [PATCH 245/254] selftests/ftrace: Add a testcase for many kprobe events Add a testcase to test kprobes via ftrace interface with many concurrent kprobe events. This tries to add many kprobe events (up to 256) on kernel functions. To avoid making ftrace-based kprobes (kprobes on fentry), it skips first N bytes (on x86 N=5, on ppc or arm N=4) of function entry. After that, it enables all those events, disable it, and remove it. Since the unoptimization buffer reclaiming will be delayed, after removing events, it will wait enough time. Link: http://lkml.kernel.org/r/149577388470.11702.11832460851769204511.stgit@devbox Signed-off-by: Masami Hiramatsu Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- .../ftrace/test.d/kprobe/multiple_kprobes.tc | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc new file mode 100644 index 000000000000..f4d1ff785d67 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc @@ -0,0 +1,21 @@ +#!/bin/sh +# description: Register/unregister many kprobe events + +# ftrace fentry skip size depends on the machine architecture. +# Currently HAVE_KPROBES_ON_FTRACE defined on x86 and powerpc +case `uname -m` in + x86_64|i[3456]86) OFFS=5;; + ppc*) OFFS=4;; + *) OFFS=0;; +esac + +echo "Setup up to 256 kprobes" +grep t /proc/kallsyms | cut -f3 -d" " | grep -v .*\\..* | \ +head -n 256 | while read i; do echo p ${i}+${OFFS} ; done > kprobe_events ||: + +echo 1 > events/kprobes/enable +echo 0 > events/kprobes/enable +echo > kprobe_events +echo "Waiting for unoptimizing & freeing" +sleep 5 +echo "Done" From a53276e2826010338478ed94310874001a8097fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 May 2017 10:14:11 -0400 Subject: [PATCH 246/254] x86/mm/ftrace: Do not bug in early boot on irqs_disabled in cpu_flush_range() With function tracing starting in early bootup and having its trampoline pages being read only, a bug triggered with the following: kernel BUG at arch/x86/mm/pageattr.c:189! invalid opcode: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.12.0-rc2-test+ #3 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 task: ffffffffb4222500 task.stack: ffffffffb4200000 RIP: 0010:change_page_attr_set_clr+0x269/0x302 RSP: 0000:ffffffffb4203c88 EFLAGS: 00010046 RAX: 0000000000000046 RBX: 0000000000000000 RCX: 00000001b6000000 RDX: ffffffffb4203d40 RSI: 0000000000000000 RDI: ffffffffb4240d60 RBP: ffffffffb4203d18 R08: 00000001b6000000 R09: 0000000000000001 R10: ffffffffb4203aa8 R11: 0000000000000003 R12: ffffffffc029b000 R13: ffffffffb4203d40 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff9a639ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff9a636b384000 CR3: 00000001ea21d000 CR4: 00000000000406b0 Call Trace: change_page_attr_clear+0x1f/0x21 set_memory_ro+0x1e/0x20 arch_ftrace_update_trampoline+0x207/0x21c ? ftrace_caller+0x64/0x64 ? 0xffffffffc029b000 ftrace_startup+0xf4/0x198 register_ftrace_function+0x26/0x3c function_trace_init+0x5e/0x73 tracer_init+0x1e/0x23 tracing_set_tracer+0x127/0x15a register_tracer+0x19b/0x1bc init_function_trace+0x90/0x92 early_trace_init+0x236/0x2b3 start_kernel+0x200/0x3f5 x86_64_start_reservations+0x29/0x2b x86_64_start_kernel+0x17c/0x18f secondary_startup_64+0x9f/0x9f ? secondary_startup_64+0x9f/0x9f Interrupts should not be enabled at this early in the boot process. It is also fine to leave interrupts enabled during this time as there's only one CPU running, and on_each_cpu() means to only run on the current CPU. If early_boot_irqs_disabled is set, it is safe to run cpu_flush_range() with interrupts disabled. Don't trigger a BUG_ON() in that case. Link: http://lkml.kernel.org/r/20170526093717.0be3b849@gandalf.local.home Suggested-by: Thomas Gleixner Signed-off-by: Steven Rostedt (VMware) --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1dcd2be4cce4..c8520b2c62d2 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -186,7 +186,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) unsigned int i, level; unsigned long addr; - BUG_ON(irqs_disabled()); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); on_each_cpu(__cpa_flush_range, NULL, 1); From 6ee98ffeea0bc9e072e419497d78697d8afcdd6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 May 2017 10:57:51 +0200 Subject: [PATCH 247/254] x86/ftrace: Make sure that ftrace trampolines are not RWX ftrace use module_alloc() to allocate trampoline pages. The mapping of module_alloc() is RWX, which makes sense as the memory is written to right after allocation. But nothing makes these pages RO after writing to them. Add proper set_memory_rw/ro() calls to protect the trampolines after modification. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705251056410.1862@nanos Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0651e974dcb3..9bef1bbeba63 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -689,8 +689,12 @@ static inline void *alloc_tramp(unsigned long size) { return module_alloc(size); } -static inline void tramp_free(void *tramp) +static inline void tramp_free(void *tramp, int size) { + int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; + + set_memory_nx((unsigned long)tramp, npages); + set_memory_rw((unsigned long)tramp, npages); module_memfree(tramp); } #else @@ -699,7 +703,7 @@ static inline void *alloc_tramp(unsigned long size) { return NULL; } -static inline void tramp_free(void *tramp) { } +static inline void tramp_free(void *tramp, int size) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ @@ -771,7 +775,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* Copy ftrace_caller onto the trampoline memory */ ret = probe_kernel_read(trampoline, (void *)start_offset, size); if (WARN_ON(ret < 0)) { - tramp_free(trampoline); + tramp_free(trampoline, *tramp_size); return 0; } @@ -797,7 +801,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* Are we pointing to the reference? */ if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) { - tramp_free(trampoline); + tramp_free(trampoline, *tramp_size); return 0; } @@ -839,7 +843,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) unsigned long offset; unsigned long ip; unsigned int size; - int ret; + int ret, npages; if (ops->trampoline) { /* @@ -848,11 +852,14 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) */ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; + npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT; + set_memory_rw(ops->trampoline, npages); } else { ops->trampoline = create_trampoline(ops, &size); if (!ops->trampoline) return; ops->trampoline_size = size; + npages = PAGE_ALIGN(size) >> PAGE_SHIFT; } offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); @@ -863,6 +870,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); ret = update_ftrace_func(ip, new); + set_memory_ro(ops->trampoline, npages); /* The update should never fail */ WARN_ON(ret); @@ -939,7 +947,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; - tramp_free((void *)ops->trampoline); + tramp_free((void *)ops->trampoline, ops->trampoline_size); ops->trampoline = 0; } From 5ed02dbb497422bf225783f46e6eadd237d23d6b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 May 2017 17:20:53 -0700 Subject: [PATCH 248/254] Linux 4.12-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 63e10bd4f14a..470bd4d9513a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Fearless Coyote # *DOCUMENTATION* From 55b644fd2431cfd28d04cc28f092d49e7bea3433 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sat, 6 May 2017 19:37:45 +0200 Subject: [PATCH 249/254] usb: dwc2: add support for the DWC2 controller on Meson8 SoCs USB support in the Meson8 SoCs is provided by a DWC2 controller which works with the same settings as Meson8b and GXBB. Using the generic "snps,dwc2" binding results in an endless stream of "Overcurrent change detected" messages. Acked-by: Rob Herring Signed-off-by: Martin Blumenstingl Signed-off-by: Felipe Balbi --- Documentation/devicetree/bindings/usb/dwc2.txt | 1 + drivers/usb/dwc2/params.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/usb/dwc2.txt b/Documentation/devicetree/bindings/usb/dwc2.txt index 00bea038639e..fcf199b64d3d 100644 --- a/Documentation/devicetree/bindings/usb/dwc2.txt +++ b/Documentation/devicetree/bindings/usb/dwc2.txt @@ -10,6 +10,7 @@ Required properties: - "rockchip,rk3288-usb", "rockchip,rk3066-usb", "snps,dwc2": for rk3288 Soc; - "lantiq,arx100-usb": The DWC2 USB controller instance in Lantiq ARX SoCs; - "lantiq,xrx200-usb": The DWC2 USB controller instance in Lantiq XRX SoCs; + - "amlogic,meson8-usb": The DWC2 USB controller instance in Amlogic Meson8 SoCs; - "amlogic,meson8b-usb": The DWC2 USB controller instance in Amlogic Meson8b SoCs; - "amlogic,meson-gxbb-usb": The DWC2 USB controller instance in Amlogic S905 SoCs; - "amcc,dwc-otg": The DWC2 USB controller instance in AMCC Canyonlands 460EX SoCs; diff --git a/drivers/usb/dwc2/params.c b/drivers/usb/dwc2/params.c index 9cd8722f24f6..a3ffe97170ff 100644 --- a/drivers/usb/dwc2/params.c +++ b/drivers/usb/dwc2/params.c @@ -144,6 +144,8 @@ const struct of_device_id dwc2_of_match_table[] = { { .compatible = "lantiq,xrx200-usb", .data = dwc2_set_ltq_params }, { .compatible = "snps,dwc2" }, { .compatible = "samsung,s3c6400-hsotg" }, + { .compatible = "amlogic,meson8-usb", + .data = dwc2_set_amlogic_params }, { .compatible = "amlogic,meson8b-usb", .data = dwc2_set_amlogic_params }, { .compatible = "amlogic,meson-gxbb-usb", From dc9217b69dd6089dcfeb86ed4b3c671504326087 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Thu, 11 May 2017 17:26:48 -0700 Subject: [PATCH 250/254] usb: gadget: f_mass_storage: Serialize wake and sleep execution f_mass_storage has a memorry barrier issue with the sleep and wake functions that can cause a deadlock. This results in intermittent hangs during MSC file transfer. The host will reset the device after receiving no response to resume the transfer. This issue is seen when dwc3 is processing 2 transfer-in-progress events at the same time, invoking completion handlers for CSW and CBW. Also this issue occurs depending on the system timing and latency. To increase the chance to hit this issue, you can force dwc3 driver to wait and process those 2 events at once by adding a small delay (~100us) in dwc3_check_event_buf() whenever the request is for CSW and read the event count again. Avoid debugging with printk and ftrace as extra delays and memory barrier will mask this issue. Scenario which can lead to failure: ----------------------------------- 1) The main thread sleeps and waits for the next command in get_next_command(). 2) bulk_in_complete() wakes up main thread for CSW. 3) bulk_out_complete() tries to wake up the running main thread for CBW. 4) thread_wakeup_needed is not loaded with correct value in sleep_thread(). 5) Main thread goes to sleep again. The pattern is shown below. Note the 2 critical variables. * common->thread_wakeup_needed * bh->state CPU 0 (sleep_thread) CPU 1 (wakeup_thread) ============================== =============================== bh->state = BH_STATE_FULL; smp_wmb(); thread_wakeup_needed = 0; thread_wakeup_needed = 1; smp_rmb(); if (bh->state != BH_STATE_FULL) sleep again ... As pointed out by Alan Stern, this is an R-pattern issue. The issue can be seen when there are two wakeups in quick succession. The thread_wakeup_needed can be overwritten in sleep_thread, and the read of the bh->state maybe reordered before the write to thread_wakeup_needed. This patch applies full memory barrier smp_mb() in both sleep_thread() and wakeup_thread() to ensure the order which the thread_wakeup_needed and bh->state are written and loaded. However, a better solution in the future would be to use wait_queue method that takes care of managing memory barrier between waker and waiter. Cc: Acked-by: Alan Stern Signed-off-by: Thinh Nguyen Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_mass_storage.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 4c8aacc232c0..74d57d6994da 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -396,7 +396,11 @@ static int fsg_set_halt(struct fsg_dev *fsg, struct usb_ep *ep) /* Caller must hold fsg->lock */ static void wakeup_thread(struct fsg_common *common) { - smp_wmb(); /* ensure the write of bh->state is complete */ + /* + * Ensure the reading of thread_wakeup_needed + * and the writing of bh->state are completed + */ + smp_mb(); /* Tell the main thread that something has happened */ common->thread_wakeup_needed = 1; if (common->thread_task) @@ -627,7 +631,12 @@ static int sleep_thread(struct fsg_common *common, bool can_freeze) } __set_current_state(TASK_RUNNING); common->thread_wakeup_needed = 0; - smp_rmb(); /* ensure the latest bh->state is visible */ + + /* + * Ensure the writing of thread_wakeup_needed + * and the reading of bh->state are completed + */ + smp_mb(); return rc; } From cdc876877ebc3f0677b267756d4564e2a429e730 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 26 Apr 2017 20:50:07 +0900 Subject: [PATCH 251/254] usb: gadget: udc: renesas_usb3: fix pm_runtime functions calling This patch fixes an issue that this driver is possible to access the registers before pm_runtime_get_sync() if a gadget driver is installed first. After that, oops happens on R-Car Gen3 environment. To avoid it, this patch changes the pm_runtime call timing from probe/remove to udc_start/udc_stop. Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 5a2d845fb1a6..10585da8e360 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1799,6 +1799,9 @@ static int renesas_usb3_start(struct usb_gadget *gadget, /* hook up the driver */ usb3->driver = driver; + pm_runtime_enable(usb3_to_dev(usb3)); + pm_runtime_get_sync(usb3_to_dev(usb3)); + renesas_usb3_init_controller(usb3); return 0; @@ -1816,6 +1819,9 @@ static int renesas_usb3_stop(struct usb_gadget *gadget) renesas_usb3_stop_controller(usb3); spin_unlock_irqrestore(&usb3->lock, flags); + pm_runtime_put(usb3_to_dev(usb3)); + pm_runtime_disable(usb3_to_dev(usb3)); + return 0; } @@ -1891,9 +1897,6 @@ static int renesas_usb3_remove(struct platform_device *pdev) device_remove_file(&pdev->dev, &dev_attr_role); - pm_runtime_put(&pdev->dev); - pm_runtime_disable(&pdev->dev); - usb_del_gadget_udc(&usb3->gadget); __renesas_usb3_ep_free_request(usb3->ep0_req); @@ -2099,9 +2102,6 @@ static int renesas_usb3_probe(struct platform_device *pdev) usb3->workaround_for_vbus = priv->workaround_for_vbus; - pm_runtime_enable(&pdev->dev); - pm_runtime_get_sync(&pdev->dev); - dev_info(&pdev->dev, "probed\n"); return 0; From 067d6fdc558d2c43f0bfdc7af99630dd5eb08dc5 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 26 Apr 2017 20:50:08 +0900 Subject: [PATCH 252/254] usb: gadget: udc: renesas_usb3: fix deadlock by spinlock This patch fixes an issue that this driver is possible to cause deadlock by double-spinclocked in renesas_usb3_stop_controller(). So, this patch removes spinlock API calling in renesas_usb3_stop(). (In other words, the previous code had a redundant lock.) Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 10585da8e360..e27d948617e2 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1810,14 +1810,11 @@ static int renesas_usb3_start(struct usb_gadget *gadget, static int renesas_usb3_stop(struct usb_gadget *gadget) { struct renesas_usb3 *usb3 = gadget_to_renesas_usb3(gadget); - unsigned long flags; - spin_lock_irqsave(&usb3->lock, flags); usb3->softconnect = false; usb3->gadget.speed = USB_SPEED_UNKNOWN; usb3->driver = NULL; renesas_usb3_stop_controller(usb3); - spin_unlock_irqrestore(&usb3->lock, flags); pm_runtime_put(usb3_to_dev(usb3)); pm_runtime_disable(usb3_to_dev(usb3)); From 940f538a100c84c6e72813e4ac88bd1753a86945 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 26 Apr 2017 20:50:09 +0900 Subject: [PATCH 253/254] usb: gadget: udc: renesas_usb3: lock for PN_ registers access This controller disallows to change the PIPE until reading/writing a packet finishes. However. the previous code is not enough to hold the lock in some functions. So, this patch fixes it. Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 28 ++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index e27d948617e2..c05097b5661e 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1475,7 +1475,13 @@ static void usb3_request_done_pipen(struct renesas_usb3 *usb3, struct renesas_usb3_request *usb3_req, int status) { - usb3_pn_stop(usb3); + unsigned long flags; + + spin_lock_irqsave(&usb3->lock, flags); + if (usb3_pn_change(usb3, usb3_ep->num)) + usb3_pn_stop(usb3); + spin_unlock_irqrestore(&usb3->lock, flags); + usb3_disable_pipe_irq(usb3, usb3_ep->num); usb3_request_done(usb3_ep, usb3_req, status); @@ -1504,30 +1510,46 @@ static void usb3_irq_epc_pipen_bfrdy(struct renesas_usb3 *usb3, int num) { struct renesas_usb3_ep *usb3_ep = usb3_get_ep(usb3, num); struct renesas_usb3_request *usb3_req = usb3_get_request(usb3_ep); + bool done = false; if (!usb3_req) return; + spin_lock(&usb3->lock); + if (usb3_pn_change(usb3, num)) + goto out; + if (usb3_ep->dir_in) { /* Do not stop the IN pipe here to detect LSTTR interrupt */ if (!usb3_write_pipe(usb3_ep, usb3_req, USB3_PN_WRITE)) usb3_clear_bit(usb3, PN_INT_BFRDY, USB3_PN_INT_ENA); } else { if (!usb3_read_pipe(usb3_ep, usb3_req, USB3_PN_READ)) - usb3_request_done_pipen(usb3, usb3_ep, usb3_req, 0); + done = true; } + +out: + /* need to unlock because usb3_request_done_pipen() locks it */ + spin_unlock(&usb3->lock); + + if (done) + usb3_request_done_pipen(usb3, usb3_ep, usb3_req, 0); } static void usb3_irq_epc_pipen(struct renesas_usb3 *usb3, int num) { u32 pn_int_sta; - if (usb3_pn_change(usb3, num) < 0) + spin_lock(&usb3->lock); + if (usb3_pn_change(usb3, num) < 0) { + spin_unlock(&usb3->lock); return; + } pn_int_sta = usb3_read(usb3, USB3_PN_INT_STA); pn_int_sta &= usb3_read(usb3, USB3_PN_INT_ENA); usb3_write(usb3, pn_int_sta, USB3_PN_INT_STA); + spin_unlock(&usb3->lock); if (pn_int_sta & PN_INT_LSTTR) usb3_irq_epc_pipen_lsttr(usb3, num); if (pn_int_sta & PN_INT_BFRDY) From afbbc7913a288c29616bd31ae612548f6475151a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 26 Apr 2017 20:50:10 +0900 Subject: [PATCH 254/254] usb: gadget: udc: renesas_usb3: Fix PN_INT_ENA disabling timing The PN_INT_ENA register should be used after usb3_pn_change() is called. So, this patch moves the access from renesas_usb3_stop_controller() to usb3_disable_pipe_n(). Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index c05097b5661e..cd4c88529721 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -623,7 +623,6 @@ static void renesas_usb3_stop_controller(struct renesas_usb3 *usb3) { usb3_disconnect(usb3); usb3_write(usb3, 0, USB3_P0_INT_ENA); - usb3_write(usb3, 0, USB3_PN_INT_ENA); usb3_write(usb3, 0, USB3_USB_OTG_INT_ENA); usb3_write(usb3, 0, USB3_USB_INT_ENA_1); usb3_write(usb3, 0, USB3_USB_INT_ENA_2); @@ -1682,6 +1681,7 @@ static int usb3_disable_pipe_n(struct renesas_usb3_ep *usb3_ep) spin_lock_irqsave(&usb3->lock, flags); if (!usb3_pn_change(usb3, usb3_ep->num)) { + usb3_write(usb3, 0, USB3_PN_INT_ENA); usb3_write(usb3, 0, USB3_PN_RAMMAP); usb3_clear_bit(usb3, PN_CON_EN, USB3_PN_CON); }