From 6fce983f9b3ef51d47e647b2cff15049ef803781 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Tue, 13 Dec 2016 11:03:49 +0000 Subject: [PATCH 001/297] ASoC: dwc: Fix PIO mode initialization We can no longer rely on the return value of devm_snd_dmaengine_pcm_register(...) to check if the DMA handle is declared in the DT. Previously this check activated PIO mode but currently dma_request_chan returns either a valid channel or -EPROBE_DEFER. In order to activate PIO mode check instead if the interrupt line is declared. This reflects better what is documented in the DT bindings (see Documentation/devicetree/bindings/sound/ designware-i2s.txt). Also, initialize use_pio variable which was never being set causing PIO mode to never work. Signed-off-by: Jose Abreu Signed-off-by: Mark Brown --- sound/soc/dwc/designware_i2s.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/sound/soc/dwc/designware_i2s.c b/sound/soc/dwc/designware_i2s.c index 2998954a1c74..bdf8398cbc81 100644 --- a/sound/soc/dwc/designware_i2s.c +++ b/sound/soc/dwc/designware_i2s.c @@ -681,22 +681,19 @@ static int dw_i2s_probe(struct platform_device *pdev) } if (!pdata) { - ret = devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, 0); - if (ret == -EPROBE_DEFER) { - dev_err(&pdev->dev, - "failed to register PCM, deferring probe\n"); - return ret; - } else if (ret) { - dev_err(&pdev->dev, - "Could not register DMA PCM: %d\n" - "falling back to PIO mode\n", ret); + if (irq >= 0) { ret = dw_pcm_register(pdev); - if (ret) { - dev_err(&pdev->dev, - "Could not register PIO PCM: %d\n", + dev->use_pio = true; + } else { + ret = devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, + 0); + dev->use_pio = false; + } + + if (ret) { + dev_err(&pdev->dev, "could not register pcm: %d\n", ret); - goto err_clk_disable; - } + goto err_clk_disable; } } From 8010d7feb2f0367ae573ad601b2905e29db50cd3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 11 Dec 2016 20:09:23 +0100 Subject: [PATCH 002/297] netfilter: nft_quota: reset quota after dump Dumping of netlink attributes may fail due to insufficient room in the skbuff, so let's reset consumed quota if we succeed to put netlink attributes into the skbuff. Fixes: 43da04a593d8 ("netfilter: nf_tables: atomic dump and reset for stateful objects") Reported-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_quota.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index bd6efc53f26d..2d6fe3559912 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -110,30 +110,32 @@ static int nft_quota_obj_init(const struct nlattr * const tb[], static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, bool reset) { + u64 consumed, consumed_cap; u32 flags = priv->flags; - u64 consumed; - - if (reset) { - consumed = atomic64_xchg(&priv->consumed, 0); - if (test_and_clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) - flags |= NFT_QUOTA_F_DEPLETED; - } else { - consumed = atomic64_read(&priv->consumed); - } /* Since we inconditionally increment consumed quota for each packet * that we see, don't go over the quota boundary in what we send to * userspace. */ - if (consumed > priv->quota) - consumed = priv->quota; + consumed = atomic64_read(&priv->consumed); + if (consumed >= priv->quota) { + consumed_cap = priv->quota; + flags |= NFT_QUOTA_F_DEPLETED; + } else { + consumed_cap = consumed; + } if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), NFTA_QUOTA_PAD) || - nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed), + nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap), NFTA_QUOTA_PAD) || nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) goto nla_put_failure; + + if (reset) { + atomic64_sub(consumed, &priv->consumed); + clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); + } return 0; nla_put_failure: From c2e756ff9e699865d294cdc112acfc36419cf5cc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 11 Dec 2016 20:46:51 +0100 Subject: [PATCH 003/297] netfilter: nft_queue: use raw_smp_processor_id() Using smp_processor_id() causes splats with PREEMPT_RCU: [19379.552780] BUG: using smp_processor_id() in preemptible [00000000] code: ping/32389 [19379.552793] caller is debug_smp_processor_id+0x17/0x19 [...] [19379.552823] Call Trace: [19379.552832] [] dump_stack+0x67/0x90 [19379.552837] [] check_preemption_disabled+0xe5/0xf5 [19379.552842] [] debug_smp_processor_id+0x17/0x19 [19379.552849] [] nft_queue_eval+0x35/0x20c [nft_queue] No need to disable preemption since we only fetch the numeric value, so let's use raw_smp_processor_id() instead. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 3e19fa1230dc..dbb6aaff67ec 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -38,7 +38,7 @@ static void nft_queue_eval(const struct nft_expr *expr, if (priv->queues_total > 1) { if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); queue = priv->queuenum + cpu % priv->queues_total; } else { From 3e38df136e453aa69eb4472108ebce2fb00b1ba6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 13 Dec 2016 13:59:33 +0100 Subject: [PATCH 004/297] netfilter: nf_tables: fix oob access BUG: KASAN: slab-out-of-bounds in nf_tables_rule_destroy+0xf1/0x130 at addr ffff88006a4c35c8 Read of size 8 by task nft/1607 When we've destroyed last valid expr, nft_expr_next() returns an invalid expr. We must not dereference it unless it passes != nft_expr_last() check. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a019a87e58ee..0db5f9782265 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2115,7 +2115,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, * is called on error from nf_tables_newrule(). */ expr = nft_expr_first(rule); - while (expr->ops && expr != nft_expr_last(rule)) { + while (expr != nft_expr_last(rule) && expr->ops) { nf_tables_expr_destroy(ctx, expr); expr = nft_expr_next(expr); } From 053d20f5712529016eae10356e0dea9b360325bd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 14 Dec 2016 12:15:49 +0100 Subject: [PATCH 005/297] netfilter: nft_payload: mangle ckecksum if NFT_PAYLOAD_L4CSUM_PSEUDOHDR is set If the NFT_PAYLOAD_L4CSUM_PSEUDOHDR flag is set, then mangle layer 4 checksum. This should not depend on csum_type NFT_PAYLOAD_CSUM_INET since IPv6 header has no checksum field, but still an update of any of the pseudoheader fields may trigger a layer 4 checksum update. Fixes: 1814096980bb ("netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader fields") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 36d2b1096546..7d699bbd45b0 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -250,6 +250,22 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, return 0; } +static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, + __wsum fsum, __wsum tsum, int csum_offset) +{ + __sum16 sum; + + if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + nft_csum_replace(&sum, fsum, tsum); + if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + return 0; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -259,7 +275,6 @@ static void nft_payload_set_eval(const struct nft_expr *expr, const u32 *src = ®s->data[priv->sreg]; int offset, csum_offset; __wsum fsum, tsum; - __sum16 sum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: @@ -282,18 +297,14 @@ static void nft_payload_set_eval(const struct nft_expr *expr, csum_offset = offset + priv->csum_offset; offset += priv->offset; - if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && + if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) && (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER || skb->ip_summed != CHECKSUM_PARTIAL)) { - if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) - goto err; - fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); - nft_csum_replace(&sum, fsum, tsum); - if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || - skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && + nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset)) goto err; if (priv->csum_flags && From 0ea617a298dcdc2251b4e10f83ac3f3e627b66e3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 8 Dec 2016 13:05:43 +0000 Subject: [PATCH 006/297] ASoC: rsnd: don't double free kctrl On an error, snd_ctl_add already free's kctrl, so calling snd_ctl_free_one to free it again leads to a double free error. Fix this by removing the extraneous snd_ctl_free_one call. Issue found using static analysis with CoverityScan, CID 1372908 Signed-off-by: Colin Ian King Acked-by: Kuninori Morimoto Signed-off-by: Mark Brown --- sound/soc/sh/rcar/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/sh/rcar/core.c b/sound/soc/sh/rcar/core.c index 4bd68de76130..99b5b0835c1e 100644 --- a/sound/soc/sh/rcar/core.c +++ b/sound/soc/sh/rcar/core.c @@ -1030,10 +1030,8 @@ static int __rsnd_kctrl_new(struct rsnd_mod *mod, return -ENOMEM; ret = snd_ctl_add(card, kctrl); - if (ret < 0) { - snd_ctl_free_one(kctrl); + if (ret < 0) return ret; - } cfg->update = update; cfg->card = card; From c2b36129ce53a22b89dd2b88db33e7ffdefe0f41 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 9 Dec 2016 14:17:47 +0000 Subject: [PATCH 007/297] ASoC: topology: kfree kcontrol->private_value before freeing kcontrol kcontrol->private_value is being kfree'd after kcontrol has been freed (in previous call to snd_ctl_remove). Instead, fix this by kfreeing the private_value before kcontrol. CoverityScan CID#1388311 "Read from pointer after free" Fixes: eea3dd4f1247a ("ASoC: topology: Only free TLV for volume mixers of a widget") Signed-off-by: Colin Ian King Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 65670b2b408c..fbfb1fab88d5 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -514,13 +514,12 @@ static void remove_widget(struct snd_soc_component *comp, == SND_SOC_TPLG_TYPE_MIXER) kfree(kcontrol->tlv.p); - snd_ctl_remove(card, kcontrol); - /* Private value is used as struct soc_mixer_control * for volume mixers or soc_bytes_ext for bytes * controls. */ kfree((void *)kcontrol->private_value); + snd_ctl_remove(card, kcontrol); } kfree(w->kcontrol_news); } From 9e4d59ada4d602e78eee9fb5f898ce61fdddb446 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 16 Dec 2016 18:26:54 +0900 Subject: [PATCH 008/297] ASoC: hdmi-codec: use unsigned type to structure members with bit-field This is a fix for Linux 4.10-rc1. In C language specification, a bit-field is interpreted as a signed or unsigned integer type consisting of the specified number of bits. In GCC manual, the range of a signed bit field of N bits is from -(2^N) / 2 to ((2^N) / 2) - 1 https://www.gnu.org/software/gnu-c-manual/gnu-c-manual.html#Bit-Fields Therefore, when defined as 1 bit-field with signed type, variables can represents -1 and 0. The snd-soc-hdmi-codec module includes a structure which has signed type members with bit-fields. Codes of this module assign 0 and 1 to the members. This seems to result in implementation-dependent behaviours. As of v4.10-rc1 merge window, outside of sound subsystem, this structure is referred by below GPU modules. - tda998x - sti-drm - mediatek-drm-hdmi - msm As long as I review their codes relevant to the structure, the structure members are used just for condition statements and printk formats. My proposal of change is a bit intrusive to the printk formats but this may be acceptable. Totally, it's reasonable to use unsigned type for the structure members. This bug is detected by Sparse, static code analyzer with below warnings. ./include/sound/hdmi-codec.h:39:26: error: dubious one-bit signed bitfield ./include/sound/hdmi-codec.h:40:28: error: dubious one-bit signed bitfield ./include/sound/hdmi-codec.h:41:29: error: dubious one-bit signed bitfield ./include/sound/hdmi-codec.h:42:31: error: dubious one-bit signed bitfield Fixes: 09184118a8ab ("ASoC: hdmi-codec: Add hdmi-codec for external HDMI-encoders") Signed-off-by: Takashi Sakamoto Acked-by: Arnaud Pouliquen Signed-off-by: Mark Brown CC: stable@vger.kernel.org --- include/sound/hdmi-codec.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sound/hdmi-codec.h b/include/sound/hdmi-codec.h index 530c57bdefa0..915c4357945c 100644 --- a/include/sound/hdmi-codec.h +++ b/include/sound/hdmi-codec.h @@ -36,10 +36,10 @@ struct hdmi_codec_daifmt { HDMI_AC97, HDMI_SPDIF, } fmt; - int bit_clk_inv:1; - int frame_clk_inv:1; - int bit_clk_master:1; - int frame_clk_master:1; + unsigned int bit_clk_inv:1; + unsigned int frame_clk_inv:1; + unsigned int bit_clk_master:1; + unsigned int frame_clk_master:1; }; /* From 65dadffddbe44a60f8be9e95f264949ba1e547e9 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Fri, 16 Dec 2016 14:43:39 -0800 Subject: [PATCH 009/297] Input: joydev - remove unused linux/miscdevice.h include This patch remove the inclusion of linux/miscdevice.h for joydev since it does not use miscdevice. Signed-off-by: Corentin Labbe Signed-off-by: Dmitry Torokhov --- drivers/input/joydev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index f3135ae22df4..abd18f31b24f 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include From 41c567a5d7d1a986763e58c3394782813c3bcb03 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Sun, 18 Dec 2016 15:26:12 -0800 Subject: [PATCH 010/297] Input: i8042 - add Pegatron touchpad to noloop table Avoid AUX loopback in Pegatron C15B touchpad, so input subsystem is able to recognize a Synaptics touchpad in the AUX port. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=93791 (Touchpad is not detected on DNS 0801480 notebook (PEGATRON C15B)) Suggested-by: Dmitry Torokhov Signed-off-by: Marcos Paulo de Souza Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-x86ia64io.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h index 73a4e68448fc..381d802fe853 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-x86ia64io.h @@ -211,6 +211,12 @@ static const struct dmi_system_id __initconst i8042_dmi_noloop_table[] = { DMI_MATCH(DMI_PRODUCT_VERSION, "Rev 1"), }, }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "PEGATRON CORPORATION"), + DMI_MATCH(DMI_PRODUCT_NAME, "C15B"), + }, + }, { } }; From 67626c9323027534496d21cf32793121662d03c0 Mon Sep 17 00:00:00 2001 From: Aniroop Mathur Date: Sun, 18 Dec 2016 15:27:16 -0800 Subject: [PATCH 011/297] Input: synaptics_i2c - change msleep to usleep_range for small msecs msleep(1~20) may not do what the caller intends, and will often sleep longer. (~20 ms actual sleep for any value given in the 1~20ms range) This is not the desired behaviour for many cases like device resume time, device suspend time, device enable time, retry logic, etc. Thus, change msleep to usleep_range for precise wakeups. Signed-off-by: Aniroop Mathur Acked-by: Igor Grinberg Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/synaptics_i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/mouse/synaptics_i2c.c b/drivers/input/mouse/synaptics_i2c.c index aa7c5da60800..cb2bf203f4ca 100644 --- a/drivers/input/mouse/synaptics_i2c.c +++ b/drivers/input/mouse/synaptics_i2c.c @@ -29,7 +29,7 @@ * after soft reset, we should wait for 1 ms * before the device becomes operational */ -#define SOFT_RESET_DELAY_MS 3 +#define SOFT_RESET_DELAY_US 3000 /* and after hard reset, we should wait for max 500ms */ #define HARD_RESET_DELAY_MS 500 @@ -311,7 +311,7 @@ static int synaptics_i2c_reset_config(struct i2c_client *client) if (ret) { dev_err(&client->dev, "Unable to reset device\n"); } else { - msleep(SOFT_RESET_DELAY_MS); + usleep_range(SOFT_RESET_DELAY_US, SOFT_RESET_DELAY_US + 100); ret = synaptics_i2c_config(client); if (ret) dev_err(&client->dev, "Unable to config device\n"); From 4a8b3a682be9addff7dbd16371fa8c34103b5c31 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 16 Dec 2016 10:55:49 -0600 Subject: [PATCH 012/297] ASoC: Intel: bytcr_rt5640: fallback mechanism if MCLK is not enabled Commit df1a2776a795 ("ASoC: Intel: bytcr_rt5640: add MCLK support") was merged but the corresponding clock framework patches have not, after being bumped from audio to clock to x86 domains. The missing clock-related patches result in a regression starting with 4.9 with the audio card not being created. Rather than reverting this commit and all following updates already queued up for 4.10, handle run-time dependency on MCLK and fall back to the previous bit-clock mode. This provides the same functionality as in 4.8 for Baytrail devices. On Baytrail-CR most devices remain silent with this fallback but additional patches are needed anyway. As suggested by Mark Brown, the fallback is only allowed with -ENOENT, all other run-time errors, including -EPROBE_DEFER, will stop the probe with no sound card registered. This patch should be applied to -stable as well as ASoC 4.10 fixes Signed-off-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- sound/soc/intel/boards/bytcr_rt5640.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index 507a86a5eafe..e33e4777a65c 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -825,10 +825,20 @@ static int snd_byt_rt5640_mc_probe(struct platform_device *pdev) if ((byt_rt5640_quirk & BYT_RT5640_MCLK_EN) && (is_valleyview())) { priv->mclk = devm_clk_get(&pdev->dev, "pmc_plt_clk_3"); if (IS_ERR(priv->mclk)) { + ret_val = PTR_ERR(priv->mclk); + dev_err(&pdev->dev, - "Failed to get MCLK from pmc_plt_clk_3: %ld\n", - PTR_ERR(priv->mclk)); - return PTR_ERR(priv->mclk); + "Failed to get MCLK from pmc_plt_clk_3: %d\n", + ret_val); + + /* + * Fall back to bit clock usage for -ENOENT (clock not + * available likely due to missing dependencies), bail + * for all other errors, including -EPROBE_DEFER + */ + if (ret_val != -ENOENT) + return ret_val; + byt_rt5640_quirk &= ~BYT_RT5640_MCLK_EN; } } From 73ba39ab9307340dc98ec3622891314bbc09cc2e Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sun, 4 Dec 2016 12:51:53 +0800 Subject: [PATCH 013/297] btrfs: return the actual error value from from btrfs_uuid_tree_iterate In function btrfs_uuid_tree_iterate(), errno is assigned to variable ret on errors. However, it directly returns 0. It may be better to return ret. This patch also removes the warning, because the caller already prints a warning. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188731 Signed-off-by: Pan Bian Reviewed-by: Omar Sandoval [ edited subject ] Signed-off-by: David Sterba --- fs/btrfs/uuid-tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 161342b73ce5..726f928238d0 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -352,7 +352,5 @@ skip: out: btrfs_free_path(path); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); - return 0; + return ret; } From 1cab2a84f470e15ecc8e5143bfe9398c6e888032 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 20 Dec 2016 10:29:12 +0000 Subject: [PATCH 014/297] ASoC: wm_adsp: Don't overrun firmware file buffer when reading region data Protect against corrupt firmware files by ensuring that the length we get for the data in a region actually lies within the available firmware file data buffer. Signed-off-by: Richard Fitzgerald Signed-off-by: Mark Brown --- sound/soc/codecs/wm_adsp.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 593b7d1aed46..d72ccef9e238 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -1551,7 +1551,7 @@ static int wm_adsp_load(struct wm_adsp *dsp) const struct wmfw_region *region; const struct wm_adsp_region *mem; const char *region_name; - char *file, *text; + char *file, *text = NULL; struct wm_adsp_buf *buf; unsigned int reg; int regions = 0; @@ -1700,10 +1700,21 @@ static int wm_adsp_load(struct wm_adsp *dsp) regions, le32_to_cpu(region->len), offset, region_name); + if ((pos + le32_to_cpu(region->len) + sizeof(*region)) > + firmware->size) { + adsp_err(dsp, + "%s.%d: %s region len %d bytes exceeds file length %zu\n", + file, regions, region_name, + le32_to_cpu(region->len), firmware->size); + ret = -EINVAL; + goto out_fw; + } + if (text) { memcpy(text, region->data, le32_to_cpu(region->len)); adsp_info(dsp, "%s: %s\n", file, text); kfree(text); + text = NULL; } if (reg) { @@ -1748,6 +1759,7 @@ out_fw: regmap_async_complete(regmap); wm_adsp_buf_free(&buf_list); release_firmware(firmware); + kfree(text); out: kfree(file); @@ -2233,6 +2245,17 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp) } if (reg) { + if ((pos + le32_to_cpu(blk->len) + sizeof(*blk)) > + firmware->size) { + adsp_err(dsp, + "%s.%d: %s region len %d bytes exceeds file length %zu\n", + file, blocks, region_name, + le32_to_cpu(blk->len), + firmware->size); + ret = -EINVAL; + goto out_fw; + } + buf = wm_adsp_buf_alloc(blk->data, le32_to_cpu(blk->len), &buf_list); From 7dbbf0fa1bf14c17900bb8057986b06db3822239 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Nov 2016 16:17:50 -0800 Subject: [PATCH 015/297] scsi: scsi-mq: Wait for .queue_rq() if necessary Ensure that if scsi-mq is enabled that scsi_internal_device_block() waits until ongoing shost->hostt->queuecommand() calls have finished. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Cc: James Bottomley Cc: Christoph Hellwig Cc: Doug Ledford Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index c35b6de4ca64..9fd9a977c695 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2893,7 +2893,7 @@ scsi_internal_device_block(struct scsi_device *sdev) * request queue. */ if (q->mq_ops) { - blk_mq_stop_hw_queues(q); + blk_mq_quiesce_queue(q); } else { spin_lock_irqsave(q->queue_lock, flags); blk_stop_queue(q); From 7961d53d22375bb9e8ae8063533b9059102ed39d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 20 Dec 2016 08:43:30 -0800 Subject: [PATCH 016/297] scsi: qedi: fix build, depends on UIO Fix build of SCSI qedi driver. It uses uio interfaces so it should depend on UIO. ERROR: "uio_unregister_device" [drivers/scsi/qedi/qedi.ko] undefined! ERROR: "uio_event_notify" [drivers/scsi/qedi/qedi.ko] undefined! ERROR: "__uio_register_device" [drivers/scsi/qedi/qedi.ko] undefined! Signed-off-by: Randy Dunlap Cc: QLogic-Storage-Upstream@cavium.com Signed-off-by: Martin K. Petersen --- drivers/scsi/qedi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qedi/Kconfig b/drivers/scsi/qedi/Kconfig index 23ca8a274586..21331453db7b 100644 --- a/drivers/scsi/qedi/Kconfig +++ b/drivers/scsi/qedi/Kconfig @@ -1,6 +1,6 @@ config QEDI tristate "QLogic QEDI 25/40/100Gb iSCSI Initiator Driver Support" - depends on PCI && SCSI + depends on PCI && SCSI && UIO depends on QED select SCSI_ISCSI_ATTRS select QED_LL2 From 6c5d5cfbe3c59429e6d6f66477a7609aacf69751 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 20 Dec 2016 19:14:34 +0800 Subject: [PATCH 017/297] netfilter: ipt_CLUSTERIP: check duplicate config when initializing Now when adding an ipt_CLUSTERIP rule, it only checks duplicate config in clusterip_config_find_get(). But after that, there may be still another thread to insert a config with the same ip, then it leaves proc_create_data to do duplicate check. It's more reasonable to check duplicate config by ipt_CLUSTERIP itself, instead of checking it by proc fs duplicate file check. Before, when proc fs allowed duplicate name files in a directory, It could even crash kernel because of use-after-free. This patch is to check duplicate config under the protection of clusterip net lock when initializing a new config and correct the return err. Note that it also moves proc file node creation after adding new config, as proc_create_data may sleep, it couldn't be called under the clusterip_net lock. clusterip_config_find_get returns NULL if c->pde is null to make sure it can't be used until the proc file node creation is done. Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 34 ++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 21db00d0362b..a6b8c1a4102b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -144,7 +144,7 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) rcu_read_lock_bh(); c = __clusterip_config_find(net, clusterip); if (c) { - if (unlikely(!atomic_inc_not_zero(&c->refcount))) + if (!c->pde || unlikely(!atomic_inc_not_zero(&c->refcount))) c = NULL; else if (entry) atomic_inc(&c->entries); @@ -166,14 +166,15 @@ clusterip_config_init_nodelist(struct clusterip_config *c, static struct clusterip_config * clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, - struct net_device *dev) + struct net_device *dev) { + struct net *net = dev_net(dev); struct clusterip_config *c; - struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id); + struct clusterip_net *cn = net_generic(net, clusterip_net_id); c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) - return NULL; + return ERR_PTR(-ENOMEM); c->dev = dev; c->clusterip = ip; @@ -185,6 +186,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, atomic_set(&c->refcount, 1); atomic_set(&c->entries, 1); + spin_lock_bh(&cn->lock); + if (__clusterip_config_find(net, ip)) { + spin_unlock_bh(&cn->lock); + kfree(c); + + return ERR_PTR(-EBUSY); + } + + list_add_rcu(&c->list, &cn->configs); + spin_unlock_bh(&cn->lock); + #ifdef CONFIG_PROC_FS { char buffer[16]; @@ -195,16 +207,16 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, cn->procdir, &clusterip_proc_fops, c); if (!c->pde) { + spin_lock_bh(&cn->lock); + list_del_rcu(&c->list); + spin_unlock_bh(&cn->lock); kfree(c); - return NULL; + + return ERR_PTR(-ENOMEM); } } #endif - spin_lock_bh(&cn->lock); - list_add_rcu(&c->list, &cn->configs); - spin_unlock_bh(&cn->lock); - return c; } @@ -410,9 +422,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) config = clusterip_config_init(cipinfo, e->ip.dst.s_addr, dev); - if (!config) { + if (IS_ERR(config)) { dev_put(dev); - return -ENOMEM; + return PTR_ERR(config); } dev_mc_add(config->dev, config->clustermac); } From b6fc513da50c5dbc457a8ad6b58b046a6a68fd9d Mon Sep 17 00:00:00 2001 From: Pavel Rojtberg Date: Tue, 27 Dec 2016 11:44:51 -0800 Subject: [PATCH 018/297] Input: xpad - use correct product id for x360w controllers currently the controllers get the same product id as the wireless receiver. However the controllers actually have their own product id. The patch makes the driver expose the same product id as the windows driver. This improves compatibility when running applications with WINE. see https://github.com/paroj/xpad/issues/54 Signed-off-by: Pavel Rojtberg Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 6d9499658671..c7d5b2b643d1 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1377,6 +1377,12 @@ static int xpad_init_input(struct usb_xpad *xpad) input_dev->name = xpad->name; input_dev->phys = xpad->phys; usb_to_input_id(xpad->udev, &input_dev->id); + + if (xpad->xtype == XTYPE_XBOX360W) { + /* x360w controllers and the receiver have different ids */ + input_dev->id.product = 0x02a1; + } + input_dev->dev.parent = &xpad->intf->dev; input_set_drvdata(input_dev, xpad); From d7ddad0acc4add42567f7879b116a0b9eea31860 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 27 Dec 2016 11:32:55 -0800 Subject: [PATCH 019/297] Input: synaptics-rmi4 - fix F03 build error when serio is module Since F03 is a boolean, "depends" on symbols that can be modules do not work quite right. We can enable F03 if SERIO is built-in or if both RMI core and SERIO core are modules. If SERIO core is module, but RMI is built-in, we'll get: drivers/built-in.o: In function `rmi_f03_attention': rmi_f03.c:(.text+0xf8ef8): undefined reference to `serio_interrupt' rmi_f03.c:(.text+0xf8fbd): undefined reference to `serio_interrupt' drivers/built-in.o: In function `rmi_f03_remove': rmi_f03.c:(.text+0xf9082): undefined reference to `serio_unregister_port' drivers/built-in.o: In function `rmi_f03_probe': rmi_f03.c:(.text+0xf9260): undefined reference to `__serio_register_port' Reported-by: Randy Dunlap Reviewed-by: Guenter Roeck Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/input/rmi4/Kconfig b/drivers/input/rmi4/Kconfig index 30cc627a4f45..8993983e3fe4 100644 --- a/drivers/input/rmi4/Kconfig +++ b/drivers/input/rmi4/Kconfig @@ -41,7 +41,8 @@ config RMI4_SMB config RMI4_F03 bool "RMI4 Function 03 (PS2 Guest)" - depends on RMI4_CORE && SERIO + depends on RMI4_CORE + depends on SERIO=y || RMI4_CORE=SERIO help Say Y here if you want to add support for RMI4 function 03. From 9396c9cb0d9534ca67bda8a34b2413a2ca1c67e9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 22 Dec 2016 15:03:50 +0900 Subject: [PATCH 020/297] perf sched timehist: Show total scheduling time Show length of analyzed sample time and rate of idle task running. This also takes care of time range given by --time option. $ perf sched timehist -sI | tail Samples do not have callchains. Idle stats: CPU 0 idle for 930.316 msec ( 92.93%) CPU 1 idle for 963.614 msec ( 96.25%) CPU 2 idle for 885.482 msec ( 88.45%) CPU 3 idle for 938.635 msec ( 93.76%) Total number of unique tasks: 118 Total number of context switches: 2337 Total run time (msec): 3718.048 Total scheduling time (msec): 1001.131 (x 4) Suggested-by: David Ahern Signed-off-by: Namhyung Kim Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20161222060350.17655-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index d53e706a6f17..5b134b0d1ff3 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -209,6 +209,7 @@ struct perf_sched { u64 skipped_samples; const char *time_str; struct perf_time_interval ptime; + struct perf_time_interval hist_time; }; /* per thread run time data */ @@ -2460,6 +2461,11 @@ static int timehist_sched_change_event(struct perf_tool *tool, timehist_print_sample(sched, sample, &al, thread, t); out: + if (sched->hist_time.start == 0 && t >= ptime->start) + sched->hist_time.start = t; + if (ptime->end == 0 || t <= ptime->end) + sched->hist_time.end = t; + if (tr) { /* time of this sched_switch event becomes last time task seen */ tr->last_time = sample->time; @@ -2624,6 +2630,7 @@ static void timehist_print_summary(struct perf_sched *sched, struct thread *t; struct thread_runtime *r; int i; + u64 hist_time = sched->hist_time.end - sched->hist_time.start; memset(&totals, 0, sizeof(totals)); @@ -2665,7 +2672,7 @@ static void timehist_print_summary(struct perf_sched *sched, totals.sched_count += r->run_stats.n; printf(" CPU %2d idle for ", i); print_sched_time(r->total_run_time, 6); - printf(" msec\n"); + printf(" msec (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time); } else printf(" CPU %2d idle entire time window\n", i); } @@ -2701,12 +2708,16 @@ static void timehist_print_summary(struct perf_sched *sched, printf("\n" " Total number of unique tasks: %" PRIu64 "\n" - "Total number of context switches: %" PRIu64 "\n" - " Total run time (msec): ", + "Total number of context switches: %" PRIu64 "\n", totals.task_count, totals.sched_count); + printf(" Total run time (msec): "); print_sched_time(totals.total_run_time, 2); printf("\n"); + + printf(" Total scheduling time (msec): "); + print_sched_time(hist_time, 2); + printf(" (x %d)\n", sched->max_cpu); } typedef int (*sched_handler)(struct perf_tool *tool, From ee12996c9d392ec61241ab6c74d257bc2122e0bc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2016 21:49:17 -0300 Subject: [PATCH 021/297] samples/bpf sock_example: Avoid getting ethhdr from two includes To avoid the following build failure on Alpine Linux 3.4, that has clang-3.8 with the bpf target: HOSTCC samples/bpf/sock_example.o In file included from /usr/include/net/ethernet.h:10:0, from /git/linux/samples/bpf/sock_example.h:7, from /git/linux/samples/bpf/sock_example.c:30: /usr/include/netinet/if_ether.h:96:8: error: redefinition of 'struct ethhdr' struct ethhdr { ^ In file included from /git/linux/samples/bpf/sock_example.c:26:0: ./usr/include/linux/if_ether.h:144:8: note: originally defined here struct ethhdr { ^ scripts/Makefile.host:124: recipe for target 'samples/bpf/sock_example.o' failed make[2]: *** [samples/bpf/sock_example.o] Error 1 /git/linux/Makefile:1658: recipe for target 'samples/bpf/' failed So include net/if_ether.h for the needs of sock_example.h, using the same include that sock_example.c uses. Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Joe Stringer Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-m9avekl1b651qe1r1zd5tzz9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- samples/bpf/sock_example.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h index 09f7fe7e5fd7..d8014065d479 100644 --- a/samples/bpf/sock_example.h +++ b/samples/bpf/sock_example.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include From abfb7b686a3e5be27bf81db62f9c5c895b76f5d1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 24 Dec 2016 13:59:23 +0000 Subject: [PATCH 022/297] efi/libstub/arm*: Pass latest memory map to the kernel As reported by James Morse, the current libstub code involving the annotated memory map only works somewhat correctly by accident, due to the fact that a pool allocation happens to be reused immediately, retaining its former contents on most implementations of the UEFI boot services. Instead of juggling memory maps, which makes the code more complex than it needs to be, simply put placeholder values into the FDT for the memory map parameters, and only write the actual values after ExitBootServices() has been called. Reported-by: James Morse Signed-off-by: Ard Biesheuvel Cc: Cc: Jeffrey Hugo Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Cc: linux-efi@vger.kernel.org Fixes: ed9cc156c42f ("efi/libstub: Use efi_exit_boot_services() in FDT") Link: http://lkml.kernel.org/r/1482587963-20183-2-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/libstub/efistub.h | 8 --- drivers/firmware/efi/libstub/fdt.c | 87 +++++++++++++++++--------- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index b98824e3800a..0e2a96b12cb3 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -39,14 +39,6 @@ efi_status_t efi_file_close(void *handle); unsigned long get_dram_base(efi_system_table_t *sys_table_arg); -efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, - unsigned long orig_fdt_size, - void *fdt, int new_fdt_size, char *cmdline_ptr, - u64 initrd_addr, u64 initrd_size, - efi_memory_desc_t *memory_map, - unsigned long map_size, unsigned long desc_size, - u32 desc_ver); - efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, void *handle, unsigned long *new_fdt_addr, diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index a6a93116a8f0..921dfa047202 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -16,13 +16,10 @@ #include "efistub.h" -efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, - unsigned long orig_fdt_size, - void *fdt, int new_fdt_size, char *cmdline_ptr, - u64 initrd_addr, u64 initrd_size, - efi_memory_desc_t *memory_map, - unsigned long map_size, unsigned long desc_size, - u32 desc_ver) +static efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, + unsigned long orig_fdt_size, + void *fdt, int new_fdt_size, char *cmdline_ptr, + u64 initrd_addr, u64 initrd_size) { int node, num_rsv; int status; @@ -101,25 +98,23 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, if (status) goto fdt_set_fail; - fdt_val64 = cpu_to_fdt64((u64)(unsigned long)memory_map); + fdt_val64 = U64_MAX; /* placeholder */ status = fdt_setprop(fdt, node, "linux,uefi-mmap-start", &fdt_val64, sizeof(fdt_val64)); if (status) goto fdt_set_fail; - fdt_val32 = cpu_to_fdt32(map_size); + fdt_val32 = U32_MAX; /* placeholder */ status = fdt_setprop(fdt, node, "linux,uefi-mmap-size", &fdt_val32, sizeof(fdt_val32)); if (status) goto fdt_set_fail; - fdt_val32 = cpu_to_fdt32(desc_size); status = fdt_setprop(fdt, node, "linux,uefi-mmap-desc-size", &fdt_val32, sizeof(fdt_val32)); if (status) goto fdt_set_fail; - fdt_val32 = cpu_to_fdt32(desc_ver); status = fdt_setprop(fdt, node, "linux,uefi-mmap-desc-ver", &fdt_val32, sizeof(fdt_val32)); if (status) @@ -148,6 +143,43 @@ fdt_set_fail: return EFI_LOAD_ERROR; } +static efi_status_t update_fdt_memmap(void *fdt, struct efi_boot_memmap *map) +{ + int node = fdt_path_offset(fdt, "/chosen"); + u64 fdt_val64; + u32 fdt_val32; + int err; + + if (node < 0) + return EFI_LOAD_ERROR; + + fdt_val64 = cpu_to_fdt64((unsigned long)*map->map); + err = fdt_setprop_inplace(fdt, node, "linux,uefi-mmap-start", + &fdt_val64, sizeof(fdt_val64)); + if (err) + return EFI_LOAD_ERROR; + + fdt_val32 = cpu_to_fdt32(*map->map_size); + err = fdt_setprop_inplace(fdt, node, "linux,uefi-mmap-size", + &fdt_val32, sizeof(fdt_val32)); + if (err) + return EFI_LOAD_ERROR; + + fdt_val32 = cpu_to_fdt32(*map->desc_size); + err = fdt_setprop_inplace(fdt, node, "linux,uefi-mmap-desc-size", + &fdt_val32, sizeof(fdt_val32)); + if (err) + return EFI_LOAD_ERROR; + + fdt_val32 = cpu_to_fdt32(*map->desc_ver); + err = fdt_setprop_inplace(fdt, node, "linux,uefi-mmap-desc-ver", + &fdt_val32, sizeof(fdt_val32)); + if (err) + return EFI_LOAD_ERROR; + + return EFI_SUCCESS; +} + #ifndef EFI_FDT_ALIGN #define EFI_FDT_ALIGN EFI_PAGE_SIZE #endif @@ -243,20 +275,10 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, goto fail; } - /* - * Now that we have done our final memory allocation (and free) - * we can get the memory map key needed for - * exit_boot_services(). - */ - status = efi_get_memory_map(sys_table, &map); - if (status != EFI_SUCCESS) - goto fail_free_new_fdt; - status = update_fdt(sys_table, (void *)fdt_addr, fdt_size, (void *)*new_fdt_addr, new_fdt_size, - cmdline_ptr, initrd_addr, initrd_size, - memory_map, map_size, desc_size, desc_ver); + cmdline_ptr, initrd_addr, initrd_size); /* Succeeding the first time is the expected case. */ if (status == EFI_SUCCESS) @@ -266,20 +288,16 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, /* * We need to allocate more space for the new * device tree, so free existing buffer that is - * too small. Also free memory map, as we will need - * to get new one that reflects the free/alloc we do - * on the device tree buffer. + * too small. */ efi_free(sys_table, new_fdt_size, *new_fdt_addr); - sys_table->boottime->free_pool(memory_map); new_fdt_size += EFI_PAGE_SIZE; } else { pr_efi_err(sys_table, "Unable to construct new device tree.\n"); - goto fail_free_mmap; + goto fail_free_new_fdt; } } - sys_table->boottime->free_pool(memory_map); priv.runtime_map = runtime_map; priv.runtime_entry_count = &runtime_entry_count; status = efi_exit_boot_services(sys_table, handle, &map, &priv, @@ -288,6 +306,16 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, if (status == EFI_SUCCESS) { efi_set_virtual_address_map_t *svam; + status = update_fdt_memmap((void *)*new_fdt_addr, &map); + if (status != EFI_SUCCESS) { + /* + * The kernel won't get far without the memory map, but + * may still be able to print something meaningful so + * return success here. + */ + return EFI_SUCCESS; + } + /* Install the new virtual address map */ svam = sys_table->runtime->set_virtual_address_map; status = svam(runtime_entry_count * desc_size, desc_size, @@ -319,9 +347,6 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, pr_efi_err(sys_table, "Exit boot services failed.\n"); -fail_free_mmap: - sys_table->boottime->free_pool(memory_map); - fail_free_new_fdt: efi_free(sys_table, new_fdt_size, *new_fdt_addr); From b6f4c66704b875aba9b8c912532323e3cc89824c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 28 Dec 2016 10:47:13 -0300 Subject: [PATCH 023/297] samples/bpf trace_output_user: Remove duplicate sys/ioctl.h include Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Joe Stringer Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-3awp0nv8tpnblatojmwjww7z@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- samples/bpf/trace_output_user.c | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index f4fa6af22def..ccca1e348017 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include From 88b333b0ed790f9433ff542b163bf972953b74d3 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 20 Dec 2016 08:54:29 -0700 Subject: [PATCH 024/297] drm/msm: Ensure that the hardware write pointer is valid Currently the value written to CP_RB_WPTR is calculated on the fly as (rb->next - rb->start). But as the code is designed rb->next is wrapped before writing the commands so if a series of commands happened to fit perfectly in the ringbuffer, rb->next would end up being equal to rb->size / 4 and thus result in an out of bounds address to CP_RB_WPTR. The easiest way to fix this is to mask WPTR when writing it to the hardware; it makes the hardware happy and the rest of the ringbuffer math appears to work and there isn't any point in upsetting anything. Signed-off-by: Jordan Crouse [squash in is_power_of_2() check] Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 9 ++++++++- drivers/gpu/drm/msm/msm_ringbuffer.c | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index a18126150e11..14ff87686a36 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -213,7 +213,14 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, void adreno_flush(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - uint32_t wptr = get_wptr(gpu->rb); + uint32_t wptr; + + /* + * Mask wptr value that we calculate to fit in the HW range. This is + * to account for the possibility that the last command fit exactly into + * the ringbuffer and rb->next hasn't wrapped to zero yet + */ + wptr = get_wptr(gpu->rb) & ((gpu->rb->size / 4) - 1); /* ensure writes to ringbuffer have hit system memory: */ mb(); diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index f326cf6a32e6..67b34e069abf 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -23,7 +23,8 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size) struct msm_ringbuffer *ring; int ret; - size = ALIGN(size, 4); /* size should be dword aligned */ + if (WARN_ON(!is_power_of_2(size))) + return ERR_PTR(-EINVAL); ring = kzalloc(sizeof(*ring), GFP_KERNEL); if (!ring) { From 6490abc4bc35fa4f3bdb9c7e49096943c50e29ea Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 20 Dec 2016 08:54:30 -0700 Subject: [PATCH 025/297] drm/msm: Put back the vaddr in submit_reloc() The error cases in submit_reloc() need to put back the virtual address of the bo before failling. Add a single failure path for the function. Signed-off-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem_submit.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 166e84e4f0d4..b6411ea6ed17 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -290,7 +290,7 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob { uint32_t i, last_offset = 0; uint32_t *ptr; - int ret; + int ret = 0; if (offset % 4) { DRM_ERROR("non-aligned cmdstream buffer: %u\n", offset); @@ -318,12 +318,13 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob ret = copy_from_user(&submit_reloc, userptr, sizeof(submit_reloc)); if (ret) - return -EFAULT; + goto out; if (submit_reloc.submit_offset % 4) { DRM_ERROR("non-aligned reloc offset: %u\n", submit_reloc.submit_offset); - return -EINVAL; + ret = -EINVAL; + goto out; } /* offset in dwords: */ @@ -332,12 +333,13 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob if ((off >= (obj->base.size / 4)) || (off < last_offset)) { DRM_ERROR("invalid offset %u at reloc %u\n", off, i); - return -EINVAL; + ret = -EINVAL; + goto out; } ret = submit_bo(submit, submit_reloc.reloc_idx, NULL, &iova, &valid); if (ret) - return ret; + goto out; if (valid) continue; @@ -354,9 +356,10 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob last_offset = off; } +out: msm_gem_put_vaddr_locked(&obj->base); - return 0; + return ret; } static void submit_cleanup(struct msm_gem_submit *submit) From a6cb3b864b21b7345f824a4faa12b723c8aaf099 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 20 Dec 2016 08:54:31 -0700 Subject: [PATCH 026/297] drm/msm: Verify that MSM_SUBMIT_BO_FLAGS are set For every submission buffer object one of MSM_SUBMIT_BO_WRITE and MSM_SUBMIT_BO_READ must be set (and nothing else). If we allowed zero then the buffer object would never get queued to be unreferenced. Signed-off-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem_submit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index b6411ea6ed17..489676568a10 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -106,7 +106,8 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, pagefault_disable(); } - if (submit_bo.flags & ~MSM_SUBMIT_BO_FLAGS) { + if ((submit_bo.flags & ~MSM_SUBMIT_BO_FLAGS) || + !(submit_bo.flags & MSM_SUBMIT_BO_FLAGS)) { DRM_ERROR("invalid flags: %x\n", submit_bo.flags); ret = -EINVAL; goto out_unlock; From abc8d5832fd142d565fbfca163348f33b08bc1fe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Dec 2016 10:08:14 +0100 Subject: [PATCH 027/297] gpio: mxs: remove __init annotation Building with an old toolchain, I ran into this warning: WARNING: vmlinux.o(.text+0x63eef0): Section mismatch in reference from the function mxs_gpio_probe() to the function .init.text:mxs_gpio_init_gc() Clearly the annotation is wrong, since the function is called from the non-init probe, so let's remove it. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Walleij --- drivers/gpio/gpio-mxs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-mxs.c b/drivers/gpio/gpio-mxs.c index 1e8fde8cb803..2292742eac8f 100644 --- a/drivers/gpio/gpio-mxs.c +++ b/drivers/gpio/gpio-mxs.c @@ -205,7 +205,7 @@ static int mxs_gpio_set_wake_irq(struct irq_data *d, unsigned int enable) return 0; } -static int __init mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base) +static int mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base) { struct irq_chip_generic *gc; struct irq_chip_type *ct; From 5018ada69a04c8ac21d74bd682fceb8e42dc0f96 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 19 Dec 2016 18:29:23 +0100 Subject: [PATCH 028/297] gpio: Move freeing of GPIO hogs before numbing of the device When removing a gpiochip that uses GPIO hogging (e.g. by unloading the chip's DT overlay), a warning is printed: gpio gpiochip8: REMOVING GPIOCHIP WITH GPIOS STILL REQUESTED This happens because gpiochip_free_hogs() is called after the gdev->chip pointer is reset to NULL. Hence __gpiod_free() cannot determine the chip in use, and cannot clear flags nor call the optional chip-specific .free() callback. Move the call to gpiochip_free_hogs() up to fix this. Cc: stable@vger.kernel.org Fixes: ff2b135922992756 ("gpio: make the gpiochip a real device") Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index f4c26c7826cd..86bf3b84ada5 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1317,12 +1317,12 @@ void gpiochip_remove(struct gpio_chip *chip) /* FIXME: should the legacy sysfs handling be moved to gpio_device? */ gpiochip_sysfs_unregister(gdev); + gpiochip_free_hogs(chip); /* Numb the device, cancelling all outstanding operations */ gdev->chip = NULL; gpiochip_irqchip_remove(chip); acpi_gpiochip_remove(chip); gpiochip_remove_pin_ranges(chip); - gpiochip_free_hogs(chip); of_gpiochip_remove(chip); /* * We accept no more calls into the driver from this point, so From 07825f0acd85dd8b7481d5ef0eb024b05364d892 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Dec 2016 17:44:15 +0800 Subject: [PATCH 029/297] crypto: aesni - Fix failure when built-in with modular pcbc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If aesni is built-in but pcbc is built as a module, then aesni will fail completely because when it tries to register the pcbc variant of aes the pcbc template is not available. This patch fixes this by modifying the pcbc presence test so that if aesni is built-in then pcbc must also be built-in for it to be used by aesni. Fixes: 85671860caac ("crypto: aesni - Convert to skcipher") Reported-by: Stephan Müller Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 31c34ee131f3..6ef688a1ef3e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1020,7 +1020,8 @@ struct { const char *basename; struct simd_skcipher_alg *simd; } aesni_simd_skciphers2[] = { -#if IS_ENABLED(CONFIG_CRYPTO_PCBC) +#if (defined(MODULE) && IS_ENABLED(CONFIG_CRYPTO_PCBC)) || \ + IS_BUILTIN(CONFIG_CRYPTO_PCBC) { .algname = "pcbc(aes)", .drvname = "pcbc-aes-aesni", From 63447646ac657fde00bb658ce21a3431940ae0ad Mon Sep 17 00:00:00 2001 From: Loic Pallardy Date: Thu, 15 Dec 2016 15:49:56 +0100 Subject: [PATCH 030/297] rpmsg: virtio_rpmsg_bus: fix channel creation Since commit 4dffed5b3ac796b ("rpmsg: Name rpmsg devices based on channel id"), it is no more possible for a firmware to register twice a service (on different endpoints). rpmsg_register_device function is failing when calling device_add for the second time as second device has the same name as first one already register. It is because name is based only on service name and so is not more unique. Previously name was unique thanks to the use of rpmsg_dev_index. This patch adds destination and source endpoint numbers device name to create an unique identifier. Fixes: 4dffed5b3ac7 ("rpmsg: Name rpmsg devices based on channel id") Acked-by: Peter Griffin Signed-off-by: Loic Pallardy [bjorn: flipped name and address in device name] Signed-off-by: Bjorn Andersson --- drivers/rpmsg/rpmsg_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c index a79cb5a9e5f2..1cfb775e8e82 100644 --- a/drivers/rpmsg/rpmsg_core.c +++ b/drivers/rpmsg/rpmsg_core.c @@ -453,8 +453,8 @@ int rpmsg_register_device(struct rpmsg_device *rpdev) struct device *dev = &rpdev->dev; int ret; - dev_set_name(&rpdev->dev, "%s:%s", - dev_name(dev->parent), rpdev->id.name); + dev_set_name(&rpdev->dev, "%s.%s.%d.%d", dev_name(dev->parent), + rpdev->id.name, rpdev->src, rpdev->dst); rpdev->dev.bus = &rpmsg_bus; rpdev->dev.release = rpmsg_release_device; From c81c0e0710f031cb09eb7cbf0e75e6754d1d8346 Mon Sep 17 00:00:00 2001 From: Loic Pallardy Date: Wed, 14 Dec 2016 16:11:00 +0100 Subject: [PATCH 031/297] remoteproc: fix vdev reference management Commit 2b45cef5868a ("remoteproc: Further extend the vdev life cycle") extends kref support for vdev management. It introduces a regression when following sequence is executed: rproc_boot --> rproc_shutdown --> rproc_boot Second rproc_boot call crashes on register_virtio_device as device is already existing. Issue is previous vdev is never released when rproc is stop because associated refcount is too high. kref_get introduces is not needed as kref_init already initializes krefcount to 1 because it considers associated variable as used. This introduces a misalignment between kref_get and kref_put calls. Fixes: 2b45cef5868a ("remoteproc: Further extend the vdev life cycle") Signed-off-by: Loic Pallardy Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 9a507e77eced..feb24c43d4c7 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -396,9 +396,6 @@ static int rproc_handle_vdev(struct rproc *rproc, struct fw_rsc_vdev *rsc, goto unwind_vring_allocations; } - /* track the rvdevs list reference */ - kref_get(&rvdev->refcount); - list_add_tail(&rvdev->node, &rproc->rvdevs); rproc_add_subdev(rproc, &rvdev->subdev, From a0c10687ec9506b5e14fe3dd47832a77f2f2500c Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 30 Dec 2016 03:21:38 -0800 Subject: [PATCH 032/297] Revert "remoteproc: Merge table_ptr and cached_table pointers" Following any fw_rsc_vdev entries in the resource table are two variable length arrays, the first one reference vring resources and the second one is the virtio config space. The virtio config space is used by virtio to communicate status and configuration changes and must as such be shared with the remote. The reverted commit incorrectly made any changes to the virtio config space only affect the local copy, in an attempt to allowing memory protection of the shared resource table. This reverts commit cda8529346935fc86f476999ac4fbfe4e17abf11. Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 26 ++++++++++++++++---------- include/linux/remoteproc.h | 4 +++- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index feb24c43d4c7..90b05c72186c 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -886,13 +886,15 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw) /* * Create a copy of the resource table. When a virtio device starts * and calls vring_new_virtqueue() the address of the allocated vring - * will be stored in the table_ptr. Before the device is started, - * table_ptr will be copied into device memory. + * will be stored in the cached_table. Before the device is started, + * cached_table will be copied into device memory. */ - rproc->table_ptr = kmemdup(table, tablesz, GFP_KERNEL); - if (!rproc->table_ptr) + rproc->cached_table = kmemdup(table, tablesz, GFP_KERNEL); + if (!rproc->cached_table) goto clean_up; + rproc->table_ptr = rproc->cached_table; + /* reset max_notifyid */ rproc->max_notifyid = -1; @@ -911,16 +913,18 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw) } /* - * The starting device has been given the rproc->table_ptr as the + * The starting device has been given the rproc->cached_table as the * resource table. The address of the vring along with the other - * allocated resources (carveouts etc) is stored in table_ptr. + * allocated resources (carveouts etc) is stored in cached_table. * In order to pass this information to the remote device we must copy * this information to device memory. We also update the table_ptr so * that any subsequent changes will be applied to the loaded version. */ loaded_table = rproc_find_loaded_rsc_table(rproc, fw); - if (loaded_table) - memcpy(loaded_table, rproc->table_ptr, tablesz); + if (loaded_table) { + memcpy(loaded_table, rproc->cached_table, tablesz); + rproc->table_ptr = loaded_table; + } /* power up the remote processor */ ret = rproc->ops->start(rproc); @@ -948,7 +952,8 @@ stop_rproc: clean_up_resources: rproc_resource_cleanup(rproc); clean_up: - kfree(rproc->table_ptr); + kfree(rproc->cached_table); + rproc->cached_table = NULL; rproc->table_ptr = NULL; rproc_disable_iommu(rproc); @@ -1182,7 +1187,8 @@ void rproc_shutdown(struct rproc *rproc) rproc_disable_iommu(rproc); /* Free the copy of the resource table */ - kfree(rproc->table_ptr); + kfree(rproc->cached_table); + rproc->cached_table = NULL; rproc->table_ptr = NULL; /* if in crash state, unlock crash handler */ diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index e2f3a3281d8f..8265d351c9f0 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -408,7 +408,8 @@ enum rproc_crash_type { * @crash_comp: completion used to sync crash handler and the rproc reload * @recovery_disabled: flag that state if recovery was disabled * @max_notifyid: largest allocated notify id. - * @table_ptr: our copy of the resource table + * @table_ptr: pointer to the resource table in effect + * @cached_table: copy of the resource table * @has_iommu: flag to indicate if remote processor is behind an MMU */ struct rproc { @@ -440,6 +441,7 @@ struct rproc { bool recovery_disabled; int max_notifyid; struct resource_table *table_ptr; + struct resource_table *cached_table; bool has_iommu; bool auto_boot; }; From 570b90fa230b8021f51a67fab2245fe8df6fe37d Mon Sep 17 00:00:00 2001 From: Andrew Lutomirski Date: Mon, 12 Dec 2016 12:55:55 -0800 Subject: [PATCH 033/297] orinoco: Use shash instead of ahash for MIC calculations Eric Biggers pointed out that the orinoco driver pointed scatterlists at the stack. Fix it by switching from ahash to shash. The result should be simpler, faster, and more correct. kvalo: cherry picked from commit 1fef293b8a9850cfa124a53c1d8878d355010403 as I accidentally applied this patch to wireless-drivers-next when I was supposed to apply this wireless-drivers Cc: stable@vger.kernel.org # 4.9 only Reported-by: Eric Biggers Signed-off-by: Andy Lutomirski Signed-off-by: Kalle Valo --- drivers/net/wireless/intersil/orinoco/mic.c | 44 +++++++++++-------- drivers/net/wireless/intersil/orinoco/mic.h | 3 +- .../net/wireless/intersil/orinoco/orinoco.h | 4 +- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/drivers/net/wireless/intersil/orinoco/mic.c b/drivers/net/wireless/intersil/orinoco/mic.c index bc7397d709d3..08bc7822f820 100644 --- a/drivers/net/wireless/intersil/orinoco/mic.c +++ b/drivers/net/wireless/intersil/orinoco/mic.c @@ -16,7 +16,7 @@ /********************************************************************/ int orinoco_mic_init(struct orinoco_private *priv) { - priv->tx_tfm_mic = crypto_alloc_ahash("michael_mic", 0, + priv->tx_tfm_mic = crypto_alloc_shash("michael_mic", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->tx_tfm_mic)) { printk(KERN_DEBUG "orinoco_mic_init: could not allocate " @@ -25,7 +25,7 @@ int orinoco_mic_init(struct orinoco_private *priv) return -ENOMEM; } - priv->rx_tfm_mic = crypto_alloc_ahash("michael_mic", 0, + priv->rx_tfm_mic = crypto_alloc_shash("michael_mic", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->rx_tfm_mic)) { printk(KERN_DEBUG "orinoco_mic_init: could not allocate " @@ -40,17 +40,16 @@ int orinoco_mic_init(struct orinoco_private *priv) void orinoco_mic_free(struct orinoco_private *priv) { if (priv->tx_tfm_mic) - crypto_free_ahash(priv->tx_tfm_mic); + crypto_free_shash(priv->tx_tfm_mic); if (priv->rx_tfm_mic) - crypto_free_ahash(priv->rx_tfm_mic); + crypto_free_shash(priv->rx_tfm_mic); } -int orinoco_mic(struct crypto_ahash *tfm_michael, u8 *key, +int orinoco_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *da, u8 *sa, u8 priority, u8 *data, size_t data_len, u8 *mic) { - AHASH_REQUEST_ON_STACK(req, tfm_michael); - struct scatterlist sg[2]; + SHASH_DESC_ON_STACK(desc, tfm_michael); u8 hdr[ETH_HLEN + 2]; /* size of header + padding */ int err; @@ -67,18 +66,27 @@ int orinoco_mic(struct crypto_ahash *tfm_michael, u8 *key, hdr[ETH_ALEN * 2 + 2] = 0; hdr[ETH_ALEN * 2 + 3] = 0; - /* Use scatter gather to MIC header and data in one go */ - sg_init_table(sg, 2); - sg_set_buf(&sg[0], hdr, sizeof(hdr)); - sg_set_buf(&sg[1], data, data_len); + desc->tfm = tfm_michael; + desc->flags = 0; - if (crypto_ahash_setkey(tfm_michael, key, MIC_KEYLEN)) - return -1; + err = crypto_shash_setkey(tfm_michael, key, MIC_KEYLEN); + if (err) + return err; + + err = crypto_shash_init(desc); + if (err) + return err; + + err = crypto_shash_update(desc, hdr, sizeof(hdr)); + if (err) + return err; + + err = crypto_shash_update(desc, data, data_len); + if (err) + return err; + + err = crypto_shash_final(desc, mic); + shash_desc_zero(desc); - ahash_request_set_tfm(req, tfm_michael); - ahash_request_set_callback(req, 0, NULL, NULL); - ahash_request_set_crypt(req, sg, mic, data_len + sizeof(hdr)); - err = crypto_ahash_digest(req); - ahash_request_zero(req); return err; } diff --git a/drivers/net/wireless/intersil/orinoco/mic.h b/drivers/net/wireless/intersil/orinoco/mic.h index ce731d05cc98..e8724e889219 100644 --- a/drivers/net/wireless/intersil/orinoco/mic.h +++ b/drivers/net/wireless/intersil/orinoco/mic.h @@ -6,6 +6,7 @@ #define _ORINOCO_MIC_H_ #include +#include #define MICHAEL_MIC_LEN 8 @@ -15,7 +16,7 @@ struct crypto_ahash; int orinoco_mic_init(struct orinoco_private *priv); void orinoco_mic_free(struct orinoco_private *priv); -int orinoco_mic(struct crypto_ahash *tfm_michael, u8 *key, +int orinoco_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *da, u8 *sa, u8 priority, u8 *data, size_t data_len, u8 *mic); diff --git a/drivers/net/wireless/intersil/orinoco/orinoco.h b/drivers/net/wireless/intersil/orinoco/orinoco.h index 2f0c84b1c440..5fa1c3e3713f 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco.h +++ b/drivers/net/wireless/intersil/orinoco/orinoco.h @@ -152,8 +152,8 @@ struct orinoco_private { u8 *wpa_ie; int wpa_ie_len; - struct crypto_ahash *rx_tfm_mic; - struct crypto_ahash *tx_tfm_mic; + struct crypto_shash *rx_tfm_mic; + struct crypto_shash *tx_tfm_mic; unsigned int wpa_enabled:1; unsigned int tkip_cm_active:1; From 60f59ce0278557f7896d5158ae6d12a4855a72cc Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 21 Dec 2016 11:18:55 -0600 Subject: [PATCH 034/297] rtlwifi: rtl_usb: Fix missing entry in USB driver's private data These drivers need to be able to reference "struct ieee80211_hw" from the driver's private data, and vice versa. The USB driver failed to store the address of ieee80211_hw in the private data. Although this bug has been present for a long time, it was not exposed until commit ba9f93f82aba ("rtlwifi: Fix enter/exit power_save"). Fixes: ba9f93f82aba ("rtlwifi: Fix enter/exit power_save") Signed-off-by: Larry Finger Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtlwifi/usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/realtek/rtlwifi/usb.c b/drivers/net/wireless/realtek/rtlwifi/usb.c index 0a508649903d..49015b05f3d1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/usb.c +++ b/drivers/net/wireless/realtek/rtlwifi/usb.c @@ -1063,6 +1063,7 @@ int rtl_usb_probe(struct usb_interface *intf, return -ENOMEM; } rtlpriv = hw->priv; + rtlpriv->hw = hw; rtlpriv->usb_data = kzalloc(RTL_USB_MAX_RX_COUNT * sizeof(u32), GFP_KERNEL); if (!rtlpriv->usb_data) From 14221cc45caad2fcab3a8543234bb7eda9b540d5 Mon Sep 17 00:00:00 2001 From: Artur Molchanov Date: Fri, 30 Dec 2016 19:46:36 +0300 Subject: [PATCH 035/297] bridge: netfilter: Fix dropping packets that moving through bridge interface Problem: br_nf_pre_routing_finish() calls itself instead of br_nf_pre_routing_finish_bridge(). Due to this bug reverse path filter drops packets that go through bridge interface. User impact: Local docker containers with bridge network can not communicate with each other. Fixes: c5136b15ea36 ("netfilter: bridge: add and use br_nf_hook_thresh") Signed-off-by: Artur Molchanov Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index b12501a77f18..135cc8ab813c 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -399,7 +399,7 @@ bridged_dnat: br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish); + br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); From d2e3a1358c37cd82eef92b5e908b4f0472194481 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Thu, 29 Dec 2016 14:11:21 +0100 Subject: [PATCH 036/297] ASoC: Fix binding and probing of auxiliary components Currently binding of auxiliary devices doesn't work as in soc_bind_aux_dev() function a bound component is not being added to any list and in soc_probe_aux_devices() we are trying to walk the component_dev_list list to probe auxiliary components but at that time this list doesn't contain any auxiliary components since they are being added to the card only in soc_probe_component(). This patch adds a list to the card where are stored bound but not probed auxiliary devices, so that all aux devices can be probed. Fixes: 1a653aa44725 "ASoC: core: replace aux_comp_list to component_dev_list" Signed-off-by: Sylwester Nawrocki Signed-off-by: Mark Brown --- include/sound/soc.h | 3 +++ sound/soc/soc-core.c | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/sound/soc.h b/include/sound/soc.h index 2b502f6cc6d0..b86168a21d56 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -813,6 +813,7 @@ struct snd_soc_component { unsigned int suspended:1; /* is in suspend PM state */ struct list_head list; + struct list_head card_aux_list; /* for auxiliary bound components */ struct list_head card_list; struct snd_soc_dai_driver *dai_drv; @@ -1152,6 +1153,7 @@ struct snd_soc_card { */ struct snd_soc_aux_dev *aux_dev; int num_aux_devs; + struct list_head aux_comp_list; const struct snd_kcontrol_new *controls; int num_controls; @@ -1547,6 +1549,7 @@ static inline void snd_soc_initialize_card_lists(struct snd_soc_card *card) INIT_LIST_HEAD(&card->widgets); INIT_LIST_HEAD(&card->paths); INIT_LIST_HEAD(&card->dapm_list); + INIT_LIST_HEAD(&card->aux_comp_list); INIT_LIST_HEAD(&card->component_dev_list); } diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index f1901bb1466e..baa1afa41e3d 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1748,6 +1748,7 @@ static int soc_bind_aux_dev(struct snd_soc_card *card, int num) component->init = aux_dev->init; component->auxiliary = 1; + list_add(&component->card_aux_list, &card->aux_comp_list); return 0; @@ -1758,16 +1759,14 @@ err_defer: static int soc_probe_aux_devices(struct snd_soc_card *card) { - struct snd_soc_component *comp; + struct snd_soc_component *comp, *tmp; int order; int ret; for (order = SND_SOC_COMP_ORDER_FIRST; order <= SND_SOC_COMP_ORDER_LAST; order++) { - list_for_each_entry(comp, &card->component_dev_list, card_list) { - if (!comp->auxiliary) - continue; - + list_for_each_entry_safe(comp, tmp, &card->aux_comp_list, + card_aux_list) { if (comp->driver->probe_order == order) { ret = soc_probe_component(card, comp); if (ret < 0) { @@ -1776,6 +1775,7 @@ static int soc_probe_aux_devices(struct snd_soc_card *card) comp->name, ret); return ret; } + list_del(&comp->card_aux_list); } } } From 63c3194b82530bd71fd49db84eb7ab656b8d404a Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 23 Dec 2016 11:21:10 +0200 Subject: [PATCH 037/297] ASoC: tlv320aic3x: Mark the RESET register as volatile The RESET register only have one self clearing bit and it should not be cached. If it is cached, when we sync the registers back to the chip we will initiate a software reset as well, which is not desirable. Signed-off-by: Peter Ujfalusi Reviewed-by: Jarkko Nikula Signed-off-by: Mark Brown --- sound/soc/codecs/tlv320aic3x.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sound/soc/codecs/tlv320aic3x.c b/sound/soc/codecs/tlv320aic3x.c index 8877b74b0510..bb94d50052d7 100644 --- a/sound/soc/codecs/tlv320aic3x.c +++ b/sound/soc/codecs/tlv320aic3x.c @@ -126,6 +126,16 @@ static const struct reg_default aic3x_reg[] = { { 108, 0x00 }, { 109, 0x00 }, }; +static bool aic3x_volatile_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case AIC3X_RESET: + return true; + default: + return false; + } +} + static const struct regmap_config aic3x_regmap = { .reg_bits = 8, .val_bits = 8, @@ -133,6 +143,9 @@ static const struct regmap_config aic3x_regmap = { .max_register = DAC_ICC_ADJ, .reg_defaults = aic3x_reg, .num_reg_defaults = ARRAY_SIZE(aic3x_reg), + + .volatile_reg = aic3x_volatile_reg, + .cache_type = REGCACHE_RBTREE, }; From 91ce54978ccece323aa6df930249ff84a7d233c7 Mon Sep 17 00:00:00 2001 From: G Kranthi Date: Tue, 20 Dec 2016 12:46:45 +0530 Subject: [PATCH 038/297] ASoC: Intel: Skylake: Fix to fail safely if module not available in path If a module is not available in a pipeline, fail safely rather than causing oops. Signed-off-by: G Kranthi Signed-off-by: Subhransu S. Prusty Signed-off-by: Mark Brown --- sound/soc/intel/skylake/skl-pcm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/intel/skylake/skl-pcm.c b/sound/soc/intel/skylake/skl-pcm.c index 84b5101e6ca6..6c6b63a6b338 100644 --- a/sound/soc/intel/skylake/skl-pcm.c +++ b/sound/soc/intel/skylake/skl-pcm.c @@ -180,6 +180,9 @@ static int skl_pcm_open(struct snd_pcm_substream *substream, snd_pcm_set_sync(substream); mconfig = skl_tplg_fe_get_cpr_module(dai, substream->stream); + if (!mconfig) + return -EINVAL; + skl_tplg_d0i3_get(skl, mconfig->d0i3_caps); return 0; From a33b56a6a824fa5cd89c74f85cbeb9af1dcef87e Mon Sep 17 00:00:00 2001 From: John Hsu Date: Tue, 20 Dec 2016 16:47:06 +0800 Subject: [PATCH 039/297] ASoC: nau8825: correct the function name of register Change to correct name of the register function. Signed-off-by: John Hsu Signed-off-by: Mark Brown --- sound/soc/codecs/nau8825.c | 6 +++--- sound/soc/codecs/nau8825.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/soc/codecs/nau8825.c b/sound/soc/codecs/nau8825.c index efe3a44658d5..abf77dd422f4 100644 --- a/sound/soc/codecs/nau8825.c +++ b/sound/soc/codecs/nau8825.c @@ -561,9 +561,9 @@ static void nau8825_xtalk_prepare(struct nau8825 *nau8825) nau8825_xtalk_backup(nau8825); /* Config IIS as master to output signal by codec */ regmap_update_bits(nau8825->regmap, NAU8825_REG_I2S_PCM_CTRL2, - NAU8825_I2S_MS_MASK | NAU8825_I2S_DRV_MASK | + NAU8825_I2S_MS_MASK | NAU8825_I2S_LRC_DIV_MASK | NAU8825_I2S_BLK_DIV_MASK, NAU8825_I2S_MS_MASTER | - (0x2 << NAU8825_I2S_DRV_SFT) | 0x1); + (0x2 << NAU8825_I2S_LRC_DIV_SFT) | 0x1); /* Ramp up headphone volume to 0dB to get better performance and * avoid pop noise in headphone. */ @@ -657,7 +657,7 @@ static void nau8825_xtalk_clean(struct nau8825 *nau8825) NAU8825_IRQ_RMS_EN, NAU8825_IRQ_RMS_EN); /* Recover default value for IIS */ regmap_update_bits(nau8825->regmap, NAU8825_REG_I2S_PCM_CTRL2, - NAU8825_I2S_MS_MASK | NAU8825_I2S_DRV_MASK | + NAU8825_I2S_MS_MASK | NAU8825_I2S_LRC_DIV_MASK | NAU8825_I2S_BLK_DIV_MASK, NAU8825_I2S_MS_SLAVE); /* Restore value of specific register for cross talk */ nau8825_xtalk_restore(nau8825); diff --git a/sound/soc/codecs/nau8825.h b/sound/soc/codecs/nau8825.h index 5d1704e73241..b6b21b312854 100644 --- a/sound/soc/codecs/nau8825.h +++ b/sound/soc/codecs/nau8825.h @@ -247,8 +247,8 @@ /* I2S_PCM_CTRL2 (0x1d) */ #define NAU8825_I2S_TRISTATE (1 << 15) /* 0 - normal mode, 1 - Hi-Z output */ -#define NAU8825_I2S_DRV_SFT 12 -#define NAU8825_I2S_DRV_MASK (0x3 << NAU8825_I2S_DRV_SFT) +#define NAU8825_I2S_LRC_DIV_SFT 12 +#define NAU8825_I2S_LRC_DIV_MASK (0x3 << NAU8825_I2S_LRC_DIV_SFT) #define NAU8825_I2S_MS_SFT 3 #define NAU8825_I2S_MS_MASK (1 << NAU8825_I2S_MS_SFT) #define NAU8825_I2S_MS_MASTER (1 << NAU8825_I2S_MS_SFT) From a1792cda51300e15b03549cccf0b09f3be82e697 Mon Sep 17 00:00:00 2001 From: John Hsu Date: Tue, 20 Dec 2016 12:03:09 +0800 Subject: [PATCH 040/297] ASoC: nau8825: fix invalid configuration in Pre-Scalar of FLL The clk_ref_div is not configured in the correct position of the register. The patch fixes that clk_ref_div, Pre-Scalar, is assigned the wrong value. Signed-off-by: John Hsu Signed-off-by: Mark Brown --- sound/soc/codecs/nau8825.c | 3 ++- sound/soc/codecs/nau8825.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/nau8825.c b/sound/soc/codecs/nau8825.c index abf77dd422f4..4576f987a4a5 100644 --- a/sound/soc/codecs/nau8825.c +++ b/sound/soc/codecs/nau8825.c @@ -2006,7 +2006,8 @@ static void nau8825_fll_apply(struct nau8825 *nau8825, NAU8825_FLL_INTEGER_MASK, fll_param->fll_int); /* FLL pre-scaler */ regmap_update_bits(nau8825->regmap, NAU8825_REG_FLL4, - NAU8825_FLL_REF_DIV_MASK, fll_param->clk_ref_div); + NAU8825_FLL_REF_DIV_MASK, + fll_param->clk_ref_div << NAU8825_FLL_REF_DIV_SFT); /* select divided VCO input */ regmap_update_bits(nau8825->regmap, NAU8825_REG_FLL5, NAU8825_FLL_CLK_SW_MASK, NAU8825_FLL_CLK_SW_REF); diff --git a/sound/soc/codecs/nau8825.h b/sound/soc/codecs/nau8825.h index b6b21b312854..514fd13c2f46 100644 --- a/sound/soc/codecs/nau8825.h +++ b/sound/soc/codecs/nau8825.h @@ -137,7 +137,8 @@ #define NAU8825_FLL_CLK_SRC_FS (0x3 << NAU8825_FLL_CLK_SRC_SFT) /* FLL4 (0x07) */ -#define NAU8825_FLL_REF_DIV_MASK (0x3 << 10) +#define NAU8825_FLL_REF_DIV_SFT 10 +#define NAU8825_FLL_REF_DIV_MASK (0x3 << NAU8825_FLL_REF_DIV_SFT) /* FLL5 (0x08) */ #define NAU8825_FLL_PDB_DAC_EN (0x1 << 15) From 1594c18fd297a8edcc72bc4b161f3f52603ebb92 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 13 Dec 2016 11:15:21 -0700 Subject: [PATCH 041/297] dmaengine: ioatdma: Add Skylake PCI Dev ID Adding Skylake Xeon PCI device ids for ioatdma and related bits. Signed-off-by: Dave Jiang Signed-off-by: Vinod Koul --- drivers/dma/ioat/hw.h | 2 ++ drivers/dma/ioat/init.c | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h index 8e67895bcca3..abcc51b343ce 100644 --- a/drivers/dma/ioat/hw.h +++ b/drivers/dma/ioat/hw.h @@ -64,6 +64,8 @@ #define PCI_DEVICE_ID_INTEL_IOAT_BDX8 0x6f2e #define PCI_DEVICE_ID_INTEL_IOAT_BDX9 0x6f2f +#define PCI_DEVICE_ID_INTEL_IOAT_SKX 0x2021 + #define IOAT_VER_1_2 0x12 /* Version 1.2 */ #define IOAT_VER_2_0 0x20 /* Version 2.0 */ #define IOAT_VER_3_0 0x30 /* Version 3.0 */ diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 90eddd9f07e4..51b2b643ba71 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -106,6 +106,8 @@ static struct pci_device_id ioat_pci_tbl[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_BDX8) }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_BDX9) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SKX) }, + /* I/OAT v3.3 platforms */ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_BWD0) }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_BWD1) }, @@ -243,10 +245,15 @@ static bool is_bdx_ioat(struct pci_dev *pdev) } } +static inline bool is_skx_ioat(struct pci_dev *pdev) +{ + return (pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SKX) ? true : false; +} + static bool is_xeon_cb32(struct pci_dev *pdev) { return is_jf_ioat(pdev) || is_snb_ioat(pdev) || is_ivb_ioat(pdev) || - is_hsw_ioat(pdev) || is_bdx_ioat(pdev); + is_hsw_ioat(pdev) || is_bdx_ioat(pdev) || is_skx_ioat(pdev); } bool is_bwd_ioat(struct pci_dev *pdev) From 34a31f0af84158955a9747fb5c6712da5bbb5331 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 13 Dec 2016 11:15:27 -0700 Subject: [PATCH 042/297] dmaengine: ioatdma: workaround SKX ioatdma version The Skylake ioatdma is technically CBDMA 3.2+ and contains the same hardware bits with some additional 3.3 features, but it's not really 3.3 where the driver is concerned. Signed-off-by: Dave Jiang Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 51b2b643ba71..ace5cb2cb12f 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -1364,6 +1364,8 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) device->version = readb(device->reg_base + IOAT_VER_OFFSET); if (device->version >= IOAT_VER_3_0) { + if (is_skx_ioat(pdev)) + device->version = IOAT_VER_3_2; err = ioat3_dma_probe(device, ioat_dca_enabled); if (device->version >= IOAT_VER_3_3) From 1032471b3ec823bce7687034ac5af78a8ac99a9c Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 15 Dec 2016 11:43:30 +0100 Subject: [PATCH 043/297] dmaengine: dw: fix typo in Kconfig platfroms -> platforms Signed-off-by: Jean Delvare Fixes: fed42c198b45 ("dma: dw: add PCI part of the driver") Cc: Viresh Kumar Acked-by: Andy Shevchenko Signed-off-by: Vinod Koul --- drivers/dma/dw/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/dw/Kconfig b/drivers/dma/dw/Kconfig index e00c9b022964..5a37b9fcf40d 100644 --- a/drivers/dma/dw/Kconfig +++ b/drivers/dma/dw/Kconfig @@ -24,5 +24,5 @@ config DW_DMAC_PCI select DW_DMAC_CORE help Support the Synopsys DesignWare AHB DMA controller on the - platfroms that enumerate it as a PCI device. For example, + platforms that enumerate it as a PCI device. For example, Intel Medfield has integrated this GPDMA controller. From 7e96304d99477de1f70db42035071e56439da817 Mon Sep 17 00:00:00 2001 From: M'boumba Cedric Madianga Date: Tue, 13 Dec 2016 14:40:43 +0100 Subject: [PATCH 044/297] dmaengine: stm32-dma: Set correct args number for DMA request from DT This patch sets the right number of arguments to be used for DMA clients which request channels from DT. Signed-off-by: M'boumba Cedric Madianga Reviewed-by: Ludovic BARRE Signed-off-by: Vinod Koul --- drivers/dma/stm32-dma.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c index 3688d0873a3e..a884b857cba4 100644 --- a/drivers/dma/stm32-dma.c +++ b/drivers/dma/stm32-dma.c @@ -972,21 +972,18 @@ static struct dma_chan *stm32_dma_of_xlate(struct of_phandle_args *dma_spec, struct stm32_dma_chan *chan; struct dma_chan *c; - if (dma_spec->args_count < 3) + if (dma_spec->args_count < 4) return NULL; cfg.channel_id = dma_spec->args[0]; cfg.request_line = dma_spec->args[1]; cfg.stream_config = dma_spec->args[2]; - cfg.threshold = 0; + cfg.threshold = dma_spec->args[3]; if ((cfg.channel_id >= STM32_DMA_MAX_CHANNELS) || (cfg.request_line >= STM32_DMA_MAX_REQUEST_ID)) return NULL; - if (dma_spec->args_count > 3) - cfg.threshold = dma_spec->args[3]; - chan = &dmadev->chan[cfg.channel_id]; c = dma_get_slave_channel(&chan->vchan.chan); From 57b5a32135c813f2ab669039fb4ec16b30cb3305 Mon Sep 17 00:00:00 2001 From: M'boumba Cedric Madianga Date: Tue, 13 Dec 2016 14:40:46 +0100 Subject: [PATCH 045/297] dmaengine: stm32-dma: Fix null pointer dereference in stm32_dma_tx_status chan->desc is always set to NULL when a DMA transfer is complete. As a DMA transfer could be complete during the call of stm32_dma_tx_status, we need to be sure that chan->desc is not NULL before using this variable to avoid a null pointer deference issue. Signed-off-by: M'boumba Cedric Madianga Reviewed-by: Ludovic BARRE Signed-off-by: Vinod Koul --- drivers/dma/stm32-dma.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c index a884b857cba4..3056ce7f8c69 100644 --- a/drivers/dma/stm32-dma.c +++ b/drivers/dma/stm32-dma.c @@ -880,7 +880,7 @@ static enum dma_status stm32_dma_tx_status(struct dma_chan *c, struct virt_dma_desc *vdesc; enum dma_status status; unsigned long flags; - u32 residue; + u32 residue = 0; status = dma_cookie_status(c, cookie, state); if ((status == DMA_COMPLETE) || (!state)) @@ -888,16 +888,12 @@ static enum dma_status stm32_dma_tx_status(struct dma_chan *c, spin_lock_irqsave(&chan->vchan.lock, flags); vdesc = vchan_find_desc(&chan->vchan, cookie); - if (cookie == chan->desc->vdesc.tx.cookie) { + if (chan->desc && cookie == chan->desc->vdesc.tx.cookie) residue = stm32_dma_desc_residue(chan, chan->desc, chan->next_sg); - } else if (vdesc) { + else if (vdesc) residue = stm32_dma_desc_residue(chan, to_stm32_dma_desc(vdesc), 0); - } else { - residue = 0; - } - dma_set_residue(state, residue); spin_unlock_irqrestore(&chan->vchan.lock, flags); From 75bdc7f31a3a6e9a12e218b31a44a1f54a91554c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 19 Dec 2016 06:33:51 +0100 Subject: [PATCH 046/297] dmaengine: ti-dma-crossbar: Add some 'of_node_put()' in error path. Add some missing 'of_node_put()' in early exit error path. Signed-off-by: Christophe JAILLET Signed-off-by: Vinod Koul --- drivers/dma/ti-dma-crossbar.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/ti-dma-crossbar.c b/drivers/dma/ti-dma-crossbar.c index 3f24aeb48c0e..2403475a37cf 100644 --- a/drivers/dma/ti-dma-crossbar.c +++ b/drivers/dma/ti-dma-crossbar.c @@ -149,6 +149,7 @@ static int ti_am335x_xbar_probe(struct platform_device *pdev) match = of_match_node(ti_am335x_master_match, dma_node); if (!match) { dev_err(&pdev->dev, "DMA master is not supported\n"); + of_node_put(dma_node); return -EINVAL; } @@ -339,6 +340,7 @@ static int ti_dra7_xbar_probe(struct platform_device *pdev) match = of_match_node(ti_dra7_master_match, dma_node); if (!match) { dev_err(&pdev->dev, "DMA master is not supported\n"); + of_node_put(dma_node); return -EINVAL; } From 1f2ed153b916c95a49a1ca9d7107738664224b7f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 3 Jan 2017 00:20:49 +0900 Subject: [PATCH 047/297] perf probe: Fix to get correct modname from elf header Since 'perf probe' supports cross-arch probes, it is possible to analyze different arch kernel image which has different bits-per-long. In that case, it fails to get the module name because it uses the MOD_NAME_OFFSET macro based on the host machine bits-per-long, instead of the target arch bits-per-long. This fixes above issue by changing modname-offset based on the target archs bit width. This is ok because linux kernel uses LP64 model on 64bit arch. E.g. without this (on x86_64, and target module is arm32): $ perf probe -m build-arm/fs/configfs/configfs.ko -D configfs_lookup p:probe/configfs_lookup :configfs_lookup+0 ^-Here is an empty module name. With this fix, you can see correct module name: $ perf probe -m build-arm/fs/configfs/configfs.ko -D configfs_lookup p:probe/configfs_lookup configfs:configfs_lookup+0 Signed-off-by: Masami Hiramatsu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/148337043836.6752.383495516397005695.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index d281ae2b54e8..8f810961ec78 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -267,21 +267,6 @@ static bool kprobe_warn_out_range(const char *symbol, unsigned long address) return true; } -/* - * NOTE: - * '.gnu.linkonce.this_module' section of kernel module elf directly - * maps to 'struct module' from linux/module.h. This section contains - * actual module name which will be used by kernel after loading it. - * But, we cannot use 'struct module' here since linux/module.h is not - * exposed to user-space. Offset of 'name' has remained same from long - * time, so hardcoding it here. - */ -#ifdef __LP64__ -#define MOD_NAME_OFFSET 24 -#else -#define MOD_NAME_OFFSET 12 -#endif - /* * @module can be module name of module file path. In case of path, * inspect elf and find out what is actual module name. @@ -296,6 +281,7 @@ static char *find_module_name(const char *module) Elf_Data *data; Elf_Scn *sec; char *mod_name = NULL; + int name_offset; fd = open(module, O_RDONLY); if (fd < 0) @@ -317,7 +303,21 @@ static char *find_module_name(const char *module) if (!data || !data->d_buf) goto ret_err; - mod_name = strdup((char *)data->d_buf + MOD_NAME_OFFSET); + /* + * NOTE: + * '.gnu.linkonce.this_module' section of kernel module elf directly + * maps to 'struct module' from linux/module.h. This section contains + * actual module name which will be used by kernel after loading it. + * But, we cannot use 'struct module' here since linux/module.h is not + * exposed to user-space. Offset of 'name' has remained same from long + * time, so hardcoding it here. + */ + if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) + name_offset = 12; + else /* expect ELFCLASS64 by default */ + name_offset = 24; + + mod_name = strdup((char *)data->d_buf + name_offset); ret_err: elf_end(elf); From 836c3ce2566fb8c1754f8d7c9534cad9bc8a6879 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 2 Jan 2017 12:07:37 +0200 Subject: [PATCH 048/297] dmaengine: omap-dma: Fix dynamic lch_map allocation The original patch did not done what it was supposed to be doing and even worst it broke legacy boot (OMAP1). The lch_map size should be the number of available logical channels in sDMA and the od->dma_requests should store the number of available DMA request lines usable in sDMA. In legacy mode we do not have a way to get the DMA request count, in that case we use OMAP_SDMA_REQUESTS (127), despite the fact that OMAP1510 have only 31 DMA request line. Fixes: 2d1a9a946fae ("dmaengine: omap-dma: Dynamically allocate memory for lch_map") Reported-by: Aaro Koskinen Cc: stable@vger.kernel.org # v4.9 Signed-off-by: Peter Ujfalusi Tested-by: Aaro Koskinen Signed-off-by: Vinod Koul --- drivers/dma/omap-dma.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/dma/omap-dma.c b/drivers/dma/omap-dma.c index ac68666cd3f4..4ad101a47e0a 100644 --- a/drivers/dma/omap-dma.c +++ b/drivers/dma/omap-dma.c @@ -1452,6 +1452,7 @@ static int omap_dma_probe(struct platform_device *pdev) struct omap_dmadev *od; struct resource *res; int rc, i, irq; + u32 lch_count; od = devm_kzalloc(&pdev->dev, sizeof(*od), GFP_KERNEL); if (!od) @@ -1494,20 +1495,31 @@ static int omap_dma_probe(struct platform_device *pdev) spin_lock_init(&od->lock); spin_lock_init(&od->irq_lock); - if (!pdev->dev.of_node) { - od->dma_requests = od->plat->dma_attr->lch_count; - if (unlikely(!od->dma_requests)) - od->dma_requests = OMAP_SDMA_REQUESTS; - } else if (of_property_read_u32(pdev->dev.of_node, "dma-requests", - &od->dma_requests)) { + /* Number of DMA requests */ + od->dma_requests = OMAP_SDMA_REQUESTS; + if (pdev->dev.of_node && of_property_read_u32(pdev->dev.of_node, + "dma-requests", + &od->dma_requests)) { dev_info(&pdev->dev, "Missing dma-requests property, using %u.\n", OMAP_SDMA_REQUESTS); - od->dma_requests = OMAP_SDMA_REQUESTS; } - od->lch_map = devm_kcalloc(&pdev->dev, od->dma_requests, - sizeof(*od->lch_map), GFP_KERNEL); + /* Number of available logical channels */ + if (!pdev->dev.of_node) { + lch_count = od->plat->dma_attr->lch_count; + if (unlikely(!lch_count)) + lch_count = OMAP_SDMA_CHANNELS; + } else if (of_property_read_u32(pdev->dev.of_node, "dma-channels", + &lch_count)) { + dev_info(&pdev->dev, + "Missing dma-channels property, using %u.\n", + OMAP_SDMA_CHANNELS); + lch_count = OMAP_SDMA_CHANNELS; + } + + od->lch_map = devm_kcalloc(&pdev->dev, lch_count, sizeof(*od->lch_map), + GFP_KERNEL); if (!od->lch_map) return -ENOMEM; From f53243b563e8966fb5a5cd8f27d48b832d3b1c43 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Mon, 2 Jan 2017 17:42:08 +0100 Subject: [PATCH 049/297] MAINTAINERS: dmaengine: Update + Hand over the at_hdmac driver to Ludovic Hand over the Microchip / Atmel DMA driver handled by at_hdmac driver to Ludovic who is responsible for the newer at_xdmac driver as well. Also update the entry name and position to follow company changes. Signed-off-by: Nicolas Ferre Signed-off-by: Vinod Koul --- MAINTAINERS | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cfff2c9e3d94..c10150853273 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2194,14 +2194,6 @@ L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported F: sound/soc/atmel -ATMEL DMA DRIVER -M: Nicolas Ferre -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Supported -F: drivers/dma/at_hdmac.c -F: drivers/dma/at_hdmac_regs.h -F: include/linux/platform_data/dma-atmel.h - ATMEL XDMA DRIVER M: Ludovic Desroches L: linux-arm-kernel@lists.infradead.org @@ -8174,6 +8166,15 @@ S: Maintained F: drivers/tty/serial/atmel_serial.c F: include/linux/atmel_serial.h +MICROCHIP / ATMEL DMA DRIVER +M: Ludovic Desroches +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: dmaengine@vger.kernel.org +S: Supported +F: drivers/dma/at_hdmac.c +F: drivers/dma/at_hdmac_regs.h +F: include/linux/platform_data/dma-atmel.h + MICROCHIP / ATMEL ISC DRIVER M: Songjun Wu L: linux-media@vger.kernel.org From 5c9e6c2b2ba3ec3a442e2fb5b4286498f8b4dcb7 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 16 Dec 2016 11:39:11 +0100 Subject: [PATCH 050/297] dmaengine: pl330: Fix runtime PM support for terminated transfers PL330 DMA engine driver is leaking a runtime reference after any terminated DMA transactions. This patch fixes this issue by tracking runtime PM state of the device and making additional call to pm_runtime_put() in terminate_all callback if needed. Fixes: ae43b3289186 ("ARM: 8202/1: dmaengine: pl330: Add runtime Power Management support v12") Signed-off-by: Marek Szyprowski Reviewed-by: Krzysztof Kozlowski Signed-off-by: Vinod Koul --- drivers/dma/pl330.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 87fd01539fcb..740bbb942594 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -448,6 +448,9 @@ struct dma_pl330_chan { /* for cyclic capability */ bool cyclic; + + /* for runtime pm tracking */ + bool active; }; struct pl330_dmac { @@ -2033,6 +2036,7 @@ static void pl330_tasklet(unsigned long data) _stop(pch->thread); spin_unlock(&pch->thread->dmac->lock); power_down = true; + pch->active = false; } else { /* Make sure the PL330 Channel thread is active */ spin_lock(&pch->thread->dmac->lock); @@ -2052,6 +2056,7 @@ static void pl330_tasklet(unsigned long data) desc->status = PREP; list_move_tail(&desc->node, &pch->work_list); if (power_down) { + pch->active = true; spin_lock(&pch->thread->dmac->lock); _start(pch->thread); spin_unlock(&pch->thread->dmac->lock); @@ -2166,6 +2171,7 @@ static int pl330_terminate_all(struct dma_chan *chan) unsigned long flags; struct pl330_dmac *pl330 = pch->dmac; LIST_HEAD(list); + bool power_down = false; pm_runtime_get_sync(pl330->ddma.dev); spin_lock_irqsave(&pch->lock, flags); @@ -2176,6 +2182,8 @@ static int pl330_terminate_all(struct dma_chan *chan) pch->thread->req[0].desc = NULL; pch->thread->req[1].desc = NULL; pch->thread->req_running = -1; + power_down = pch->active; + pch->active = false; /* Mark all desc done */ list_for_each_entry(desc, &pch->submitted_list, node) { @@ -2193,6 +2201,8 @@ static int pl330_terminate_all(struct dma_chan *chan) list_splice_tail_init(&pch->completed_list, &pl330->desc_pool); spin_unlock_irqrestore(&pch->lock, flags); pm_runtime_mark_last_busy(pl330->ddma.dev); + if (power_down) + pm_runtime_put_autosuspend(pl330->ddma.dev); pm_runtime_put_autosuspend(pl330->ddma.dev); return 0; @@ -2357,6 +2367,7 @@ static void pl330_issue_pending(struct dma_chan *chan) * updated on work_list emptiness status. */ WARN_ON(list_empty(&pch->submitted_list)); + pch->active = true; pm_runtime_get_sync(pch->dmac->ddma.dev); } list_splice_tail_init(&pch->submitted_list, &pch->work_list); From dcafc45dcb6d8bb6d159ed0a903bd0f3de597fac Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 2 Jan 2017 16:09:59 +0100 Subject: [PATCH 051/297] drm/meson: Fix plane atomic check when no crtc for the plane When no CRTC is associated with the plane, the meson_plane_atomic_check() call breaks the kernel with an Oops. Fixes: bbbe775ec5b5 ("drm: Add support for Amlogic Meson Graphic Controller") Signed-off-by: Neil Armstrong --- drivers/gpu/drm/meson/meson_plane.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/meson/meson_plane.c b/drivers/gpu/drm/meson/meson_plane.c index 4942ca090b46..7890e30eb584 100644 --- a/drivers/gpu/drm/meson/meson_plane.c +++ b/drivers/gpu/drm/meson/meson_plane.c @@ -51,6 +51,9 @@ static int meson_plane_atomic_check(struct drm_plane *plane, struct drm_crtc_state *crtc_state; struct drm_rect clip = { 0, }; + if (!state->crtc) + return 0; + crtc_state = drm_atomic_get_crtc_state(state->state, state->crtc); if (IS_ERR(crtc_state)) return PTR_ERR(crtc_state); From b66fb1da5a8cac3f5c3cdbe41937c91efc4e76a4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Jan 2017 09:19:54 +0100 Subject: [PATCH 052/297] tools lib subcmd: Add OPT_STRING_OPTARG_SET option To allow string options with a default argument and variable set when the option is used. Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Josh Poimboeuf Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1483431600-19887-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/parse-options.c | 3 +++ tools/lib/subcmd/parse-options.h | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index 3284bb14ae78..8aad81151d50 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -213,6 +213,9 @@ static int get_value(struct parse_opt_ctx_t *p, else err = get_arg(p, opt, flags, (const char **)opt->value); + if (opt->set) + *(bool *)opt->set = true; + /* PARSE_OPT_NOEMPTY: Allow NULL but disallow empty string. */ if (opt->flags & PARSE_OPT_NOEMPTY) { const char *val = *(const char **)opt->value; diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h index 8866ac438b34..11c3be3bcce7 100644 --- a/tools/lib/subcmd/parse-options.h +++ b/tools/lib/subcmd/parse-options.h @@ -137,6 +137,11 @@ struct option { { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ .value = check_vtype(v, const char **), (a), .help = (h), \ .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) } +#define OPT_STRING_OPTARG_SET(s, l, v, os, a, h, d) \ + { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ + .value = check_vtype(v, const char **), (a), .help = (h), \ + .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d), \ + .set = check_vtype(os, bool *)} #define OPT_STRING_NOEMPTY(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY} #define OPT_DATE(s, l, v, h) \ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb } From efd21307119d5a23ac83ae8fd5a39a5c7aeb8493 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Jan 2017 09:19:55 +0100 Subject: [PATCH 053/297] perf record: Make __record_options static There's no need for this one to be global. Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1483431600-19887-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 74d6a035133a..31cf0ce12a65 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1405,7 +1405,7 @@ static bool dry_run; * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record', * using pipes, etc. */ -struct option __record_options[] = { +static struct option __record_options[] = { OPT_CALLBACK('e', "event", &record.evlist, "event", "event selector. use 'perf list' to list available events", parse_events_option), From 60437ac02f398e0ee0927748d4798dd5534ac90d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Jan 2017 09:19:56 +0100 Subject: [PATCH 054/297] perf record: Fix --switch-output documentation and comment There's no --signal-trigger option, also adding the code comment into record man page. Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1483431600-19887-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 4 ++++ tools/perf/builtin-record.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 27fc3617c6a4..5054d9147f0f 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -430,6 +430,10 @@ that gets then processed, possibly via a perf script, to decide if that particular perf.data snapshot should be kept or not. Implies --timestamp-filename, --no-buildid and --no-buildid-cache. +The reason for the latter two is to reduce the data file switching +overhead. You can still switch them on with: + + --switch-output --no-no-buildid --no-no-buildid-cache --dry-run:: Parse options then exit. --dry-run can be used to detect errors in cmdline diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 31cf0ce12a65..4ec10e9427d9 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1636,7 +1636,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) * overhead. Still generate buildid if they are required * explicitly using * - * perf record --signal-trigger --no-no-buildid \ + * perf record --switch-output --no-no-buildid \ * --no-no-buildid-cache * * Following code equals to: From aa7c8da35d1905d80e840d075f07d26ec90144b5 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 20 Dec 2016 13:28:27 -0500 Subject: [PATCH 055/297] btrfs: fix error handling when run_delayed_extent_op fails In __btrfs_run_delayed_refs, the error path when run_delayed_extent_op fails sets locked_ref->processing = 0 but doesn't re-increment delayed_refs->num_heads_ready. As a result, we end up triggering the WARN_ON in btrfs_select_ref_head. Fixes: d7df2c796d7 (Btrfs: attach delayed ref updates to delayed ref heads) Reported-by: Jon Nelson Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e97302f437a1..5366e50c84c6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2572,7 +2572,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ if (must_insert_reserved) locked_ref->must_insert_reserved = 1; + spin_lock(&delayed_refs->lock); locked_ref->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); From d0280996437081dd12ed1e982ac8aeaa62835ec4 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 20 Dec 2016 13:28:28 -0500 Subject: [PATCH 056/297] btrfs: fix locking when we put back a delayed ref that's too new In __btrfs_run_delayed_refs, when we put back a delayed ref that's too new, we have already dropped the lock on locked_ref when we set ->processing = 0. This patch keeps the lock to cover that assignment. Fixes: d7df2c796d7 (Btrfs: attach delayed ref updates to delayed ref heads) Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5366e50c84c6..ac7e6713033c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2522,11 +2522,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { spin_unlock(&locked_ref->lock); - btrfs_delayed_ref_unlock(locked_ref); spin_lock(&delayed_refs->lock); locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; cond_resched(); count++; From e321f8a801d7b4c40da8005257b05b9c2b51b072 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 30 Nov 2016 16:11:04 -0800 Subject: [PATCH 057/297] Btrfs: use down_read_nested to make lockdep silent If @block_group is not @used_bg, it'll try to get @used_bg's lock without droping @block_group 's lock and lockdep has throwed a scary deadlock warning about it. Fix it by using down_read_nested. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ac7e6713033c..dcd2e798767e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7387,7 +7387,8 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); + /* We should only have one-level nested. */ + down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); spin_lock(&cluster->refill_lock); if (used_bg == cluster->block_group) From 781feef7e6befafd4d9787d1f7ada1f9ccd504e4 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 30 Nov 2016 16:20:25 -0800 Subject: [PATCH 058/297] Btrfs: fix lockdep warning about log_mutex While checking INODE_REF/INODE_EXTREF for a corner case, we may acquire a different inode's log_mutex with holding the current inode's log_mutex, and lockdep has complained this with a possilble deadlock warning. Fix this by using mutex_lock_nested() when processing the other inode's log_mutex. Reviewed-by: Filipe Manana Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f10bf5213ed8..eeffff84f280 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -37,6 +37,7 @@ */ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 +#define LOG_OTHER_INODE 2 /* * directory trouble cases @@ -4641,7 +4642,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode) || (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags) && - inode_only == LOG_INODE_EXISTS)) + inode_only >= LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; @@ -4665,7 +4666,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return ret; } - mutex_lock(&BTRFS_I(inode)->log_mutex); + if (inode_only == LOG_OTHER_INODE) { + inode_only = LOG_INODE_EXISTS; + mutex_lock_nested(&BTRFS_I(inode)->log_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&BTRFS_I(inode)->log_mutex); + } /* * a brute force approach to making sure we get the most uptodate @@ -4817,7 +4824,7 @@ again: * unpin it. */ err = btrfs_log_inode(trans, root, other_inode, - LOG_INODE_EXISTS, + LOG_OTHER_INODE, 0, LLONG_MAX, ctx); iput(other_inode); if (err) From c2931667c83ded6504b3857e99cc45b21fa496fb Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 22 Dec 2016 17:13:54 -0800 Subject: [PATCH 059/297] Btrfs: adjust outstanding_extents counter properly when dio write is split Currently how btrfs dio deals with split dio write is not good enough if dio write is split into several segments due to the lack of contiguous space, a large dio write like 'dd bs=1G count=1' can end up with incorrect outstanding_extents counter and endio would complain loudly with an assertion. This fixes the problem by compensating the outstanding_extents counter in inode if a large dio write gets split. Reported-by: Anand Jain Tested-by: Anand Jain Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a713d9d324b0..81b9d9d0450c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7623,11 +7623,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode, * within our reservation, otherwise we need to adjust our inode * counter appropriately. */ - if (dio_data->outstanding_extents) { + if (dio_data->outstanding_extents >= num_extents) { dio_data->outstanding_extents -= num_extents; } else { + /* + * If dio write length has been split due to no large enough + * contiguous space, we need to compensate our inode counter + * appropriately. + */ + u64 num_needed = num_extents - dio_data->outstanding_extents; + spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_extents; + BTRFS_I(inode)->outstanding_extents += num_needed; spin_unlock(&BTRFS_I(inode)->lock); } } From 3b046a97cbd35a73e1eef968dbfb1a0aac745a77 Mon Sep 17 00:00:00 2001 From: Robert LeBlanc Date: Mon, 5 Dec 2016 13:02:57 -0700 Subject: [PATCH 060/297] md/raid1: Refactor raid1_make_request Refactor raid1_make_request to make read and write code in their own functions to clean up the code. Signed-off-by: Robert LeBlanc Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 267 +++++++++++++++++++++++---------------------- 1 file changed, 139 insertions(+), 128 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a1f3fbed9100..14422407e520 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1066,17 +1066,107 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void raid1_make_request(struct mddev *mddev, struct bio * bio) +static void raid1_read_request(struct mddev *mddev, struct bio *bio, + struct r1bio *r1_bio) { struct r1conf *conf = mddev->private; struct raid1_info *mirror; - struct r1bio *r1_bio; struct bio *read_bio; + struct bitmap *bitmap = mddev->bitmap; + const int op = bio_op(bio); + const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + int sectors_handled; + int max_sectors; + int rdisk; + + wait_barrier(conf, bio); + +read_again: + rdisk = read_balance(conf, r1_bio, &max_sectors); + + if (rdisk < 0) { + /* couldn't find anywhere to read from */ + raid_end_bio_io(r1_bio); + return; + } + mirror = conf->mirrors + rdisk; + + if (test_bit(WriteMostly, &mirror->rdev->flags) && + bitmap) { + /* + * Reading from a write-mostly device must take care not to + * over-take any writes that are 'behind' + */ + raid1_log(mddev, "wait behind writes"); + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + r1_bio->read_disk = rdisk; + r1_bio->start_next_window = 0; + + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); + + r1_bio->bios[rdisk] = read_bio; + + read_bio->bi_iter.bi_sector = r1_bio->sector + + mirror->rdev->data_offset; + read_bio->bi_bdev = mirror->rdev->bdev; + read_bio->bi_end_io = raid1_end_read_request; + bio_set_op_attrs(read_bio, op, do_sync); + if (test_bit(FailFast, &mirror->rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) + read_bio->bi_opf |= MD_FAILFAST; + read_bio->bi_private = r1_bio; + + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), + read_bio, disk_devt(mddev->gendisk), + r1_bio->sector); + + if (max_sectors < r1_bio->sectors) { + /* + * could not read all from this device, so we will need another + * r1_bio. + */ + sectors_handled = (r1_bio->sector + max_sectors + - bio->bi_iter.bi_sector); + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + + /* + * Cannot call generic_make_request directly as that will be + * queued in __make_request and subsequent mempool_alloc might + * block waiting for it. So hand bio over to raid1d. + */ + reschedule_retry(r1_bio); + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); +} + +static void raid1_write_request(struct mddev *mddev, struct bio *bio, + struct r1bio *r1_bio) +{ + struct r1conf *conf = mddev->private; int i, disks; - struct bitmap *bitmap; + struct bitmap *bitmap = mddev->bitmap; unsigned long flags; const int op = bio_op(bio); - const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)); @@ -1096,15 +1186,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) md_write_start(mddev, bio); /* wait on superblock update early */ - if (bio_data_dir(bio) == WRITE && - ((bio_end_sector(bio) > mddev->suspend_lo && + if ((bio_end_sector(bio) > mddev->suspend_lo && bio->bi_iter.bi_sector < mddev->suspend_hi) || (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio))))) { - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. + bio->bi_iter.bi_sector, bio_end_sector(bio)))) { + + /* + * As the suspend_* range is controlled by userspace, we want + * an interruptible wait. */ DEFINE_WAIT(w); for (;;) { @@ -1115,128 +1205,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) bio->bi_iter.bi_sector >= mddev->suspend_hi || (mddev_is_clustered(mddev) && !md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio)))) + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) break; schedule(); } finish_wait(&conf->wait_barrier, &w); } - start_next_window = wait_barrier(conf, bio); - bitmap = mddev->bitmap; - - /* - * make_request() can abort the operation when read-ahead is being - * used and no empty request is available. - * - */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio); - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector; - - /* We might need to issue multiple reads to different - * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. - * If this is 0, there is only one r1_bio and no locking - * will be needed when requests complete. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); - - if (rw == READ) { - /* - * read balancing logic: - */ - int rdisk; - -read_again: - rdisk = read_balance(conf, r1_bio, &max_sectors); - - if (rdisk < 0) { - /* couldn't find anywhere to read from */ - raid_end_bio_io(r1_bio); - return; - } - mirror = conf->mirrors + rdisk; - - if (test_bit(WriteMostly, &mirror->rdev->flags) && - bitmap) { - /* Reading from a write-mostly device must - * take care not to over-take any writes - * that are 'behind' - */ - raid1_log(mddev, "wait behind writes"); - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - r1_bio->read_disk = rdisk; - r1_bio->start_next_window = 0; - - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - - r1_bio->bios[rdisk] = read_bio; - - read_bio->bi_iter.bi_sector = r1_bio->sector + - mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; - read_bio->bi_end_io = raid1_end_read_request; - bio_set_op_attrs(read_bio, op, do_sync); - if (test_bit(FailFast, &mirror->rdev->flags) && - test_bit(R1BIO_FailFast, &r1_bio->state)) - read_bio->bi_opf |= MD_FAILFAST; - read_bio->bi_private = r1_bio; - - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), - read_bio, disk_devt(mddev->gendisk), - r1_bio->sector); - - if (max_sectors < r1_bio->sectors) { - /* could not read all from this device, so we will - * need another r1_bio. - */ - - sectors_handled = (r1_bio->sector + max_sectors - - bio->bi_iter.bi_sector); - r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - /* Cannot call generic_make_request directly - * as that will be queued in __make_request - * and subsequent mempool_alloc might block waiting - * for it. So hand bio over to raid1d. - */ - reschedule_retry(r1_bio); - - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + - sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); - return; - } - - /* - * WRITE: - */ if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); raid1_log(mddev, "wait queued"); @@ -1280,8 +1257,7 @@ read_again: int bad_sectors; int is_bad; - is_bad = is_badblock(rdev, r1_bio->sector, - max_sectors, + is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, &first_bad, &bad_sectors); if (is_bad < 0) { /* mustn't write here until the bad block is @@ -1370,7 +1346,8 @@ read_again: continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); + bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); if (first_clone) { /* do behind I/O ? @@ -1464,6 +1441,40 @@ read_again: wake_up(&conf->wait_barrier); } +static void raid1_make_request(struct mddev *mddev, struct bio *bio) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + + /* + * make_request() can abort the operation when read-ahead is being + * used and no empty request is available. + * + */ + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio); + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector; + + /* + * We might need to issue multiple reads to different devices if there + * are bad blocks around, so we keep track of the number of reads in + * bio->bi_phys_segments. If this is 0, there is only one r1_bio and + * no locking will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); + + if (bio_data_dir(bio) == READ) + raid1_read_request(mddev, bio, r1_bio); + else + raid1_write_request(mddev, bio, r1_bio); +} + static void raid1_status(struct seq_file *seq, struct mddev *mddev) { struct r1conf *conf = mddev->private; From bb5f1ed70bc3bbbce510907da3432dab267ff508 Mon Sep 17 00:00:00 2001 From: Robert LeBlanc Date: Mon, 5 Dec 2016 13:02:58 -0700 Subject: [PATCH 061/297] md/raid10: Refactor raid10_make_request Refactor raid10_make_request into seperate read and write functions to clean up the code. Shaohua: add the recovery check back to read path Signed-off-by: Robert LeBlanc Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 245 +++++++++++++++++++++++++------------------- 1 file changed, 140 insertions(+), 105 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ab5e86209322..1920756828df 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1087,23 +1087,122 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void __make_request(struct mddev *mddev, struct bio *bio) +static void raid10_read_request(struct mddev *mddev, struct bio *bio, + struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; - struct r10bio *r10_bio; struct bio *read_bio; + const int op = bio_op(bio); + const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + int sectors_handled; + int max_sectors; + sector_t sectors; + struct md_rdev *rdev; + int slot; + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + wait_barrier(conf); + + sectors = bio_sectors(bio); + while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + bio->bi_iter.bi_sector < conf->reshape_progress && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { + /* + * IO spans the reshape position. Need to wait for reshape to + * pass + */ + raid10_log(conf->mddev, "wait reshape"); + allow_barrier(conf); + wait_event(conf->wait_barrier, + conf->reshape_progress <= bio->bi_iter.bi_sector || + conf->reshape_progress >= bio->bi_iter.bi_sector + + sectors); + wait_barrier(conf); + } + +read_again: + rdev = read_balance(conf, r10_bio, &max_sectors); + if (!rdev) { + raid_end_bio_io(r10_bio); + return; + } + slot = r10_bio->read_slot; + + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, + max_sectors); + + r10_bio->devs[slot].bio = read_bio; + r10_bio->devs[slot].rdev = rdev; + + read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + + choose_data_offset(r10_bio, rdev); + read_bio->bi_bdev = rdev->bdev; + read_bio->bi_end_io = raid10_end_read_request; + bio_set_op_attrs(read_bio, op, do_sync); + if (test_bit(FailFast, &rdev->flags) && + test_bit(R10BIO_FailFast, &r10_bio->state)) + read_bio->bi_opf |= MD_FAILFAST; + read_bio->bi_private = r10_bio; + + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), + read_bio, disk_devt(mddev->gendisk), + r10_bio->sector); + if (max_sectors < r10_bio->sectors) { + /* + * Could not read all from this device, so we will need another + * r10_bio. + */ + sectors_handled = (r10_bio->sector + max_sectors + - bio->bi_iter.bi_sector); + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + /* + * Cannot call generic_make_request directly as that will be + * queued in __generic_make_request and subsequent + * mempool_alloc might block waiting for it. so hand bio over + * to raid10d. + */ + reschedule_retry(r10_bio); + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio_sectors(bio) - sectors_handled; + r10_bio->state = 0; + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); + return; +} + +static void raid10_write_request(struct mddev *mddev, struct bio *bio, + struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; int i; const int op = bio_op(bio); - const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); const unsigned long do_fua = (bio->bi_opf & REQ_FUA); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid10_plug_cb *plug = NULL; + sector_t sectors; int sectors_handled; int max_sectors; - int sectors; md_write_start(mddev, bio); @@ -1118,8 +1217,9 @@ static void __make_request(struct mddev *mddev, struct bio *bio) while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { - /* IO spans the reshape position. Need to wait for - * reshape to pass + /* + * IO spans the reshape position. Need to wait for reshape to + * pass */ raid10_log(conf->mddev, "wait reshape"); allow_barrier(conf); @@ -1129,8 +1229,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) sectors); wait_barrier(conf); } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - bio_data_dir(bio) == WRITE && (mddev->reshape_backwards ? (bio->bi_iter.bi_sector < conf->reshape_safe && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) @@ -1148,98 +1248,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio) conf->reshape_safe = mddev->reshape_position; } - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = sectors; - - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector; - r10_bio->state = 0; - - /* We might need to issue multiple reads to different - * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. - * If this is 0, there is only one r10_bio and no locking - * will be needed when the request completes. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); - - if (rw == READ) { - /* - * read balancing logic: - */ - struct md_rdev *rdev; - int slot; - -read_again: - rdev = read_balance(conf, r10_bio, &max_sectors); - if (!rdev) { - raid_end_bio_io(r10_bio); - return; - } - slot = r10_bio->read_slot; - - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - - r10_bio->devs[slot].bio = read_bio; - r10_bio->devs[slot].rdev = rdev; - - read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + - choose_data_offset(r10_bio, rdev); - read_bio->bi_bdev = rdev->bdev; - read_bio->bi_end_io = raid10_end_read_request; - bio_set_op_attrs(read_bio, op, do_sync); - if (test_bit(FailFast, &rdev->flags) && - test_bit(R10BIO_FailFast, &r10_bio->state)) - read_bio->bi_opf |= MD_FAILFAST; - read_bio->bi_private = r10_bio; - - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), - read_bio, disk_devt(mddev->gendisk), - r10_bio->sector); - if (max_sectors < r10_bio->sectors) { - /* Could not read all from this device, so we will - * need another r10_bio. - */ - sectors_handled = (r10_bio->sector + max_sectors - - bio->bi_iter.bi_sector); - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - /* Cannot call generic_make_request directly - * as that will be queued in __generic_make_request - * and subsequent mempool_alloc might block - * waiting for it. so hand bio over to raid10d. - */ - reschedule_retry(r10_bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio) - sectors_handled; - r10_bio->state = 0; - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector + - sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); - return; - } - - /* - * WRITE: - */ if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); raid10_log(mddev, "wait queued"); @@ -1300,8 +1308,7 @@ retry_write: int bad_sectors; int is_bad; - is_bad = is_badblock(rdev, dev_sector, - max_sectors, + is_bad = is_badblock(rdev, dev_sector, max_sectors, &first_bad, &bad_sectors); if (is_bad < 0) { /* Mustn't write here until the bad block @@ -1405,8 +1412,7 @@ retry_write: r10_bio->devs[i].bio = mbio; mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, - rdev)); + choose_data_offset(r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; bio_set_op_attrs(mbio, op, do_sync | do_fua); @@ -1457,8 +1463,7 @@ retry_write: r10_bio->devs[i].repl_bio = mbio; mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + - choose_data_offset( - r10_bio, rdev)); + choose_data_offset(r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; bio_set_op_attrs(mbio, op, do_sync | do_fua); @@ -1503,6 +1508,36 @@ retry_write: one_write_done(r10_bio); } +static void __make_request(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio_sectors(bio); + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_iter.bi_sector; + r10_bio->state = 0; + + /* + * We might need to issue multiple reads to different devices if there + * are bad blocks around, so we keep track of the number of reads in + * bio->bi_phys_segments. If this is 0, there is only one r10_bio and + * no locking will be needed when the request completes. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); + + if (bio_data_dir(bio) == READ) + raid10_read_request(mddev, bio, r10_bio); + else + raid10_write_request(mddev, bio, r10_bio); +} + static void raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; From 074859184d770824f4437dca716bdeb625ae8b1c Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 3 Jan 2017 12:42:42 +0100 Subject: [PATCH 062/297] tools lib traceevent: Fix prev/next_prio for deadline tasks Currently, the sched:sched_switch tracepoint reports deadline tasks with priority -1. But when reading the trace via perf script I've got the following output: # ./d & # (d is a deadline task, see [1]) # perf record -e sched:sched_switch -a sleep 1 # perf script ... swapper 0 [000] 2146.962441: sched:sched_switch: swapper/0:0 [120] R ==> d:2593 [4294967295] d 2593 [000] 2146.972472: sched:sched_switch: d:2593 [4294967295] R ==> g:2590 [4294967295] The task d reports the wrong priority [4294967295]. This happens because the "int prio" is stored in an unsigned long long val. Although it is set as a %lld, as int is shorter than unsigned long long, trace_seq_printf prints it as a positive number. The fix is just to cast the val as an int, and print it as a %d, as in the sched:sched_switch tracepoint's "format". The output with the fix is: # ./d & # perf record -e sched:sched_switch -a sleep 1 # perf script ... swapper 0 [000] 4306.374037: sched:sched_switch: swapper/0:0 [120] R ==> d:10941 [-1] d 10941 [000] 4306.383823: sched:sched_switch: d:10941 [-1] R ==> swapper/0:0 [120] [1] d.c --- #include #include #include #include #include struct sched_attr { __u32 size, sched_policy; __u64 sched_flags; __s32 sched_nice; __u32 sched_priority; __u64 sched_runtime, sched_deadline, sched_period; }; int sched_setattr(pid_t pid, const struct sched_attr *attr, unsigned int flags) { return syscall(__NR_sched_setattr, pid, attr, flags); } int main(void) { struct sched_attr attr = { .size = sizeof(attr), .sched_policy = SCHED_DEADLINE, /* This creates a 10ms/30ms reservation */ .sched_runtime = 10 * 1000 * 1000, .sched_period = attr.sched_deadline = 30 * 1000 * 1000, }; if (sched_setattr(0, &attr, 0) < 0) { perror("sched_setattr"); return -1; } for(;;); } --- Committer notes: Got the program from the provided URL, http://bristot.me/lkml/d.c, trimmed it and included in the cset log above, so that we have everything needed to test it in one place. Signed-off-by: Daniel Bristot de Oliveira Acked-by: Steven Rostedt Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Daniel Bristot de Oliveira Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/866ef75bcebf670ae91c6a96daa63597ba981f0d.1483443552.git.bristot@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/plugin_sched_switch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/traceevent/plugin_sched_switch.c b/tools/lib/traceevent/plugin_sched_switch.c index f1ce60065258..ec30c2fcbac0 100644 --- a/tools/lib/traceevent/plugin_sched_switch.c +++ b/tools/lib/traceevent/plugin_sched_switch.c @@ -111,7 +111,7 @@ static int sched_switch_handler(struct trace_seq *s, trace_seq_printf(s, "%lld ", val); if (pevent_get_field_val(s, event, "prev_prio", record, &val, 0) == 0) - trace_seq_printf(s, "[%lld] ", val); + trace_seq_printf(s, "[%d] ", (int) val); if (pevent_get_field_val(s, event, "prev_state", record, &val, 0) == 0) write_state(s, val); @@ -129,7 +129,7 @@ static int sched_switch_handler(struct trace_seq *s, trace_seq_printf(s, "%lld", val); if (pevent_get_field_val(s, event, "next_prio", record, &val, 0) == 0) - trace_seq_printf(s, " [%lld]", val); + trace_seq_printf(s, " [%d]", (int) val); return 0; } From 30a9c6444810429aa2b7cbfbd453ce339baaadbf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Jan 2017 12:03:59 -0300 Subject: [PATCH 063/297] perf tools: Install tools/lib/traceevent plugins with install-bin Those are binaries as well, so should be installed by: make -C tools/perf install-bin' too. Cc: Alexander Shishkin Cc: Daniel Bristot de Oliveira Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-3841b37u05evxrs1igkyu6ks@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index e9ec531131ca..4db68aec9913 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -704,9 +704,9 @@ install-tests: all install-gtk $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \ $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr' -install-bin: install-tools install-tests +install-bin: install-tools install-tests install-traceevent-plugins -install: install-bin try-install-man install-traceevent-plugins +install: install-bin try-install-man install-python_ext: $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)' From 7934c98a6e04028eb34c1293bfb5a6b0ab630b66 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Jan 2017 15:19:21 -0300 Subject: [PATCH 064/297] perf symbols: Robustify reading of build-id from sysfs Markus reported that perf segfaults when reading /sys/kernel/notes from a kernel linked with GNU gold, due to what looks like a gold bug, so do some bounds checking to avoid crashing in that case. Reported-by: Markus Trippelsdorf Report-Link: http://lkml.kernel.org/r/20161219161821.GA294@x4 Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ryhgs6a6jxvz207j2636w31c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 99400b0e8f2a..adbc6c02c3aa 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -537,6 +537,12 @@ int sysfs__read_build_id(const char *filename, void *build_id, size_t size) break; } else { int n = namesz + descsz; + + if (n > (int)sizeof(bf)) { + n = sizeof(bf); + pr_debug("%s: truncating reading of build id in sysfs file %s: n_namesz=%u, n_descsz=%u.\n", + __func__, filename, nhdr.n_namesz, nhdr.n_descsz); + } if (read(fd, bf, n) != n) break; } From 47e3a5edc6538d66e470aaed3b7c57255cb37ca1 Mon Sep 17 00:00:00 2001 From: Paul Donohue Date: Tue, 3 Jan 2017 10:39:28 -0800 Subject: [PATCH 065/297] Input: ALPS - fix TrackStick Y axis handling for SS5 hardware A minus character was lost in commit 23fce365, causing the Y axis to be inverted for SS5 TrackStick events. (Pushing the TrackStick up caused the pointer to move down, and vice versa.) Restore the lost minus. Fixes: 23fce365c6a2 ("Input: ALPS - clean up code for SS5 hardware") Signed-off-by: Paul Donohue Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/alps.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/mouse/alps.h b/drivers/input/mouse/alps.h index cde6f4bd8ea2..6d279aa27cb9 100644 --- a/drivers/input/mouse/alps.h +++ b/drivers/input/mouse/alps.h @@ -114,7 +114,7 @@ enum SS4_PACKET_ID { (_b[1] & 0x7F) \ ) -#define SS4_TS_Y_V2(_b) (s8)( \ +#define SS4_TS_Y_V2(_b) -(s8)( \ ((_b[3] & 0x01) << 7) | \ (_b[2] & 0x7F) \ ) From 01427fe7c4b956b878e55e966690624a3624e991 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Jan 2017 11:51:48 -0800 Subject: [PATCH 066/297] Input: adxl34x - make it enumerable in ACPI environment The ACPI-enabled platform may contain _DSD method to enable this driver using compatible string. Remove OF specifics to re-use existing code on ACPI-enabled platforms. Suggested-by: Mika Westerberg Signed-off-by: Andy Shevchenko Signed-off-by: Dmitry Torokhov --- drivers/input/misc/adxl34x-i2c.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/input/misc/adxl34x-i2c.c b/drivers/input/misc/adxl34x-i2c.c index a8b0a2eec344..7fed92fb8cc1 100644 --- a/drivers/input/misc/adxl34x-i2c.c +++ b/drivers/input/misc/adxl34x-i2c.c @@ -136,7 +136,6 @@ static const struct i2c_device_id adxl34x_id[] = { MODULE_DEVICE_TABLE(i2c, adxl34x_id); -#ifdef CONFIG_OF static const struct of_device_id adxl34x_of_id[] = { /* * The ADXL346 is backward-compatible with the ADXL345. Differences are @@ -153,13 +152,12 @@ static const struct of_device_id adxl34x_of_id[] = { }; MODULE_DEVICE_TABLE(of, adxl34x_of_id); -#endif static struct i2c_driver adxl34x_driver = { .driver = { .name = "adxl34x", .pm = &adxl34x_i2c_pm, - .of_match_table = of_match_ptr(adxl34x_of_id), + .of_match_table = adxl34x_of_id, }, .probe = adxl34x_i2c_probe, .remove = adxl34x_i2c_remove, From f97fd383d9a10fd125bcdafba03240685aed5608 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 19 Dec 2016 15:47:14 +0100 Subject: [PATCH 067/297] drm: tilcdc: simplify the recovery from sync lost error on rev1 Revision 2 of LCDC suffers from an issue where a SYNC_LOST error caused by limited memory bandwidth may leave the picture shifted a couple pixels to the right. This issue has not been observed on revision 1, while the recovery mechanism introduces a different issue, where the END_OF_FRAME interrupt doesn't fire while drm is waiting for vblanks. On rev1: recover from sync lost errors by simply clearing the RASTER_ENABLE bit in the RASTER_CTRL register and re-enabling it again as is suggested by the datasheet. Signed-off-by: Bartosz Golaszewski Reviewed-by: Jyri Sarha Signed-off-by: Jyri Sarha --- drivers/gpu/drm/tilcdc/tilcdc_crtc.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/tilcdc/tilcdc_crtc.c b/drivers/gpu/drm/tilcdc/tilcdc_crtc.c index 9942b0577d6e..20041073e46d 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_crtc.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_crtc.c @@ -856,7 +856,7 @@ irqreturn_t tilcdc_crtc_irq(struct drm_crtc *crtc) struct tilcdc_crtc *tilcdc_crtc = to_tilcdc_crtc(crtc); struct drm_device *dev = crtc->dev; struct tilcdc_drm_private *priv = dev->dev_private; - uint32_t stat; + uint32_t stat, reg; stat = tilcdc_read_irqstatus(dev); tilcdc_clear_irqstatus(dev, stat); @@ -921,17 +921,26 @@ irqreturn_t tilcdc_crtc_irq(struct drm_crtc *crtc) dev_err_ratelimited(dev->dev, "%s(0x%08x): Sync lost", __func__, stat); tilcdc_crtc->frame_intact = false; - if (tilcdc_crtc->sync_lost_count++ > - SYNC_LOST_COUNT_LIMIT) { - dev_err(dev->dev, "%s(0x%08x): Sync lost flood detected, recovering", __func__, stat); - queue_work(system_wq, &tilcdc_crtc->recover_work); - if (priv->rev == 1) + if (priv->rev == 1) { + reg = tilcdc_read(dev, LCDC_RASTER_CTRL_REG); + if (reg & LCDC_RASTER_ENABLE) { tilcdc_clear(dev, LCDC_RASTER_CTRL_REG, - LCDC_V1_SYNC_LOST_INT_ENA); - else + LCDC_RASTER_ENABLE); + tilcdc_set(dev, LCDC_RASTER_CTRL_REG, + LCDC_RASTER_ENABLE); + } + } else { + if (tilcdc_crtc->sync_lost_count++ > + SYNC_LOST_COUNT_LIMIT) { + dev_err(dev->dev, + "%s(0x%08x): Sync lost flood detected, recovering", + __func__, stat); + queue_work(system_wq, + &tilcdc_crtc->recover_work); tilcdc_write(dev, LCDC_INT_ENABLE_CLR_REG, LCDC_SYNC_LOST); - tilcdc_crtc->sync_lost_count = 0; + tilcdc_crtc->sync_lost_count = 0; + } } } From aebe55c2d4b998741c0847ace1b4af47d73c763b Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 3 Jan 2017 01:14:27 +0200 Subject: [PATCH 068/297] drm: Clean up planes in atomic commit helper failure path If waiting for fences fails for blocking commits, planes must be cleaned up before returning. Cc: stable@vger.kernel.org Fixes: f6ce410a59a4 ("drm/fence: allow fence waiting to be interrupted by userspace") Signed-off-by: Laurent Pinchart Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20170102231427.7192-1-laurent.pinchart@ideasonboard.com --- drivers/gpu/drm/drm_atomic_helper.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 583f47f27b36..34f757bcabae 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1259,8 +1259,10 @@ int drm_atomic_helper_commit(struct drm_device *dev, if (!nonblock) { ret = drm_atomic_helper_wait_for_fences(dev, state, true); - if (ret) + if (ret) { + drm_atomic_helper_cleanup_planes(dev, state); return ret; + } } /* From 0c931a290cc0377c99a8cd970a49e736dbb23e0e Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 2 Jan 2017 16:14:15 +0100 Subject: [PATCH 069/297] drm/meson: Fix CVBS initialization when HDMI is configured by bootloader When the HDMI output is configured by the bootloader, there is mismatch is the pipeline configuration and the Vsync interrupt fails to trigger. This commit disables the HDMI blocks in the probe phase. Fixes: bbbe775ec5b5 ("drm: Add support for Amlogic Meson Graphic Controller") Signed-off-by: Neil Armstrong --- drivers/gpu/drm/meson/meson_venc.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/gpu/drm/meson/meson_venc.c b/drivers/gpu/drm/meson/meson_venc.c index d836b2274531..f7c870172220 100644 --- a/drivers/gpu/drm/meson/meson_venc.c +++ b/drivers/gpu/drm/meson/meson_venc.c @@ -38,6 +38,11 @@ * - TV Panel encoding via ENCT */ +/* HHI Registers */ +#define HHI_VDAC_CNTL0 0x2F4 /* 0xbd offset in data sheet */ +#define HHI_VDAC_CNTL1 0x2F8 /* 0xbe offset in data sheet */ +#define HHI_HDMI_PHY_CNTL0 0x3a0 /* 0xe8 offset in data sheet */ + struct meson_cvbs_enci_mode meson_cvbs_enci_pal = { .mode_tag = MESON_VENC_MODE_CVBS_PAL, .hso_begin = 3, @@ -242,6 +247,20 @@ void meson_venc_disable_vsync(struct meson_drm *priv) void meson_venc_init(struct meson_drm *priv) { + /* Disable CVBS VDAC */ + regmap_write(priv->hhi, HHI_VDAC_CNTL0, 0); + regmap_write(priv->hhi, HHI_VDAC_CNTL1, 8); + + /* Power Down Dacs */ + writel_relaxed(0xff, priv->io_base + _REG(VENC_VDAC_SETTING)); + + /* Disable HDMI PHY */ + regmap_write(priv->hhi, HHI_HDMI_PHY_CNTL0, 0); + + /* Disable HDMI */ + writel_bits_relaxed(0x3, 0, + priv->io_base + _REG(VPU_HDMI_SETTING)); + /* Disable all encoders */ writel_relaxed(0, priv->io_base + _REG(ENCI_VIDEO_EN)); writel_relaxed(0, priv->io_base + _REG(ENCP_VIDEO_EN)); From 5db60ea93d4fbf146c8f7ca286b8b2a091761460 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 4 Jan 2017 10:51:02 +0100 Subject: [PATCH 070/297] drm/meson: Fix CVBS VDAC disable This commit fixes the VDAC disabling register write values. Fixes: bbbe775ec5b5 ("drm: Add support for Amlogic Meson Graphic Controller") Signed-off-by: Neil Armstrong --- drivers/gpu/drm/meson/meson_venc_cvbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/meson/meson_venc_cvbs.c b/drivers/gpu/drm/meson/meson_venc_cvbs.c index c809c085fd78..a2bcc70a03ef 100644 --- a/drivers/gpu/drm/meson/meson_venc_cvbs.c +++ b/drivers/gpu/drm/meson/meson_venc_cvbs.c @@ -167,7 +167,7 @@ static void meson_venc_cvbs_encoder_disable(struct drm_encoder *encoder) /* Disable CVBS VDAC */ regmap_write(priv->hhi, HHI_VDAC_CNTL0, 0); - regmap_write(priv->hhi, HHI_VDAC_CNTL1, 0); + regmap_write(priv->hhi, HHI_VDAC_CNTL1, 8); } static void meson_venc_cvbs_encoder_enable(struct drm_encoder *encoder) From eebc509b20881b92d62e317b2c073e57c5f200f0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 4 Jan 2017 12:29:05 +0900 Subject: [PATCH 071/297] perf probe: Fix --funcs to show correct symbols for offline module Fix --funcs (-F) option to show correct symbols for offline module. Since previous perf-probe uses machine__findnew_module_map() for offline module, even if user passes a module file (with full path) which is for other architecture, perf-probe always tries to load symbol map for current kernel module. This fix uses dso__new_map() to load the map from given binary as same as a map for user applications. Signed-off-by: Masami Hiramatsu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/148350053478.19001.15435255244512631545.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 8f810961ec78..542e6472c4d7 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -163,7 +163,7 @@ static struct map *kernel_get_module_map(const char *module) /* A file path -- this is an offline module */ if (module && strchr(module, '/')) - return machine__findnew_module_map(host_machine, 0, module); + return dso__new_map(module); if (!module) module = "kernel"; @@ -173,6 +173,7 @@ static struct map *kernel_get_module_map(const char *module) if (strncmp(pos->dso->short_name + 1, module, pos->dso->short_name_len - 2) == 0 && module[pos->dso->short_name_len - 2] == '\0') { + map__get(pos); return pos; } } @@ -188,15 +189,6 @@ struct map *get_target_map(const char *target, bool user) return kernel_get_module_map(target); } -static void put_target_map(struct map *map, bool user) -{ - if (map && user) { - /* Only the user map needs to be released */ - map__put(map); - } -} - - static int convert_exec_to_group(const char *exec, char **result) { char *ptr1, *ptr2, *exec_copy; @@ -412,7 +404,7 @@ static int find_alternative_probe_point(struct debuginfo *dinfo, } out: - put_target_map(map, uprobes); + map__put(map); return ret; } @@ -2869,7 +2861,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, } out: - put_target_map(map, pev->uprobes); + map__put(map); free(syms); return ret; @@ -3362,10 +3354,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter, return ret; /* Get a symbol map */ - if (user) - map = dso__new_map(target); - else - map = kernel_get_module_map(target); + map = get_target_map(target, user); if (!map) { pr_err("Failed to get a map for %s\n", (target) ? : "kernel"); return -EINVAL; @@ -3397,9 +3386,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter, } end: - if (user) { - map__put(map); - } + map__put(map); exit_probe_symbol_maps(); return ret; From 8a937a25a7e3c19d5fb3f9d92f605cf5fda219d8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 4 Jan 2017 12:30:19 +0900 Subject: [PATCH 072/297] perf probe: Fix to probe on gcc generated symbols for offline kernel Fix perf-probe to show probe definition on gcc generated symbols for offline kernel (including cross-arch kernel image). gcc sometimes optimizes functions and generate new symbols with suffixes such as ".constprop.N" or ".isra.N" etc. Since those symbol names are not recorded in DWARF, we have to find correct generated symbols from offline ELF binary to probe on it (kallsyms doesn't correct it). For online kernel or uprobes we don't need it because those are rebased on _text, or a section relative address. E.g. Without this: $ perf probe -k build-arm/vmlinux -F __slab_alloc* __slab_alloc.constprop.9 $ perf probe -k build-arm/vmlinux -D __slab_alloc p:probe/__slab_alloc __slab_alloc+0 If you put above definition on target machine, it should fail because there is no __slab_alloc in kallsyms. With this fix, perf probe shows correct probe definition on __slab_alloc.constprop.9: $ perf probe -k build-arm/vmlinux -D __slab_alloc p:probe/__slab_alloc __slab_alloc.constprop.9+0 Signed-off-by: Masami Hiramatsu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/148350060434.19001.11864836288580083501.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 48 ++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 542e6472c4d7..4a57c8a60bd9 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -610,6 +610,51 @@ error: return ret ? : -ENOENT; } +/* + * Rename DWARF symbols to ELF symbols -- gcc sometimes optimizes functions + * and generate new symbols with suffixes such as .constprop.N or .isra.N + * etc. Since those symbols are not recorded in DWARF, we have to find + * correct generated symbols from offline ELF binary. + * For online kernel or uprobes we don't need this because those are + * rebased on _text, or already a section relative address. + */ +static int +post_process_offline_probe_trace_events(struct probe_trace_event *tevs, + int ntevs, const char *pathname) +{ + struct symbol *sym; + struct map *map; + unsigned long stext = 0; + u64 addr; + int i; + + /* Prepare a map for offline binary */ + map = dso__new_map(pathname); + if (!map || get_text_start_address(pathname, &stext) < 0) { + pr_warning("Failed to get ELF symbols for %s\n", pathname); + return -EINVAL; + } + + for (i = 0; i < ntevs; i++) { + addr = tevs[i].point.address + tevs[i].point.offset - stext; + sym = map__find_symbol(map, addr); + if (!sym) + continue; + if (!strcmp(sym->name, tevs[i].point.symbol)) + continue; + /* If we have no realname, use symbol for it */ + if (!tevs[i].point.realname) + tevs[i].point.realname = tevs[i].point.symbol; + else + free(tevs[i].point.symbol); + tevs[i].point.symbol = strdup(sym->name); + tevs[i].point.offset = addr - sym->start; + } + map__put(map); + + return 0; +} + static int add_exec_to_probe_trace_events(struct probe_trace_event *tevs, int ntevs, const char *exec) { @@ -671,7 +716,8 @@ post_process_kernel_probe_trace_events(struct probe_trace_event *tevs, /* Skip post process if the target is an offline kernel */ if (symbol_conf.ignore_vmlinux_buildid) - return 0; + return post_process_offline_probe_trace_events(tevs, ntevs, + symbol_conf.vmlinux_name); reloc_sym = kernel_get_ref_reloc_sym(); if (!reloc_sym) { From 4ee437fbf626b5ad756889d8bc0fcead3d66dde7 Mon Sep 17 00:00:00 2001 From: Caleb Crome Date: Tue, 3 Jan 2017 10:22:57 -0800 Subject: [PATCH 073/297] ASoC: fsl_ssi: set fifo watermark to more reliable value The fsl_ssi fifo watermark is by default set to 2 free spaces (i.e. activate DMA on FIFO when only 2 spaces are left.) This means the DMA must service the fifo within 2 audio samples, which is just not enough time for many use cases with high data rate. In many configurations the audio channel slips (causing l/r swap in stereo configurations, or channel slipping in multi-channel configurations). This patch gives more breathing room and allows the SSI to operate reliably by changing the fifio refill watermark to 8. There is no change in behavior for older chips (with an 8-deep fifo). Only the newer chips with a 15-deep fifo get the new behavior. I suspect a new fifo depth setting could be optimized on the older chips too, but I have not tested. Signed-off-by: Caleb Crome Reviewed-by: Fabio Estevam Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_ssi.c | 74 +++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 21 deletions(-) diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c index 50349437d961..fde08660b63b 100644 --- a/sound/soc/fsl/fsl_ssi.c +++ b/sound/soc/fsl/fsl_ssi.c @@ -224,6 +224,12 @@ struct fsl_ssi_soc_data { * @dbg_stats: Debugging statistics * * @soc: SoC specific data + * + * @fifo_watermark: the FIFO watermark setting. Notifies DMA when + * there are @fifo_watermark or fewer words in TX fifo or + * @fifo_watermark or more empty words in RX fifo. + * @dma_maxburst: max number of words to transfer in one go. So far, + * this is always the same as fifo_watermark. */ struct fsl_ssi_private { struct regmap *regs; @@ -263,6 +269,9 @@ struct fsl_ssi_private { const struct fsl_ssi_soc_data *soc; struct device *dev; + + u32 fifo_watermark; + u32 dma_maxburst; }; /* @@ -1051,21 +1060,7 @@ static int _fsl_ssi_set_dai_fmt(struct device *dev, regmap_write(regs, CCSR_SSI_SRCR, srcr); regmap_write(regs, CCSR_SSI_SCR, scr); - /* - * Set the watermark for transmit FIFI 0 and receive FIFO 0. We don't - * use FIFO 1. We program the transmit water to signal a DMA transfer - * if there are only two (or fewer) elements left in the FIFO. Two - * elements equals one frame (left channel, right channel). This value, - * however, depends on the depth of the transmit buffer. - * - * We set the watermark on the same level as the DMA burstsize. For - * fiq it is probably better to use the biggest possible watermark - * size. - */ - if (ssi_private->use_dma) - wm = ssi_private->fifo_depth - 2; - else - wm = ssi_private->fifo_depth; + wm = ssi_private->fifo_watermark; regmap_write(regs, CCSR_SSI_SFCSR, CCSR_SSI_SFCSR_TFWM0(wm) | CCSR_SSI_SFCSR_RFWM0(wm) | @@ -1373,12 +1368,8 @@ static int fsl_ssi_imx_probe(struct platform_device *pdev, dev_dbg(&pdev->dev, "could not get baud clock: %ld\n", PTR_ERR(ssi_private->baudclk)); - /* - * We have burstsize be "fifo_depth - 2" to match the SSI - * watermark setting in fsl_ssi_startup(). - */ - ssi_private->dma_params_tx.maxburst = ssi_private->fifo_depth - 2; - ssi_private->dma_params_rx.maxburst = ssi_private->fifo_depth - 2; + ssi_private->dma_params_tx.maxburst = ssi_private->dma_maxburst; + ssi_private->dma_params_rx.maxburst = ssi_private->dma_maxburst; ssi_private->dma_params_tx.addr = ssi_private->ssi_phys + CCSR_SSI_STX0; ssi_private->dma_params_rx.addr = ssi_private->ssi_phys + CCSR_SSI_SRX0; @@ -1543,6 +1534,47 @@ static int fsl_ssi_probe(struct platform_device *pdev) /* Older 8610 DTs didn't have the fifo-depth property */ ssi_private->fifo_depth = 8; + /* + * Set the watermark for transmit FIFO 0 and receive FIFO 0. We don't + * use FIFO 1 but set the watermark appropriately nontheless. + * We program the transmit water to signal a DMA transfer + * if there are N elements left in the FIFO. For chips with 15-deep + * FIFOs, set watermark to 8. This allows the SSI to operate at a + * high data rate without channel slipping. Behavior is unchanged + * for the older chips with a fifo depth of only 8. A value of 4 + * might be appropriate for the older chips, but is left at + * fifo_depth-2 until sombody has a chance to test. + * + * We set the watermark on the same level as the DMA burstsize. For + * fiq it is probably better to use the biggest possible watermark + * size. + */ + switch (ssi_private->fifo_depth) { + case 15: + /* + * 2 samples is not enough when running at high data + * rates (like 48kHz @ 16 bits/channel, 16 channels) + * 8 seems to split things evenly and leave enough time + * for the DMA to fill the FIFO before it's over/under + * run. + */ + ssi_private->fifo_watermark = 8; + ssi_private->dma_maxburst = 8; + break; + case 8: + default: + /* + * maintain old behavior for older chips. + * Keeping it the same because I don't have an older + * board to test with. + * I suspect this could be changed to be something to + * leave some more space in the fifo. + */ + ssi_private->fifo_watermark = ssi_private->fifo_depth - 2; + ssi_private->dma_maxburst = ssi_private->fifo_depth - 2; + break; + } + dev_set_drvdata(&pdev->dev, ssi_private); if (ssi_private->soc->imx) { From dd853fd216d1485ed3045ff772079cc8689a9a4a Mon Sep 17 00:00:00 2001 From: Lukasz Odzioba Date: Wed, 28 Dec 2016 14:55:40 +0100 Subject: [PATCH 074/297] x86/cpu: Fix bootup crashes by sanitizing the argument of the 'clearcpuid=' command-line option A negative number can be specified in the cmdline which will be used as setup_clear_cpu_cap() argument. With that we can clear/set some bit in memory predceeding boot_cpu_data/cpu_caps_cleared which may cause kernel to misbehave. This patch adds lower bound check to setup_disablecpuid(). Boris Petkov reproduced a crash: [ 1.234575] BUG: unable to handle kernel paging request at ffffffff858bd540 [ 1.236535] IP: memcpy_erms+0x6/0x10 Signed-off-by: Lukasz Odzioba Acked-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andi.kleen@intel.com Cc: bp@alien8.de Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: slaoub@gmail.com Fixes: ac72e7888a61 ("x86: add generic clearcpuid=... option") Link: http://lkml.kernel.org/r/1482933340-11857-1-git-send-email-lukasz.odzioba@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dc1697ca5191..9bab7a8a4293 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1221,7 +1221,7 @@ static __init int setup_disablecpuid(char *arg) { int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) + if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) setup_clear_cpu_cap(bit); else return 0; From 754c73cf4d2463022b2c9ae208026bf22564ed06 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 2 Jan 2017 11:22:29 +0200 Subject: [PATCH 075/297] x86/cpu: Fix typo in the comment for Anniedale The proper spelling of Anniedale SoC with 'e' in the middle. Fix typo in the comment line in intel-family.h header. Signed-off-by: Andy Shevchenko Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170102092229.87036-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel-family.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 34a46dc076d3..8167fdb67ae8 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -57,7 +57,7 @@ #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ From 159d3726db12b3476bc59ea0ab0a702103d466b5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 2 Jan 2017 11:24:50 +0200 Subject: [PATCH 076/297] x86/platform/intel-mid: Rename 'spidev' to 'mrfld_spidev' The current implementation supports only Intel Merrifield platforms. Don't mess with the rest of the Intel MID family by not registering device with wrong properties. Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170102092450.87229-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/device_libs/Makefile | 2 +- .../{platform_spidev.c => platform_mrfld_spidev.c} | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) rename arch/x86/platform/intel-mid/device_libs/{platform_spidev.c => platform_mrfld_spidev.c} (91%) diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index 61b5ed2b7d40..90e4f2a6625b 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -15,7 +15,7 @@ obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o # SPI Devices -obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_spidev.o +obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o # I2C Devices obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c similarity index 91% rename from arch/x86/platform/intel-mid/device_libs/platform_spidev.c rename to arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c index 30c601b399ee..27186ad654c9 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_spidev.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c @@ -11,6 +11,7 @@ * of the License. */ +#include #include #include #include @@ -34,6 +35,9 @@ static void __init *spidev_platform_data(void *info) { struct spi_board_info *spi_info = info; + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return ERR_PTR(-ENODEV); + spi_info->mode = SPI_MODE_0; spi_info->controller_data = &spidev_spi_chip; From 74545f63890e38520eb4d1dbedcadaa9c0dbc824 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Thu, 22 Dec 2016 17:17:40 -0800 Subject: [PATCH 077/297] perf/x86: Set pmu->module in Intel PMU modules The conversion of Intel PMU drivers into modules did not include reference counting. The machine will crash when attempting to access deleted code if an event from a module PMU is started and the module removed before the event is destroyed. i.e. this crashes the machine: $ insmod intel-rapl-perf.ko $ perf stat -e power/energy-cores/ -C 0 & $ rmmod intel-rapl-perf.ko Set THIS_MODULE to pmu->module in Intel module PMUs so that generic code can handle reference counting and deny rmmod while an event still exists. Signed-off-by: David Carrillo-Cisneros Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dave Hansen Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1482455860-116269-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cstate.c | 2 ++ arch/x86/events/intel/rapl.c | 1 + arch/x86/events/intel/uncore.c | 1 + 3 files changed, 4 insertions(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index fec8a461bdef..1076c9a77292 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu cstate_pkg_pmu = { @@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static const struct cstate_model nhm_cstates __initconst = { diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index bd34124449b0..17c3564d087a 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -697,6 +697,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; return 0; } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 97c246f84dea..8c4ccdc3a3f3 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -733,6 +733,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .start = uncore_pmu_event_start, .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, + .module = THIS_MODULE, }; } else { pmu->pmu = *pmu->type->pmu; From 753aacfd2e95df6a0caf23c03dc309020765bea9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Jan 2017 10:57:14 +0100 Subject: [PATCH 078/297] nl80211: fix sched scan netlink socket owner destruction A single netlink socket might own multiple interfaces *and* a scheduled scan request (which might belong to another interface), so when it goes away both may need to be destroyed. Remove the schedule_scan_stop indirection to fix this - it's only needed for interface destruction because of the way this works right now, with a single work taking care of all interfaces. Cc: stable@vger.kernel.org Fixes: 93a1e86ce10e4 ("nl80211: Stop scheduled scan if netlink client disappears") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3df85a751a85..ef5eff93a8b8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14502,13 +14502,17 @@ static int nl80211_netlink_notify(struct notifier_block * nb, list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { bool schedule_destroy_work = false; - bool schedule_scan_stop = false; struct cfg80211_sched_scan_request *sched_scan_req = rcu_dereference(rdev->sched_scan_req); if (sched_scan_req && notify->portid && - sched_scan_req->owner_nlportid == notify->portid) - schedule_scan_stop = true; + sched_scan_req->owner_nlportid == notify->portid) { + sched_scan_req->owner_nlportid = 0; + + if (rdev->ops->sched_scan_stop && + rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + schedule_work(&rdev->sched_scan_stop_wk); + } list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) { cfg80211_mlme_unregister_socket(wdev, notify->portid); @@ -14539,12 +14543,6 @@ static int nl80211_netlink_notify(struct notifier_block * nb, spin_unlock(&rdev->destroy_list_lock); schedule_work(&rdev->destroy_work); } - } else if (schedule_scan_stop) { - sched_scan_req->owner_nlportid = 0; - - if (rdev->ops->sched_scan_stop && - rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) - schedule_work(&rdev->sched_scan_stop_wk); } } From 60448b077ed93d227e6c117a9e87db76ff0c1911 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 4 Jan 2017 15:44:52 -0600 Subject: [PATCH 079/297] ASoC: Intel: bytcr-rt5640: fix settings in internal clock mode Frequency value of zero did not make sense, use same 24.576MHz setting and only change the clock source in idle mode Suggested-by: Bard Liao Signed-off-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- sound/soc/intel/boards/bytcr_rt5640.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index e33e4777a65c..8d2fb2d6f532 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -142,7 +142,7 @@ static int platform_clock_control(struct snd_soc_dapm_widget *w, * for Jack detection and button press */ ret = snd_soc_dai_set_sysclk(codec_dai, RT5640_SCLK_S_RCCLK, - 0, + 48000 * 512, SND_SOC_CLOCK_IN); if (!ret) { if ((byt_rt5640_quirk & BYT_RT5640_MCLK_EN) && priv->mclk) From 08f9572671c8047e7234cbf150869aa3c3d59a97 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 5 Jan 2017 14:25:59 +0100 Subject: [PATCH 080/297] HID: ignore Petzl USB headlamp This headlamp contains a dummy HID descriptor which pretends to be a mouse-like device, but can't be used as a mouse at all. Reported-by: Lukas Ocilka Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 1 + drivers/hid/hid-ids.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index cff060b56da9..ea36b557d5ee 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2496,6 +2496,7 @@ static const struct hid_device_id hid_ignore_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_PANJIT, 0x0002) }, { HID_USB_DEVICE(USB_VENDOR_ID_PANJIT, 0x0003) }, { HID_USB_DEVICE(USB_VENDOR_ID_PANJIT, 0x0004) }, + { HID_USB_DEVICE(USB_VENDOR_ID_PETZL, USB_DEVICE_ID_PETZL_HEADLAMP) }, { HID_USB_DEVICE(USB_VENDOR_ID_PHILIPS, USB_DEVICE_ID_PHILIPS_IEEE802154_DONGLE) }, { HID_USB_DEVICE(USB_VENDOR_ID_POWERCOM, USB_DEVICE_ID_POWERCOM_UPS) }, #if IS_ENABLED(CONFIG_MOUSE_SYNAPTICS_USB) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 54bd22dc1411..f46f2c5117fa 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -816,6 +816,9 @@ #define USB_VENDOR_ID_PETALYNX 0x18b1 #define USB_DEVICE_ID_PETALYNX_MAXTER_REMOTE 0x0037 +#define USB_VENDOR_ID_PETZL 0x2122 +#define USB_DEVICE_ID_PETZL_HEADLAMP 0x1234 + #define USB_VENDOR_ID_PHILIPS 0x0471 #define USB_DEVICE_ID_PHILIPS_IEEE802154_DONGLE 0x0617 From 9b60047a9c950e3fde186466774ffd1ab1104d4e Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 5 Jan 2017 02:54:27 -0500 Subject: [PATCH 081/297] r8169: fix the typo in the comment >From the realtek data sheet, the PID0 should be bit 0. Signed-off-by: Zhu Yanjun Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 44389c90056a..8f1623bf2134 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -696,7 +696,7 @@ enum rtl_tx_desc_bit_1 { enum rtl_rx_desc_bit { /* Rx private */ PID1 = (1 << 18), /* Protocol ID bit 1/2 */ - PID0 = (1 << 17), /* Protocol ID bit 2/2 */ + PID0 = (1 << 17), /* Protocol ID bit 0/2 */ #define RxProtoUDP (PID1) #define RxProtoTCP (PID0) From 28ca833ecf89c585a9543fb21aef6b2bdbbaa48a Mon Sep 17 00:00:00 2001 From: JackieLiu Date: Tue, 13 Dec 2016 13:55:27 +0800 Subject: [PATCH 082/297] md/raid5-cache: removes unnecessary write-through mode judgments The write-through mode has been returned in front of the function, do not need to do it again. Signed-off-by: JackieLiu Reviewed-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index d7bfb6fc8aef..49aea4231084 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2418,9 +2418,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (do_wakeup) wake_up(&conf->wait_for_overlap); - if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) - return; - spin_lock_irq(&conf->log->stripe_in_journal_lock); list_del_init(&sh->r5c); spin_unlock_irq(&conf->log->stripe_in_journal_lock); From 3c66abbaaf69671dfd3eb9fa7740b5d7ec688231 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 14 Dec 2016 15:38:01 -0800 Subject: [PATCH 083/297] md/r5cache: simplify handling of sh->log_start in recovery We only need to update sh->log_start at the end of recovery, which is r5c_recovery_rewrite_data_only_stripes(), so it is not necessary to set it before that. In this patch, log_start is removed from r5c_recovery_alloc_stripe(). After updating all sh->log_start, rewrite_data_only_stripes() also updates log->next_checkpoints to the last sh->log_start. Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 49aea4231084..b178a8fef266 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1682,8 +1682,7 @@ out: static struct stripe_head * r5c_recovery_alloc_stripe(struct r5conf *conf, - sector_t stripe_sect, - sector_t log_start) + sector_t stripe_sect) { struct stripe_head *sh; @@ -1692,7 +1691,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf, return NULL; /* no more stripe available */ r5l_recovery_reset_stripe(sh); - sh->log_start = log_start; return sh; } @@ -1862,7 +1860,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, stripe_sect); if (!sh) { - sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos); + sh = r5c_recovery_alloc_stripe(conf, stripe_sect); /* * cannot get stripe from raid5_get_active_stripe * try replay some stripes @@ -1871,7 +1869,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, r5c_recovery_replay_stripes( cached_stripe_list, ctx); sh = r5c_recovery_alloc_stripe( - conf, stripe_sect, ctx->pos); + conf, stripe_sect); } if (!sh) { pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", @@ -1879,8 +1877,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, conf->min_nr_stripes * 2); raid5_set_cache_size(mddev, conf->min_nr_stripes * 2); - sh = r5c_recovery_alloc_stripe( - conf, stripe_sect, ctx->pos); + sh = r5c_recovery_alloc_stripe(conf, + stripe_sect); } if (!sh) { pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", @@ -1894,7 +1892,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { r5l_recovery_replay_one_stripe(conf, sh, ctx); - sh->log_start = ctx->pos; list_move_tail(&sh->lru, cached_stripe_list); } r5l_recovery_load_data(log, sh, ctx, payload, @@ -1933,8 +1930,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log, set_bit(R5_UPTODATE, &dev->flags); } } - list_add_tail(&sh->r5c, &log->stripe_in_journal_list); - atomic_inc(&log->stripe_in_journal_count); } /* @@ -2070,6 +2065,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, struct stripe_head *sh, *next; struct mddev *mddev = log->rdev->mddev; struct page *page; + sector_t next_checkpoint = MaxSector; page = alloc_page(GFP_KERNEL); if (!page) { @@ -2078,6 +2074,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, return -ENOMEM; } + WARN_ON(list_empty(&ctx->cached_list)); + list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { struct r5l_meta_block *mb; int i; @@ -2123,12 +2121,15 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_WRITE, REQ_FUA, false); sh->log_start = ctx->pos; + list_add_tail(&sh->r5c, &log->stripe_in_journal_list); + atomic_inc(&log->stripe_in_journal_count); ctx->pos = write_pos; ctx->seq += 1; - + next_checkpoint = sh->log_start; list_del_init(&sh->lru); raid5_release_stripe(sh); } + log->next_checkpoint = next_checkpoint; __free_page(page); return 0; } @@ -2139,7 +2140,6 @@ static int r5l_recovery_log(struct r5l_log *log) struct r5l_recovery_ctx ctx; int ret; sector_t pos; - struct stripe_head *sh; ctx.pos = log->last_checkpoint; ctx.seq = log->last_cp_seq; @@ -2164,9 +2164,6 @@ static int r5l_recovery_log(struct r5l_log *log) log->next_checkpoint = ctx.pos; r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); - } else { - sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru); - log->next_checkpoint = sh->log_start; } if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) From d2250f105f18a43fdab17421bd80b0ffc9fcc53f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 14 Dec 2016 15:38:02 -0800 Subject: [PATCH 084/297] md/r5cache: assign conf->log before r5l_load_log() r5l_load_log() calls functions that requires a proper conf->log, for example, r5c_is_writeback(). Therefore, we should set conf->log before calling r5l_load_log(). If r5l_load_log() fails, conf->log is set back to NULL. Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b178a8fef266..bff1b4a949e8 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2633,14 +2633,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->stripe_in_journal_lock); atomic_set(&log->stripe_in_journal_count, 0); + rcu_assign_pointer(conf->log, log); + if (r5l_load_log(log)) goto error; - rcu_assign_pointer(conf->log, log); set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; error: + rcu_assign_pointer(conf->log, NULL); md_unregister_thread(&log->reclaim_thread); reclaim_thread: mempool_destroy(log->meta_pool); From 99f17890f04cff0262de7393c60a2f6d9c9c7e71 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 23 Dec 2016 00:52:30 +0000 Subject: [PATCH 085/297] md/r5cache: fix spelling mistake on "recoverying" Trivial fix to spelling mistake "recoverying" to "recovering" in pr_dbg message. Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index bff1b4a949e8..0e8ed2c327b0 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2170,7 +2170,7 @@ static int r5l_recovery_log(struct r5l_log *log) pr_debug("md/raid:%s: starting from clean shutdown\n", mdname(mddev)); else { - pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n", + pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", mdname(mddev), ctx.data_only_stripes, ctx.data_parity_stripes); From 394ed8e4743b0cfc5496fe49059fbfc2bc8eae35 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 4 Jan 2017 16:10:19 -0800 Subject: [PATCH 086/297] md: cleanup mddev flag clear for takeover Commit 6995f0b (md: takeover should clear unrelated bits) clear unrelated bits, but it's quite fragile. To avoid error in the future, define a macro for unsupported mddev flags for each raid type and use it to clear unsupported mddev flags. This should be less error-prone. Suggested-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.h | 8 ++++++++ drivers/md/raid0.c | 12 ++++++++---- drivers/md/raid1.c | 8 ++++++-- drivers/md/raid5.c | 5 ++++- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index e38936d05df1..2a514036a83d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -212,6 +212,7 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); struct md_cluster_info; +/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ enum mddev_flags { MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */ MD_CLOSING, /* If set, we are closing the array, do not open @@ -702,4 +703,11 @@ static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; } + +/* clear unsupported mddev_flags */ +static inline void mddev_clear_unsupported_flags(struct mddev *mddev, + unsigned long unsupported_flags) +{ + mddev->flags &= ~unsupported_flags; +} #endif /* _MD_MD_H */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index a162fedeb51a..848365d474f3 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -26,6 +26,11 @@ #include "raid0.h" #include "raid5.h" +#define UNSUPPORTED_MDDEV_FLAGS \ + ((1L << MD_HAS_JOURNAL) | \ + (1L << MD_JOURNAL_CLEAN) | \ + (1L << MD_FAILFAST_SUPPORTED)) + static int raid0_congested(struct mddev *mddev, int bits) { struct r0conf *conf = mddev->private; @@ -539,8 +544,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) mddev->delta_disks = -1; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; - clear_bit(MD_HAS_JOURNAL, &mddev->flags); - clear_bit(MD_JOURNAL_CLEAN, &mddev->flags); + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -583,7 +587,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) mddev->degraded = 0; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; - clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); return priv_conf; @@ -626,7 +630,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) mddev->raid_disks = 1; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; - clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); return priv_conf; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 14422407e520..7b0f647bcccb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -42,6 +42,10 @@ #include "raid1.h" #include "bitmap.h" +#define UNSUPPORTED_MDDEV_FLAGS \ + ((1L << MD_HAS_JOURNAL) | \ + (1L << MD_JOURNAL_CLEAN)) + /* * Number of guaranteed r1bios in case of extreme VM load: */ @@ -3257,8 +3261,8 @@ static void *raid1_takeover(struct mddev *mddev) if (!IS_ERR(conf)) { /* Array must appear to be quiesced */ conf->array_frozen = 1; - clear_bit(MD_HAS_JOURNAL, &mddev->flags); - clear_bit(MD_JOURNAL_CLEAN, &mddev->flags); + mddev_clear_unsupported_flags(mddev, + UNSUPPORTED_MDDEV_FLAGS); } return conf; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 06d7279bdd04..7b1da6e95a56 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -62,6 +62,8 @@ #include "raid0.h" #include "bitmap.h" +#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) + #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE @@ -7830,7 +7832,8 @@ static void *raid5_takeover_raid1(struct mddev *mddev) ret = setup_conf(mddev); if (!IS_ERR_VALUE(ret)) - clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); + mddev_clear_unsupported_flags(mddev, + UNSUPPORTED_MDDEV_FLAGS); return ret; } From a2b1e8a20c992b01eeb76de00d4f534cbe9f3822 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 14 Dec 2016 11:59:34 +0100 Subject: [PATCH 087/297] selftests: do not require bash for the generated test Nothing in this minimal script seems to require bash. We often run these tests on embedded devices where the only shell available is the busybox ash. Use sh instead. Signed-off-by: Rolf Eike Beer Cc: stable@vger.kernel.org Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 71b05891a6a1..831022b12848 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -90,7 +90,7 @@ ifdef INSTALL_PATH done; @# Ask all targets to emit their test scripts - echo "#!/bin/bash" > $(ALL_SCRIPT) + echo "#!/bin/sh" > $(ALL_SCRIPT) echo "cd \$$(dirname \$$0)" >> $(ALL_SCRIPT) echo "ROOT=\$$PWD" >> $(ALL_SCRIPT) From d979e13a3fa9067c8cd46e292ed859626d443996 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 14 Dec 2016 11:58:20 +0100 Subject: [PATCH 088/297] selftests: do not require bash to run bpf tests Nothing in this minimal script seems to require bash. We often run these tests on embedded devices where the only shell available is the busybox ash. Use sh instead. Signed-off-by: Rolf Eike Beer Cc: stable@vger.kernel.org Acked-by: Daniel Borkmann Signed-off-by: Shuah Khan --- tools/testing/selftests/bpf/test_kmod.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh index 92e627adf354..6d58cca8e235 100755 --- a/tools/testing/selftests/bpf/test_kmod.sh +++ b/tools/testing/selftests/bpf/test_kmod.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh SRC_TREE=../../../../ From 3659f98b5375d195f1870c3e508fe51e52206839 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 14 Dec 2016 11:59:57 +0100 Subject: [PATCH 089/297] selftests: do not require bash to run netsocktests testcase Nothing in this minimal script seems to require bash. We often run these tests on embedded devices where the only shell available is the busybox ash. Use sh instead. Signed-off-by: Rolf Eike Beer Cc: stable@vger.kernel.org Signed-off-by: Shuah Khan --- tools/testing/selftests/net/run_netsocktests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/run_netsocktests b/tools/testing/selftests/net/run_netsocktests index c09a682df56a..16058bbea7a8 100755 --- a/tools/testing/selftests/net/run_netsocktests +++ b/tools/testing/selftests/net/run_netsocktests @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh echo "--------------------" echo "running socket test" From 7738789fba09108a28a5fb4739595d9a0a2f85fe Mon Sep 17 00:00:00 2001 From: Colin King Date: Tue, 27 Dec 2016 16:17:21 +0000 Subject: [PATCH 090/297] selftests: x86/pkeys: fix spelling mistake: "itertation" -> "iteration" Fix spelling mistake in print test pass message. Signed-off-by: Colin Ian King Signed-off-by: Shuah Khan --- tools/testing/selftests/x86/protection_keys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c index bdd58c78902e..df9e0a0cdf29 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/x86/protection_keys.c @@ -1367,7 +1367,7 @@ void run_tests_once(void) tracing_off(); close_test_fds(); - printf("test %2d PASSED (itertation %d)\n", test_nr, iteration_nr); + printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); dprintf1("======================\n\n"); } iteration_nr++; From 1c3415a06b1016a596bfe59e0cfee56c773aa958 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 5 Jan 2017 14:14:54 -0800 Subject: [PATCH 091/297] Input: elants_i2c - avoid divide by 0 errors on bad touchscreen data The following crash may be seen if bad data is received from the touchscreen. [ 2189.425150] elants_i2c i2c-ELAN0001:00: unknown packet ff ff ff ff [ 2189.430738] divide error: 0000 [#1] PREEMPT SMP [ 2189.434679] gsmi: Log Shutdown Reason 0x03 [ 2189.434689] Modules linked in: ip6t_REJECT nf_reject_ipv6 rfcomm evdi uinput uvcvideo cmac videobuf2_vmalloc videobuf2_memops snd_hda_codec_hdmi i2c_dev videobuf2_core snd_soc_sst_cht_bsw_rt5645 snd_hda_intel snd_intel_sst_acpi btusb btrtl btbcm btintel bluetooth snd_soc_sst_acpi snd_hda_codec snd_intel_sst_core snd_hwdep snd_soc_sst_mfld_platform snd_hda_core snd_soc_rt5645 memconsole_x86_legacy memconsole zram snd_soc_rl6231 fuse ip6table_filter iwlmvm iwlwifi iwl7000_mac80211 cfg80211 iio_trig_sysfs joydev cros_ec_sensors cros_ec_sensors_core industrialio_triggered_buffer kfifo_buf industrialio snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device ppp_async ppp_generic slhc tun [ 2189.434866] CPU: 0 PID: 106 Comm: irq/184-ELAN000 Tainted: G W 3.18.0-13101-g57e8190 #1 [ 2189.434883] Hardware name: GOOGLE Ultima, BIOS Google_Ultima.7287.131.43 07/20/2016 [ 2189.434898] task: ffff88017a0b6d80 ti: ffff88017a2bc000 task.ti: ffff88017a2bc000 [ 2189.434913] RIP: 0010:[] [] elants_i2c_irq+0x190/0x200 [ 2189.434937] RSP: 0018:ffff88017a2bfd98 EFLAGS: 00010293 [ 2189.434948] RAX: 0000000000000000 RBX: ffff88017a967828 RCX: ffff88017a9678e8 [ 2189.434962] RDX: 0000000000000000 RSI: 0000000000000246 RDI: 0000000000000000 [ 2189.434975] RBP: ffff88017a2bfdd8 R08: 00000000000003e8 R09: 0000000000000000 [ 2189.434989] R10: 0000000000000000 R11: 000000000044a2bd R12: ffff88017a991800 [ 2189.435001] R13: ffffffffbe8a2a53 R14: ffff88017a0b6d80 R15: ffff88017a0b6d80 [ 2189.435011] FS: 0000000000000000(0000) GS:ffff88017fc00000(0000) knlGS:0000000000000000 [ 2189.435022] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 2189.435030] CR2: 00007f678d94b000 CR3: 000000003f41a000 CR4: 00000000001007f0 [ 2189.435039] Stack: [ 2189.435044] ffff88017a2bfda8 ffff88017a9678e8 646464647a2bfdd8 0000000006e09574 [ 2189.435060] 0000000000000000 ffff88017a088b80 ffff88017a921000 ffffffffbe8a2a53 [ 2189.435074] ffff88017a2bfe08 ffffffffbe8a2a73 ffff88017a0b6d80 0000000006e09574 [ 2189.435089] Call Trace: [ 2189.435101] [] ? irq_thread_dtor+0xa9/0xa9 [ 2189.435112] [] irq_thread_fn+0x20/0x40 [ 2189.435123] [] irq_thread+0x14e/0x222 [ 2189.435135] [] ? __schedule+0x3b3/0x57a [ 2189.435145] [] ? wake_threads_waitq+0x2d/0x2d [ 2189.435156] [] ? irq_thread_fn+0x40/0x40 [ 2189.435168] [] kthread+0x10e/0x116 [ 2189.435178] [] ? __kthread_parkme+0x67/0x67 [ 2189.435189] [] ret_from_fork+0x7c/0xb0 [ 2189.435199] [] ? __kthread_parkme+0x67/0x67 [ 2189.435208] Code: ff ff eb 73 0f b6 bb c1 00 00 00 83 ff 03 7e 13 49 8d 7c 24 20 ba 04 00 00 00 48 c7 c6 8a cd 21 bf eb 4d 0f b6 83 c2 00 00 00 99 ff 83 f8 37 75 15 48 6b f7 37 4c 8d a3 c4 00 00 00 4c 8d ac [ 2189.435312] RIP [] elants_i2c_irq+0x190/0x200 [ 2189.435323] RSP [ 2189.435350] ---[ end trace f4945345a75d96dd ]--- [ 2189.443841] Kernel panic - not syncing: Fatal exception [ 2189.444307] Kernel Offset: 0x3d800000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 2189.444519] gsmi: Log Shutdown Reason 0x02 The problem was seen with a 3.18 based kernel, but there is no reason to believe that the upstream code is safe. Fixes: 66aee90088da2 ("Input: add support for Elan eKTH I2C touchscreens") Signed-off-by: Guenter Roeck Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/elants_i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/touchscreen/elants_i2c.c b/drivers/input/touchscreen/elants_i2c.c index 02aec284deca..3e6003d32e56 100644 --- a/drivers/input/touchscreen/elants_i2c.c +++ b/drivers/input/touchscreen/elants_i2c.c @@ -914,9 +914,9 @@ static irqreturn_t elants_i2c_irq(int irq, void *_dev) case QUEUE_HEADER_NORMAL: report_count = ts->buf[FW_HDR_COUNT]; - if (report_count > 3) { + if (report_count == 0 || report_count > 3) { dev_err(&client->dev, - "too large report count: %*ph\n", + "bad report count: %*ph\n", HEADER_SIZE, ts->buf); break; } From 9698b6f473555a722bf81a3371998427d5d27bde Mon Sep 17 00:00:00 2001 From: Satish Kharat Date: Wed, 14 Dec 2016 13:20:41 -0800 Subject: [PATCH 092/297] scsi: fnic: Avoid sending reset to firmware when another reset is in progress This fix is to avoid calling fnic_fw_reset_handler through fnic_host_reset when a finc reset is alreay in progress. Signed-off-by: Satish Kharat Signed-off-by: Sesidhar Baddela Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic.h | 1 + drivers/scsi/fnic/fnic_scsi.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h index 9ddc9200e0a4..9e4b7709043e 100644 --- a/drivers/scsi/fnic/fnic.h +++ b/drivers/scsi/fnic/fnic.h @@ -248,6 +248,7 @@ struct fnic { struct completion *remove_wait; /* device remove thread blocks */ atomic_t in_flight; /* io counter */ + bool internal_reset_inprogress; u32 _reserved; /* fill hole */ unsigned long state_flags; /* protected by host lock */ enum fnic_state state; diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index 2544a37ece0a..adb3d5871e74 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -2581,6 +2581,19 @@ int fnic_host_reset(struct scsi_cmnd *sc) unsigned long wait_host_tmo; struct Scsi_Host *shost = sc->device->host; struct fc_lport *lp = shost_priv(shost); + struct fnic *fnic = lport_priv(lp); + unsigned long flags; + + spin_lock_irqsave(&fnic->fnic_lock, flags); + if (fnic->internal_reset_inprogress == 0) { + fnic->internal_reset_inprogress = 1; + } else { + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "host reset in progress skipping another host reset\n"); + return SUCCESS; + } + spin_unlock_irqrestore(&fnic->fnic_lock, flags); /* * If fnic_reset is successful, wait for fabric login to complete @@ -2601,6 +2614,9 @@ int fnic_host_reset(struct scsi_cmnd *sc) } } + spin_lock_irqsave(&fnic->fnic_lock, flags); + fnic->internal_reset_inprogress = 0; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); return ret; } From 0371adcdaca92912baaa3256ed13e058a016e62d Mon Sep 17 00:00:00 2001 From: Burak Ok Date: Wed, 21 Dec 2016 14:45:53 +0100 Subject: [PATCH 093/297] scsi: snic: Return error code on memory allocation failure If a call to mempool_create_slab_pool() in snic_probe() returns NULL, return -ENOMEM to indicate failure. mempool_creat_slab_pool() only fails if it cannot allocate memory. https://bugzilla.kernel.org/show_bug.cgi?id=189061 Reported-by: bianpan2010@ruc.edu.cn Signed-off-by: Burak Ok Signed-off-by: Andreas Schaertl Acked-by: Narsimhulu Musini Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/snic/snic_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/snic/snic_main.c b/drivers/scsi/snic/snic_main.c index 396b32dca074..7cf70aaec0ba 100644 --- a/drivers/scsi/snic/snic_main.c +++ b/drivers/scsi/snic/snic_main.c @@ -591,6 +591,7 @@ snic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!pool) { SNIC_HOST_ERR(shost, "dflt sgl pool creation failed\n"); + ret = -ENOMEM; goto err_free_res; } @@ -601,6 +602,7 @@ snic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!pool) { SNIC_HOST_ERR(shost, "max sgl pool creation failed\n"); + ret = -ENOMEM; goto err_free_dflt_sgl_pool; } @@ -611,6 +613,7 @@ snic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!pool) { SNIC_HOST_ERR(shost, "snic tmreq info pool creation failed.\n"); + ret = -ENOMEM; goto err_free_max_sgl_pool; } From 2d1148f0f45079d25a0fa0d67e4fdb2a656d12fb Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Fri, 23 Dec 2016 20:40:19 -0800 Subject: [PATCH 094/297] scsi: bfa: Increase requested firmware version to 3.2.5.1 bna & bfa firmware version 3.2.5.1 was submitted to linux-firmware on Feb 17 19:10:20 2015 -0500 in 0ab54ff1dc ("linux-firmware: Add QLogic BR Series Adapter Firmware"). bna was updated to use the newer firmware on Feb 19 16:02:32 2015 -0500 in 3f307c3d70 ("bna: Update the Driver and Firmware Version") bfa was not updated. I presume this was an oversight but it broke support for bfa+bna cards such as the following 04:00.0 Fibre Channel [0c04]: Brocade Communications Systems, Inc. 1010/1020/1007/1741 10Gbps CNA [1657:0014] (rev 01) 04:00.1 Fibre Channel [0c04]: Brocade Communications Systems, Inc. 1010/1020/1007/1741 10Gbps CNA [1657:0014] (rev 01) 04:00.2 Ethernet controller [0200]: Brocade Communications Systems, Inc. 1010/1020/1007/1741 10Gbps CNA [1657:0014] (rev 01) 04:00.3 Ethernet controller [0200]: Brocade Communications Systems, Inc. 1010/1020/1007/1741 10Gbps CNA [1657:0014] (rev 01) Currently, if the bfa module is loaded first, bna fails to probe the respective devices with [ 215.026787] bna: QLogic BR-series 10G Ethernet driver - version: 3.2.25.1 [ 215.043707] bna 0000:04:00.2: bar0 mapped to ffffc90001fc0000, len 262144 [ 215.060656] bna 0000:04:00.2: initialization failed err=1 [ 215.073893] bna 0000:04:00.3: bar0 mapped to ffffc90002040000, len 262144 [ 215.090644] bna 0000:04:00.3: initialization failed err=1 Whereas if bna is loaded first, bfa fails with [ 249.592109] QLogic BR-series BFA FC/FCOE SCSI driver - version: 3.2.25.0 [ 249.610738] bfa 0000:04:00.0: Running firmware version is incompatible with the driver version [ 249.833513] bfa 0000:04:00.0: bfa init failed [ 249.833919] scsi host6: QLogic BR-series FC/FCOE Adapter, hwpath: 0000:04:00.0 driver: 3.2.25.0 [ 249.841446] bfa 0000:04:00.1: Running firmware version is incompatible with the driver version [ 250.045449] bfa 0000:04:00.1: bfa init failed [ 250.045962] scsi host7: QLogic BR-series FC/FCOE Adapter, hwpath: 0000:04:00.1 driver: 3.2.25.0 Increase bfa's requested firmware version. Also increase the driver version. I only tested that all of the devices probe without error. Reported-by: Tim Ehlers Signed-off-by: Benjamin Poirier Acked-by: Rasesh Mody Signed-off-by: Martin K. Petersen --- drivers/scsi/bfa/bfad.c | 6 +++--- drivers/scsi/bfa/bfad_drv.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c index 9d253cb83ee7..e70410beb83a 100644 --- a/drivers/scsi/bfa/bfad.c +++ b/drivers/scsi/bfa/bfad.c @@ -64,9 +64,9 @@ int max_rport_logins = BFA_FCS_MAX_RPORT_LOGINS; u32 bfi_image_cb_size, bfi_image_ct_size, bfi_image_ct2_size; u32 *bfi_image_cb, *bfi_image_ct, *bfi_image_ct2; -#define BFAD_FW_FILE_CB "cbfw-3.2.3.0.bin" -#define BFAD_FW_FILE_CT "ctfw-3.2.3.0.bin" -#define BFAD_FW_FILE_CT2 "ct2fw-3.2.3.0.bin" +#define BFAD_FW_FILE_CB "cbfw-3.2.5.1.bin" +#define BFAD_FW_FILE_CT "ctfw-3.2.5.1.bin" +#define BFAD_FW_FILE_CT2 "ct2fw-3.2.5.1.bin" static u32 *bfad_load_fwimg(struct pci_dev *pdev); static void bfad_free_fwimg(void); diff --git a/drivers/scsi/bfa/bfad_drv.h b/drivers/scsi/bfa/bfad_drv.h index f9e862093a25..cfcfff48e8e1 100644 --- a/drivers/scsi/bfa/bfad_drv.h +++ b/drivers/scsi/bfa/bfad_drv.h @@ -58,7 +58,7 @@ #ifdef BFA_DRIVER_VERSION #define BFAD_DRIVER_VERSION BFA_DRIVER_VERSION #else -#define BFAD_DRIVER_VERSION "3.2.25.0" +#define BFAD_DRIVER_VERSION "3.2.25.1" #endif #define BFAD_PROTO_NAME FCPI_NAME From a33d331761bc5dd330499ca5ceceb67f0640a8e6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 5 Jan 2017 10:26:38 +0100 Subject: [PATCH 095/297] x86/CPU/AMD: Fix Bulldozer topology The following commit: 8196dab4fc15 ("x86/cpu: Get rid of compute_unit_id") ... broke the initial strategy for Bulldozer-based cores' topology, where we consider each thread of a compute unit a standalone core and not a HT or SMT thread. Revert to the firmware-supplied core_id numbering and do not make them thread siblings as we don't consider them for such even if they technically are, more or less. Reported-and-tested-by: Brice Goglin Tested-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: # v4.6+ Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 8196dab4fc15 ("x86/cpu: Get rid of compute_unit_id") Link: http://lkml.kernel.org/r/20170105092638.5247-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 71cae73a5076..1d3167269a67 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -309,15 +309,8 @@ static void amd_get_topology(struct cpuinfo_x86 *c) /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 eax, ebx, ecx, edx; - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 7; - - /* get compute unit information */ - smp_num_siblings = ((ebx >> 8) & 3) + 1; - c->x86_max_cores /= smp_num_siblings; - c->cpu_core_id = ebx & 0xff; + node_id = cpuid_ecx(0x8000001e) & 7; /* * We may have multiple LLCs if L3 caches exist, so check if we From 1ebb71143758f45dc0fa76e2f48429e13b16d110 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 6 Jan 2017 15:33:36 +0100 Subject: [PATCH 096/297] HID: hid-cypress: validate length of report Make sure we have enough of a report structure to validate before looking at it. Reported-by: Benoit Camredon Tested-by: Benoit Camredon Cc: stable Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- drivers/hid/hid-cypress.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-cypress.c b/drivers/hid/hid-cypress.c index 1b764d1745f3..1689568b597d 100644 --- a/drivers/hid/hid-cypress.c +++ b/drivers/hid/hid-cypress.c @@ -39,6 +39,9 @@ static __u8 *cp_report_fixup(struct hid_device *hdev, __u8 *rdesc, if (!(quirks & CP_RDESC_SWAPPED_MIN_MAX)) return rdesc; + if (*rsize < 4) + return rdesc; + for (i = 0; i < *rsize - 4; i++) if (rdesc[i] == 0x29 && rdesc[i + 2] == 0x19) { rdesc[i] = 0x19; From bc65a326c579e93a5c2120a65ede72f11369ee5a Mon Sep 17 00:00:00 2001 From: Jeeja KP Date: Mon, 2 Jan 2017 09:50:05 +0530 Subject: [PATCH 097/297] ASoC: Intel: Skylake: Release FW ctx in cleanup Saved firmware ctx was not never released, so release Firmware ctx in cleanup routine. Signed-off-by: Jeeja KP Acked-by: Vinod Koul Signed-off-by: Mark Brown --- sound/soc/intel/skylake/skl-sst.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/intel/skylake/skl-sst.c b/sound/soc/intel/skylake/skl-sst.c index 8fc3178bc79c..b30bd384c8d3 100644 --- a/sound/soc/intel/skylake/skl-sst.c +++ b/sound/soc/intel/skylake/skl-sst.c @@ -515,6 +515,9 @@ EXPORT_SYMBOL_GPL(skl_sst_init_fw); void skl_sst_dsp_cleanup(struct device *dev, struct skl_sst *ctx) { + + if (ctx->dsp->fw) + release_firmware(ctx->dsp->fw); skl_clear_module_table(ctx->dsp); skl_freeup_uuid_list(ctx); skl_ipc_free(&ctx->ipc); From 9f169b9f52a4afccdab7a7d2311b0c53a78a1e6b Mon Sep 17 00:00:00 2001 From: Patrick Lai Date: Sat, 31 Dec 2016 22:44:39 -0800 Subject: [PATCH 098/297] ASoC: dpcm: Avoid putting stream state to STOP when FE stream is paused When multiple front-ends are using the same back-end, putting state of a front-end to STOP state upon receiving pause command will result in backend stream getting released by DPCM framework unintentionally. In order to avoid backend to be released when another active front-end stream is present, put the stream state to PAUSED state instead of STOP state. Signed-off-by: Patrick Lai Signed-off-by: Mark Brown --- sound/soc/soc-pcm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index e7a1eaa2772f..6aba14009c92 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -2184,9 +2184,11 @@ static int dpcm_fe_dai_do_trigger(struct snd_pcm_substream *substream, int cmd) break; case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: - case SNDRV_PCM_TRIGGER_PAUSE_PUSH: fe->dpcm[stream].state = SND_SOC_DPCM_STATE_STOP; break; + case SNDRV_PCM_TRIGGER_PAUSE_PUSH: + fe->dpcm[stream].state = SND_SOC_DPCM_STATE_PAUSED; + break; } out: From 978d3639fd13d987950e4ce85c8737ae92154b2c Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 4 Jan 2017 22:18:24 +0300 Subject: [PATCH 099/297] sh_eth: fix EESIPR values for SH77{34|63} As the SH77{34|63} manuals are freely available, I've checked the EESIPR values written against the manuals, and they appeared to set the reserved bits 11-15 (which should be 0 on write). Fix those EESIPR values. Fixes: 380af9e390ec ("net: sh_eth: CPU dependency code collect to "struct sh_eth_cpu_data"") Fixes: f5d12767c8fd ("sh_eth: get SH77{34|63} support out of #ifdef") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 00fafabab1d0..3b388f5704c5 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -802,7 +802,7 @@ static struct sh_eth_cpu_data sh7734_data = { .ecsr_value = ECSR_ICD | ECSR_MPD, .ecsipr_value = ECSIPR_LCHNGIP | ECSIPR_ICDIP | ECSIPR_MPDIP, - .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, + .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003f07ff, .tx_check = EESR_TC1 | EESR_FTC, .eesr_err_check = EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT | @@ -832,7 +832,7 @@ static struct sh_eth_cpu_data sh7763_data = { .ecsr_value = ECSR_ICD | ECSR_MPD, .ecsipr_value = ECSIPR_LCHNGIP | ECSIPR_ICDIP | ECSIPR_MPDIP, - .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff, + .eesipr_value = DMAC_M_RFRMER | DMAC_M_ECI | 0x003f07ff, .tx_check = EESR_TC1 | EESR_FTC, .eesr_err_check = EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT | From 0f1f9cbc04dbb3cc310f70a11cba0cf1f2109d9c Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 5 Jan 2017 00:29:32 +0300 Subject: [PATCH 100/297] sh_eth: R8A7740 supports packet shecksumming The R8A7740 GEther controller supports the packet checksum offloading but the 'hw_crc' (bad name, I'll fix it) flag isn't set in the R8A7740 data, thus CSMR isn't cleared... Fixes: 73a0d907301e ("net: sh_eth: add support R8A7740") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 3b388f5704c5..f729a6b43958 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -574,6 +574,7 @@ static struct sh_eth_cpu_data r8a7740_data = { .rpadir_value = 2 << 16, .no_trimd = 1, .no_ade = 1, + .hw_crc = 1, .tsu = 1, .select_mii = 1, .shift_rd0 = 1, From 896b4db685cf06bd7d50ed22c53ebd069e0b90e9 Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Wed, 4 Jan 2017 15:07:16 -0600 Subject: [PATCH 101/297] amd-xgbe: Fix IRQ processing when running in single IRQ mode When running in single IRQ mode, the additional IRQ routines were being skipped because only the XGMAC interrupt status was being checked. Update the code so that the additional IRQ routines are checked whenever an interrupt is received. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 155190db682d..9943629fcbf9 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -539,6 +539,7 @@ static irqreturn_t xgbe_isr(int irq, void *data) } } +isr_done: /* If there is not a separate AN irq, handle it here */ if (pdata->dev_irq == pdata->an_irq) pdata->phy_if.an_isr(irq, pdata); @@ -551,7 +552,6 @@ static irqreturn_t xgbe_isr(int irq, void *data) if (pdata->vdata->i2c_support && (pdata->dev_irq == pdata->i2c_irq)) pdata->i2c_if.i2c_isr(irq, pdata); -isr_done: return IRQ_HANDLED; } From 5ca7d1ca77dc23934504b95a96d2660d345f83c2 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 5 Jan 2017 14:48:07 -0600 Subject: [PATCH 102/297] net: phy: dp83867: fix irq generation For proper IRQ generation by DP83867 phy the INT/PWDN pin has to be programmed as an interrupt output instead of a Powerdown input in Configuration Register 3 (CFG3), Address 0x001E, bit 7 INT_OE = 1. The current driver doesn't do this and as result IRQs will not be generated by DP83867 phy even if they are properly configured in DT. Hence, fix IRQ generation by properly configuring CFG3.INT_OE bit and ensure that Link Status Change (LINK_STATUS_CHNG_INT) and Auto-Negotiation Complete (AUTONEG_COMP_INT) interrupt are enabled. After this the DP83867 driver will work properly in interrupt enabled mode. Signed-off-by: Grygorii Strashko Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 1b639242f9e2..e84ae084e259 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -29,6 +29,7 @@ #define MII_DP83867_MICR 0x12 #define MII_DP83867_ISR 0x13 #define DP83867_CTRL 0x1f +#define DP83867_CFG3 0x1e /* Extended Registers */ #define DP83867_RGMIICTL 0x0032 @@ -98,6 +99,8 @@ static int dp83867_config_intr(struct phy_device *phydev) micr_status |= (MII_DP83867_MICR_AN_ERR_INT_EN | MII_DP83867_MICR_SPEED_CHNG_INT_EN | + MII_DP83867_MICR_AUTONEG_COMP_INT_EN | + MII_DP83867_MICR_LINK_STS_CHNG_INT_EN | MII_DP83867_MICR_DUP_MODE_CHNG_INT_EN | MII_DP83867_MICR_SLEEP_MODE_CHNG_INT_EN); @@ -214,6 +217,13 @@ static int dp83867_config_init(struct phy_device *phydev) } } + /* Enable Interrupt output INT_OE in CFG3 register */ + if (phy_interrupt_is_valid(phydev)) { + val = phy_read(phydev, DP83867_CFG3); + val |= BIT(7); + phy_write(phydev, DP83867_CFG3, val); + } + return 0; } From 93e246f783e6bd1bc64fdfbfe68b18161f69b28e Mon Sep 17 00:00:00 2001 From: David Forster Date: Fri, 6 Jan 2017 10:27:59 +0000 Subject: [PATCH 103/297] vti6: fix device register to report IFLA_INFO_KIND vti6 interface is registered before the rtnl_link_ops block is attached. As a result the resulting RTM_NEWLINK is missing IFLA_INFO_KIND. Re-order attachment of rtnl_link_ops block to fix. Signed-off-by: Dave Forster Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index f4b4a4a5f4ba..d82042c8d8fd 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -189,12 +189,12 @@ static int vti6_tnl_create2(struct net_device *dev) struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; + dev->rtnl_link_ops = &vti6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); - dev->rtnl_link_ops = &vti6_link_ops; dev_hold(dev); vti6_tnl_link(ip6n, t); From 7f4c4f80fd22ec7722e778c1d099e828d2b5dc40 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 19 Dec 2016 13:30:13 -0500 Subject: [PATCH 104/297] MAINTAINERS: Update mailing list for radeon and amdgpu amdgpu and radeon development has moved to this list. Signed-off-by: Alex Deucher --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c7b8cf1240d9..cb4611867a88 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4116,7 +4116,7 @@ F: drivers/gpu/drm/cirrus/ RADEON and AMDGPU DRM DRIVERS M: Alex Deucher M: Christian König -L: dri-devel@lists.freedesktop.org +L: amd-gfx@lists.freedesktop.org T: git git://people.freedesktop.org/~agd5f/linux S: Supported F: drivers/gpu/drm/radeon/ From c4642a479fac9f5c224ff7425d86c427b94011af Mon Sep 17 00:00:00 2001 From: Junwei Zhang Date: Wed, 14 Dec 2016 15:32:28 -0500 Subject: [PATCH 105/297] drm/amd/amdgpu: add Polaris12 support (v3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v2: agd: squash in various fixes v3: agd: squash in: drm/amdgpu: remove unnecessary smc sk firmware for polaris12 Signed-off-by: Junwei Zhang Reviewed-by: Alex Deucher Reviewed-by: Christian König Reviewed-by: Ken Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/dce_v11_0.c | 13 ++++++--- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 31 ++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 6 +++++ drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 5 ++-- drivers/gpu/drm/amd/amdgpu/vi.c | 10 +++++++ drivers/gpu/drm/amd/include/amd_shared.h | 1 + 11 files changed, 76 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c index 9ada56c16a58..4c851fde1e82 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c @@ -840,6 +840,9 @@ static int amdgpu_cgs_get_firmware_info(struct cgs_device *cgs_device, else if (type == CGS_UCODE_ID_SMU_SK) strcpy(fw_name, "amdgpu/polaris10_smc_sk.bin"); break; + case CHIP_POLARIS12: + strcpy(fw_name, "amdgpu/polaris12_smc.bin"); + break; default: DRM_ERROR("SMC firmware not supported\n"); return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 60bd4afe45c8..fe3bb94fe58d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -73,6 +73,7 @@ static const char *amdgpu_asic_name[] = { "STONEY", "POLARIS10", "POLARIS11", + "POLARIS12", "LAST", }; @@ -1277,6 +1278,7 @@ static int amdgpu_early_init(struct amdgpu_device *adev) case CHIP_FIJI: case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: case CHIP_CARRIZO: case CHIP_STONEY: if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index a81dfaeeb8c0..1d564beb0fde 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -65,6 +65,7 @@ #define FIRMWARE_STONEY "amdgpu/stoney_uvd.bin" #define FIRMWARE_POLARIS10 "amdgpu/polaris10_uvd.bin" #define FIRMWARE_POLARIS11 "amdgpu/polaris11_uvd.bin" +#define FIRMWARE_POLARIS12 "amdgpu/polaris12_uvd.bin" /** * amdgpu_uvd_cs_ctx - Command submission parser context @@ -98,6 +99,7 @@ MODULE_FIRMWARE(FIRMWARE_FIJI); MODULE_FIRMWARE(FIRMWARE_STONEY); MODULE_FIRMWARE(FIRMWARE_POLARIS10); MODULE_FIRMWARE(FIRMWARE_POLARIS11); +MODULE_FIRMWARE(FIRMWARE_POLARIS12); static void amdgpu_uvd_idle_work_handler(struct work_struct *work); @@ -149,6 +151,9 @@ int amdgpu_uvd_sw_init(struct amdgpu_device *adev) case CHIP_POLARIS11: fw_name = FIRMWARE_POLARIS11; break; + case CHIP_POLARIS12: + fw_name = FIRMWARE_POLARIS12; + break; default: return -EINVAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 69b66b9e7f57..8fec802d3908 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -52,6 +52,7 @@ #define FIRMWARE_STONEY "amdgpu/stoney_vce.bin" #define FIRMWARE_POLARIS10 "amdgpu/polaris10_vce.bin" #define FIRMWARE_POLARIS11 "amdgpu/polaris11_vce.bin" +#define FIRMWARE_POLARIS12 "amdgpu/polaris12_vce.bin" #ifdef CONFIG_DRM_AMDGPU_CIK MODULE_FIRMWARE(FIRMWARE_BONAIRE); @@ -66,6 +67,7 @@ MODULE_FIRMWARE(FIRMWARE_FIJI); MODULE_FIRMWARE(FIRMWARE_STONEY); MODULE_FIRMWARE(FIRMWARE_POLARIS10); MODULE_FIRMWARE(FIRMWARE_POLARIS11); +MODULE_FIRMWARE(FIRMWARE_POLARIS12); static void amdgpu_vce_idle_work_handler(struct work_struct *work); @@ -121,6 +123,9 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size) case CHIP_POLARIS11: fw_name = FIRMWARE_POLARIS11; break; + case CHIP_POLARIS12: + fw_name = FIRMWARE_POLARIS12; + break; default: return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c index b3d62b909f43..2006abbbfb62 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c @@ -167,6 +167,7 @@ static void dce_v11_0_init_golden_registers(struct amdgpu_device *adev) (const u32)ARRAY_SIZE(stoney_golden_settings_a11)); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_program_register_sequence(adev, polaris11_golden_settings_a11, (const u32)ARRAY_SIZE(polaris11_golden_settings_a11)); @@ -608,6 +609,7 @@ static int dce_v11_0_get_num_crtc (struct amdgpu_device *adev) num_crtc = 6; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: num_crtc = 5; break; default: @@ -1589,6 +1591,7 @@ static int dce_v11_0_audio_init(struct amdgpu_device *adev) adev->mode_info.audio.num_pins = 8; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: adev->mode_info.audio.num_pins = 6; break; default: @@ -2388,7 +2391,8 @@ static u32 dce_v11_0_pick_pll(struct drm_crtc *crtc) int pll; if ((adev->asic_type == CHIP_POLARIS10) || - (adev->asic_type == CHIP_POLARIS11)) { + (adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) { struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(amdgpu_crtc->encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; @@ -2822,7 +2826,8 @@ static int dce_v11_0_crtc_mode_set(struct drm_crtc *crtc, return -EINVAL; if ((adev->asic_type == CHIP_POLARIS10) || - (adev->asic_type == CHIP_POLARIS11)) { + (adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) { struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(amdgpu_crtc->encoder); int encoder_mode = @@ -2992,6 +2997,7 @@ static int dce_v11_0_early_init(void *handle) adev->mode_info.num_dig = 6; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: adev->mode_info.num_hpd = 5; adev->mode_info.num_dig = 5; break; @@ -3101,7 +3107,8 @@ static int dce_v11_0_hw_init(void *handle) amdgpu_atombios_crtc_powergate_init(adev); amdgpu_atombios_encoder_init_dig(adev); if ((adev->asic_type == CHIP_POLARIS10) || - (adev->asic_type == CHIP_POLARIS11)) { + (adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) { amdgpu_atombios_crtc_set_dce_clock(adev, adev->clock.default_dispclk, DCE_CLOCK_TYPE_DISPCLK, ATOM_GCK_DFS); amdgpu_atombios_crtc_set_dce_clock(adev, 0, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index d0ec00986f38..373374164bd5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -139,6 +139,13 @@ MODULE_FIRMWARE("amdgpu/polaris10_mec.bin"); MODULE_FIRMWARE("amdgpu/polaris10_mec2.bin"); MODULE_FIRMWARE("amdgpu/polaris10_rlc.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_ce.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_pfp.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_me.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_mec.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_mec2.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_rlc.bin"); + static const struct amdgpu_gds_reg_offset amdgpu_gds_reg_offset[] = { {mmGDS_VMID0_BASE, mmGDS_VMID0_SIZE, mmGDS_GWS_VMID0, mmGDS_OA_VMID0}, @@ -689,6 +696,7 @@ static void gfx_v8_0_init_golden_registers(struct amdgpu_device *adev) (const u32)ARRAY_SIZE(tonga_golden_common_all)); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_program_register_sequence(adev, golden_settings_polaris11_a11, (const u32)ARRAY_SIZE(golden_settings_polaris11_a11)); @@ -903,6 +911,9 @@ static int gfx_v8_0_init_microcode(struct amdgpu_device *adev) case CHIP_POLARIS10: chip_name = "polaris10"; break; + case CHIP_POLARIS12: + chip_name = "polaris12"; + break; case CHIP_STONEY: chip_name = "stoney"; break; @@ -1768,6 +1779,7 @@ static int gfx_v8_0_gpu_early_init(struct amdgpu_device *adev) gb_addr_config = TONGA_GB_ADDR_CONFIG_GOLDEN; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: ret = amdgpu_atombios_get_gfx_info(adev); if (ret) return ret; @@ -2682,6 +2694,7 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev) break; case CHIP_POLARIS11: + case CHIP_POLARIS12: modearray[0] = (ARRAY_MODE(ARRAY_2D_TILED_THIN1) | PIPE_CONFIG(ADDR_SURF_P4_16x16) | TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | @@ -3503,6 +3516,7 @@ gfx_v8_0_raster_config(struct amdgpu_device *adev, u32 *rconf, u32 *rconf1) *rconf1 |= 0x0; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: *rconf |= RB_MAP_PKR0(2) | RB_XSEL2(1) | SE_MAP(2) | SE_XSEL(1) | SE_YSEL(1); *rconf1 |= 0x0; @@ -4021,7 +4035,8 @@ static void gfx_v8_0_init_pg(struct amdgpu_device *adev) cz_enable_cp_power_gating(adev, true); else cz_enable_cp_power_gating(adev, false); - } else if (adev->asic_type == CHIP_POLARIS11) { + } else if ((adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) { gfx_v8_0_init_csb(adev); gfx_v8_0_init_save_restore_list(adev); gfx_v8_0_enable_save_restore_machine(adev); @@ -4095,7 +4110,8 @@ static int gfx_v8_0_rlc_resume(struct amdgpu_device *adev) RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK); WREG32(mmRLC_CGCG_CGLS_CTRL, tmp); if (adev->asic_type == CHIP_POLARIS11 || - adev->asic_type == CHIP_POLARIS10) { + adev->asic_type == CHIP_POLARIS10 || + adev->asic_type == CHIP_POLARIS12) { tmp = RREG32(mmRLC_CGCG_CGLS_CTRL_3D); tmp &= ~0x3; WREG32(mmRLC_CGCG_CGLS_CTRL_3D, tmp); @@ -4283,6 +4299,7 @@ static int gfx_v8_0_cp_gfx_start(struct amdgpu_device *adev) amdgpu_ring_write(ring, 0x0000002A); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_ring_write(ring, 0x16000012); amdgpu_ring_write(ring, 0x00000000); break; @@ -4664,7 +4681,8 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev) (adev->asic_type == CHIP_FIJI) || (adev->asic_type == CHIP_STONEY) || (adev->asic_type == CHIP_POLARIS11) || - (adev->asic_type == CHIP_POLARIS10)) { + (adev->asic_type == CHIP_POLARIS10) || + (adev->asic_type == CHIP_POLARIS12)) { WREG32(mmCP_MEC_DOORBELL_RANGE_LOWER, AMDGPU_DOORBELL_KIQ << 2); WREG32(mmCP_MEC_DOORBELL_RANGE_UPPER, @@ -4700,7 +4718,8 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev) mqd->cp_hqd_persistent_state = tmp; if (adev->asic_type == CHIP_STONEY || adev->asic_type == CHIP_POLARIS11 || - adev->asic_type == CHIP_POLARIS10) { + adev->asic_type == CHIP_POLARIS10 || + adev->asic_type == CHIP_POLARIS12) { tmp = RREG32(mmCP_ME1_PIPE3_INT_CNTL); tmp = REG_SET_FIELD(tmp, CP_ME1_PIPE3_INT_CNTL, GENERIC2_INT_ENABLE, 1); WREG32(mmCP_ME1_PIPE3_INT_CNTL, tmp); @@ -5279,7 +5298,8 @@ static int gfx_v8_0_late_init(void *handle) static void gfx_v8_0_enable_gfx_static_mg_power_gating(struct amdgpu_device *adev, bool enable) { - if (adev->asic_type == CHIP_POLARIS11) + if ((adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) /* Send msg to SMU via Powerplay */ amdgpu_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_SMC, @@ -5353,6 +5373,7 @@ static int gfx_v8_0_set_powergating_state(void *handle, gfx_v8_0_enable_gfx_dynamic_mg_power_gating(adev, false); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: if ((adev->pg_flags & AMD_PG_SUPPORT_GFX_SMG) && enable) gfx_v8_0_enable_gfx_static_mg_power_gating(adev, true); else diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 0daac3a5be79..476bc9f1954b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -46,6 +46,7 @@ static int gmc_v8_0_wait_for_idle(void *handle); MODULE_FIRMWARE("amdgpu/tonga_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris11_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris10_mc.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_mc.bin"); static const u32 golden_settings_tonga_a11[] = { @@ -130,6 +131,7 @@ static void gmc_v8_0_init_golden_registers(struct amdgpu_device *adev) (const u32)ARRAY_SIZE(golden_settings_tonga_a11)); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_program_register_sequence(adev, golden_settings_polaris11_a11, (const u32)ARRAY_SIZE(golden_settings_polaris11_a11)); @@ -225,6 +227,9 @@ static int gmc_v8_0_init_microcode(struct amdgpu_device *adev) case CHIP_POLARIS10: chip_name = "polaris10"; break; + case CHIP_POLARIS12: + chip_name = "polaris12"; + break; case CHIP_FIJI: case CHIP_CARRIZO: case CHIP_STONEY: diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c index 1170a64a3184..034ace79ed49 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c @@ -60,6 +60,8 @@ MODULE_FIRMWARE("amdgpu/polaris10_sdma.bin"); MODULE_FIRMWARE("amdgpu/polaris10_sdma1.bin"); MODULE_FIRMWARE("amdgpu/polaris11_sdma.bin"); MODULE_FIRMWARE("amdgpu/polaris11_sdma1.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_sdma.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_sdma1.bin"); static const u32 sdma_offsets[SDMA_MAX_INSTANCE] = @@ -206,6 +208,7 @@ static void sdma_v3_0_init_golden_registers(struct amdgpu_device *adev) (const u32)ARRAY_SIZE(golden_settings_tonga_a11)); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_program_register_sequence(adev, golden_settings_polaris11_a11, (const u32)ARRAY_SIZE(golden_settings_polaris11_a11)); @@ -278,6 +281,9 @@ static int sdma_v3_0_init_microcode(struct amdgpu_device *adev) case CHIP_POLARIS10: chip_name = "polaris10"; break; + case CHIP_POLARIS12: + chip_name = "polaris12"; + break; case CHIP_CARRIZO: chip_name = "carrizo"; break; diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c index 6b3293a1c7b8..5fb0b7f5c065 100644 --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c @@ -320,11 +320,12 @@ static unsigned vce_v3_0_get_harvest_config(struct amdgpu_device *adev) { u32 tmp; - /* Fiji, Stoney, Polaris10, Polaris11 are single pipe */ + /* Fiji, Stoney, Polaris10, Polaris11, Polaris12 are single pipe */ if ((adev->asic_type == CHIP_FIJI) || (adev->asic_type == CHIP_STONEY) || (adev->asic_type == CHIP_POLARIS10) || - (adev->asic_type == CHIP_POLARIS11)) + (adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) return AMDGPU_VCE_HARVEST_VCE1; /* Tonga and CZ are dual or single pipe */ diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index bf088d6d9bf1..c2ac54f11341 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -88,6 +88,7 @@ MODULE_FIRMWARE("amdgpu/polaris10_smc.bin"); MODULE_FIRMWARE("amdgpu/polaris10_smc_sk.bin"); MODULE_FIRMWARE("amdgpu/polaris11_smc.bin"); MODULE_FIRMWARE("amdgpu/polaris11_smc_sk.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_smc.bin"); /* * Indirect registers accessor @@ -312,6 +313,7 @@ static void vi_init_golden_registers(struct amdgpu_device *adev) break; case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: default: break; } @@ -671,6 +673,7 @@ static int vi_read_register(struct amdgpu_device *adev, u32 se_num, case CHIP_TONGA: case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: case CHIP_CARRIZO: case CHIP_STONEY: asic_register_table = cz_allowed_read_registers; @@ -994,6 +997,11 @@ static int vi_common_early_init(void *handle) adev->pg_flags = 0; adev->external_rev_id = adev->rev_id + 0x50; break; + case CHIP_POLARIS12: + adev->cg_flags = AMD_CG_SUPPORT_UVD_MGCG; + adev->pg_flags = 0; + adev->external_rev_id = adev->rev_id + 0x64; + break; case CHIP_CARRIZO: adev->cg_flags = AMD_CG_SUPPORT_UVD_MGCG | AMD_CG_SUPPORT_GFX_MGCG | @@ -1346,6 +1354,7 @@ static int vi_common_set_clockgating_state(void *handle, case CHIP_TONGA: case CHIP_POLARIS10: case CHIP_POLARIS11: + case CHIP_POLARIS12: vi_common_set_clockgating_state_by_smu(adev, state); default: break; @@ -1429,6 +1438,7 @@ int vi_set_ip_blocks(struct amdgpu_device *adev) break; case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: amdgpu_ip_block_add(adev, &vi_common_ip_block); amdgpu_ip_block_add(adev, &gmc_v8_1_ip_block); amdgpu_ip_block_add(adev, &tonga_ih_ip_block); diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index c02469ada9f1..5f59117109d7 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -46,6 +46,7 @@ enum amd_asic_type { CHIP_STONEY, CHIP_POLARIS10, CHIP_POLARIS11, + CHIP_POLARIS12, CHIP_LAST, }; From f4309526576db325264b6dc9ee150ee70b330a42 Mon Sep 17 00:00:00 2001 From: Junwei Zhang Date: Wed, 14 Dec 2016 15:40:48 -0500 Subject: [PATCH 106/297] drm/amdgpu/powerplay: add Polaris12 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Junwei Zhang Reviewed-by: Alex Deucher Reviewed-by: Christian König Reviewed-by: Ken Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c | 1 + drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c | 3 ++- drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c | 2 +- drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c index fc592c2b0e16..95a568df8551 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c @@ -98,6 +98,7 @@ static int amdgpu_pp_early_init(void *handle) switch (adev->asic_type) { case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: case CHIP_TONGA: case CHIP_FIJI: case CHIP_TOPAZ: diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c index dc6700aee18f..b03606405a53 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c @@ -95,6 +95,7 @@ int hwmgr_init(struct amd_pp_init *pp_init, struct pp_instance *handle) break; case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: polaris_set_asic_special_caps(hwmgr); hwmgr->feature_mask &= ~(PP_UVD_HANDSHAKE_MASK); break; @@ -745,7 +746,7 @@ int polaris_set_asic_special_caps(struct pp_hwmgr *hwmgr) phm_cap_set(hwmgr->platform_descriptor.platformCaps, PHM_PlatformCaps_TablelessHardwareInterface); - if (hwmgr->chip_id == CHIP_POLARIS11) + if ((hwmgr->chip_id == CHIP_POLARIS11) || (hwmgr->chip_id == CHIP_POLARIS12)) phm_cap_set(hwmgr->platform_descriptor.platformCaps, PHM_PlatformCaps_SPLLShutdownSupport); return 0; diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c index 26477f0f09dc..6cd1287a7a8f 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c @@ -521,7 +521,7 @@ int smu7_enable_didt_config(struct pp_hwmgr *hwmgr) PP_ASSERT_WITH_CODE((result == 0), "DIDT Config failed.", return result); result = smu7_program_pt_config_registers(hwmgr, DIDTConfig_Polaris10); PP_ASSERT_WITH_CODE((result == 0), "DIDT Config failed.", return result); - } else if (hwmgr->chip_id == CHIP_POLARIS11) { + } else if ((hwmgr->chip_id == CHIP_POLARIS11) || (hwmgr->chip_id == CHIP_POLARIS12)) { result = smu7_program_pt_config_registers(hwmgr, GCCACConfig_Polaris11); PP_ASSERT_WITH_CODE((result == 0), "DIDT Config failed.", return result); result = smu7_program_pt_config_registers(hwmgr, DIDTConfig_Polaris11); diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c index e5812aa456f3..6e618aa20719 100644 --- a/drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c +++ b/drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c @@ -65,6 +65,7 @@ int smum_init(struct amd_pp_init *pp_init, struct pp_instance *handle) break; case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: polaris10_smum_init(smumgr); break; default: From fc8e9c54699e42754094ff475da46440778d8f19 Mon Sep 17 00:00:00 2001 From: Junwei Zhang Date: Thu, 4 Aug 2016 12:54:22 +0800 Subject: [PATCH 107/297] drm/amd/amdgpu: add Polaris12 PCI ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Junwei Zhang Reviewed-by: Alex Deucher Reviewed-by: Christian König Reviewed-by: Ken Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 8cb937b2bfcc..2534adaebe30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -418,6 +418,13 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x67CA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67CC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67CF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, + /* Polaris12 */ + {0x1002, 0x6980, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6981, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6985, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6986, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6987, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x699F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0, 0, 0} }; From d6df71e125b4e4ab8932349ce81e09ef73304b91 Mon Sep 17 00:00:00 2001 From: Yintian Tao Date: Wed, 21 Dec 2016 14:32:21 +0800 Subject: [PATCH 108/297] drm/amdgpu: remove static integer for uvd pp state At two gpu core condition, static integer will cause that second gpu core uvd state setting will be directly skipped due to the first one setting Signed-off-by: Yintian Tao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c index a79e283590fb..6de6becce745 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c @@ -791,15 +791,10 @@ static int uvd_v5_0_set_clockgating_state(void *handle, { struct amdgpu_device *adev = (struct amdgpu_device *)handle; bool enable = (state == AMD_CG_STATE_GATE) ? true : false; - static int curstate = -1; if (!(adev->cg_flags & AMD_CG_SUPPORT_UVD_MGCG)) return 0; - if (curstate == state) - return 0; - - curstate = state; if (enable) { /* wait for STATUS to clear */ if (uvd_v5_0_wait_for_idle(handle)) From 70fd80d6f7e37bf637331c682fafcce1112750ac Mon Sep 17 00:00:00 2001 From: Rex Zhu Date: Wed, 21 Dec 2016 17:44:59 +0800 Subject: [PATCH 109/297] drm/amd/powerplay: extend smu's response timeout time. Signed-off-by: Rex Zhu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/amd_shared.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index 5f59117109d7..85f358764bbc 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -23,7 +23,7 @@ #ifndef __AMD_SHARED_H__ #define __AMD_SHARED_H__ -#define AMD_MAX_USEC_TIMEOUT 100000 /* 100 ms */ +#define AMD_MAX_USEC_TIMEOUT 200000 /* 200 ms */ /* * Supported ASIC types From 5165484b02f2cbedb5bf3a41ff5e8ae16069016c Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Thu, 15 Dec 2016 13:43:59 +0800 Subject: [PATCH 110/297] drm/amdgpu: update si kicker smc firmware Use the appropriate smc firmware for each chip revision. Using the wrong one can cause stability issues. Acked-by: Edward O'Callaghan Signed-off-by: Flora Cui Reviewed-by: Junwei Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/si_dpm.c | 57 ++++++++++++++--------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/si_dpm.c b/drivers/gpu/drm/amd/amdgpu/si_dpm.c index 6c65a1a2de79..0566e9adaf8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dpm.c @@ -56,7 +56,6 @@ #define BIOS_SCRATCH_4 0x5cd MODULE_FIRMWARE("radeon/tahiti_smc.bin"); -MODULE_FIRMWARE("radeon/tahiti_k_smc.bin"); MODULE_FIRMWARE("radeon/pitcairn_smc.bin"); MODULE_FIRMWARE("radeon/pitcairn_k_smc.bin"); MODULE_FIRMWARE("radeon/verde_smc.bin"); @@ -7687,49 +7686,49 @@ static int si_dpm_init_microcode(struct amdgpu_device *adev) chip_name = "tahiti"; break; case CHIP_PITCAIRN: - if ((adev->pdev->revision == 0x81) || - (adev->pdev->device == 0x6810) || - (adev->pdev->device == 0x6811) || - (adev->pdev->device == 0x6816) || - (adev->pdev->device == 0x6817) || - (adev->pdev->device == 0x6806)) + if ((adev->pdev->revision == 0x81) && + ((adev->pdev->device == 0x6810) || + (adev->pdev->device == 0x6811))) chip_name = "pitcairn_k"; else chip_name = "pitcairn"; break; case CHIP_VERDE: - if ((adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || - (adev->pdev->revision == 0x87) || - (adev->pdev->device == 0x6820) || - (adev->pdev->device == 0x6821) || - (adev->pdev->device == 0x6822) || - (adev->pdev->device == 0x6823) || - (adev->pdev->device == 0x682A) || - (adev->pdev->device == 0x682B)) + if (((adev->pdev->device == 0x6820) && + ((adev->pdev->revision == 0x81) || + (adev->pdev->revision == 0x83))) || + ((adev->pdev->device == 0x6821) && + ((adev->pdev->revision == 0x83) || + (adev->pdev->revision == 0x87))) || + ((adev->pdev->revision == 0x87) && + ((adev->pdev->device == 0x6823) || + (adev->pdev->device == 0x682b)))) chip_name = "verde_k"; else chip_name = "verde"; break; case CHIP_OLAND: - if ((adev->pdev->revision == 0xC7) || - (adev->pdev->revision == 0x80) || - (adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || - (adev->pdev->revision == 0x87) || - (adev->pdev->device == 0x6604) || - (adev->pdev->device == 0x6605)) + if (((adev->pdev->revision == 0x81) && + ((adev->pdev->device == 0x6600) || + (adev->pdev->device == 0x6604) || + (adev->pdev->device == 0x6605) || + (adev->pdev->device == 0x6610))) || + ((adev->pdev->revision == 0x83) && + (adev->pdev->device == 0x6610))) chip_name = "oland_k"; else chip_name = "oland"; break; case CHIP_HAINAN: - if ((adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || - (adev->pdev->revision == 0xC3) || - (adev->pdev->device == 0x6664) || - (adev->pdev->device == 0x6665) || - (adev->pdev->device == 0x6667)) + if (((adev->pdev->revision == 0x81) && + (adev->pdev->device == 0x6660)) || + ((adev->pdev->revision == 0x83) && + ((adev->pdev->device == 0x6660) || + (adev->pdev->device == 0x6663) || + (adev->pdev->device == 0x6665) || + (adev->pdev->device == 0x6667))) || + ((adev->pdev->revision == 0xc3) && + (adev->pdev->device == 0x6665))) chip_name = "hainan_k"; else chip_name = "hainan"; From 6458bd4dfd9414cba5804eb9907fe2a824278c34 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 5 Jan 2017 12:15:52 -0500 Subject: [PATCH 111/297] drm/radeon: update smc firmware selection for SI Use the appropriate smc firmware for each chip revision. Using the wrong one can cause stability issues. Acked-by: Edward O'Callaghan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/si.c | 60 +++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index ad4d7b8b8322..e8a38d296855 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -50,7 +50,6 @@ MODULE_FIRMWARE("radeon/tahiti_ce.bin"); MODULE_FIRMWARE("radeon/tahiti_mc.bin"); MODULE_FIRMWARE("radeon/tahiti_rlc.bin"); MODULE_FIRMWARE("radeon/tahiti_smc.bin"); -MODULE_FIRMWARE("radeon/tahiti_k_smc.bin"); MODULE_FIRMWARE("radeon/PITCAIRN_pfp.bin"); MODULE_FIRMWARE("radeon/PITCAIRN_me.bin"); @@ -1657,9 +1656,6 @@ static int si_init_microcode(struct radeon_device *rdev) switch (rdev->family) { case CHIP_TAHITI: chip_name = "TAHITI"; - /* XXX: figure out which Tahitis need the new ucode */ - if (0) - new_smc = true; new_chip_name = "tahiti"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; me_req_size = SI_PM4_UCODE_SIZE * 4; @@ -1671,12 +1667,9 @@ static int si_init_microcode(struct radeon_device *rdev) break; case CHIP_PITCAIRN: chip_name = "PITCAIRN"; - if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->device == 0x6810) || - (rdev->pdev->device == 0x6811) || - (rdev->pdev->device == 0x6816) || - (rdev->pdev->device == 0x6817) || - (rdev->pdev->device == 0x6806)) + if ((rdev->pdev->revision == 0x81) && + ((rdev->pdev->device == 0x6810) || + (rdev->pdev->device == 0x6811))) new_smc = true; new_chip_name = "pitcairn"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; @@ -1689,15 +1682,15 @@ static int si_init_microcode(struct radeon_device *rdev) break; case CHIP_VERDE: chip_name = "VERDE"; - if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || - (rdev->pdev->revision == 0x87) || - (rdev->pdev->device == 0x6820) || - (rdev->pdev->device == 0x6821) || - (rdev->pdev->device == 0x6822) || - (rdev->pdev->device == 0x6823) || - (rdev->pdev->device == 0x682A) || - (rdev->pdev->device == 0x682B)) + if (((rdev->pdev->device == 0x6820) && + ((rdev->pdev->revision == 0x81) || + (rdev->pdev->revision == 0x83))) || + ((rdev->pdev->device == 0x6821) && + ((rdev->pdev->revision == 0x83) || + (rdev->pdev->revision == 0x87))) || + ((rdev->pdev->revision == 0x87) && + ((rdev->pdev->device == 0x6823) || + (rdev->pdev->device == 0x682b)))) new_smc = true; new_chip_name = "verde"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; @@ -1710,13 +1703,13 @@ static int si_init_microcode(struct radeon_device *rdev) break; case CHIP_OLAND: chip_name = "OLAND"; - if ((rdev->pdev->revision == 0xC7) || - (rdev->pdev->revision == 0x80) || - (rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || - (rdev->pdev->revision == 0x87) || - (rdev->pdev->device == 0x6604) || - (rdev->pdev->device == 0x6605)) + if (((rdev->pdev->revision == 0x81) && + ((rdev->pdev->device == 0x6600) || + (rdev->pdev->device == 0x6604) || + (rdev->pdev->device == 0x6605) || + (rdev->pdev->device == 0x6610))) || + ((rdev->pdev->revision == 0x83) && + (rdev->pdev->device == 0x6610))) new_smc = true; new_chip_name = "oland"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; @@ -1728,12 +1721,15 @@ static int si_init_microcode(struct radeon_device *rdev) break; case CHIP_HAINAN: chip_name = "HAINAN"; - if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || - (rdev->pdev->revision == 0xC3) || - (rdev->pdev->device == 0x6664) || - (rdev->pdev->device == 0x6665) || - (rdev->pdev->device == 0x6667)) + if (((rdev->pdev->revision == 0x81) && + (rdev->pdev->device == 0x6660)) || + ((rdev->pdev->revision == 0x83) && + ((rdev->pdev->device == 0x6660) || + (rdev->pdev->device == 0x6663) || + (rdev->pdev->device == 0x6665) || + (rdev->pdev->device == 0x6667))) || + ((rdev->pdev->revision == 0xc3) && + (rdev->pdev->device == 0x6665))) new_smc = true; new_chip_name = "hainan"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; From 8a08403bcb39f5d0e733bcf59a8a74f16b538f6e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 5 Jan 2017 12:39:01 -0500 Subject: [PATCH 112/297] drm/radeon: drop verde dpm quirks fixes: https://bugs.freedesktop.org/show_bug.cgi?id=98897 https://bugs.launchpad.net/bugs/1651981 Acked-by: Edward O'Callaghan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Cc: Adrian Fiergolski --- drivers/gpu/drm/radeon/si_dpm.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index 8b5e697f2549..13ba73fd9b68 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -3008,19 +3008,6 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, (rdev->pdev->device == 0x6817) || (rdev->pdev->device == 0x6806)) max_mclk = 120000; - } else if (rdev->family == CHIP_VERDE) { - if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || - (rdev->pdev->revision == 0x87) || - (rdev->pdev->device == 0x6820) || - (rdev->pdev->device == 0x6821) || - (rdev->pdev->device == 0x6822) || - (rdev->pdev->device == 0x6823) || - (rdev->pdev->device == 0x682A) || - (rdev->pdev->device == 0x682B)) { - max_sclk = 75000; - max_mclk = 80000; - } } else if (rdev->family == CHIP_OLAND) { if ((rdev->pdev->revision == 0xC7) || (rdev->pdev->revision == 0x80) || From 7192c54a68013f6058b1bb505645fcd07015191c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 5 Jan 2017 13:02:37 -0500 Subject: [PATCH 113/297] drm/amdgpu: drop verde dpm quirks Port of radeon change to amdgpu. Acked-by: Edward O'Callaghan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/si_dpm.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/si_dpm.c b/drivers/gpu/drm/amd/amdgpu/si_dpm.c index 0566e9adaf8a..10bedfac27b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dpm.c @@ -3487,19 +3487,6 @@ static void si_apply_state_adjust_rules(struct amdgpu_device *adev, (adev->pdev->device == 0x6817) || (adev->pdev->device == 0x6806)) max_mclk = 120000; - } else if (adev->asic_type == CHIP_VERDE) { - if ((adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || - (adev->pdev->revision == 0x87) || - (adev->pdev->device == 0x6820) || - (adev->pdev->device == 0x6821) || - (adev->pdev->device == 0x6822) || - (adev->pdev->device == 0x6823) || - (adev->pdev->device == 0x682A) || - (adev->pdev->device == 0x682B)) { - max_sclk = 75000; - max_mclk = 80000; - } } else if (adev->asic_type == CHIP_OLAND) { if ((adev->pdev->revision == 0xC7) || (adev->pdev->revision == 0x80) || From bcd5e1a49f0d54afd3c5411bed2f59996e1c53e4 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 6 Jan 2017 14:26:54 -0500 Subject: [PATCH 114/297] netlabel: add CALIPSO to the list of built-in protocols When we added CALIPSO support in Linux v4.8 we forgot to add it to the list of supported protocols with display at boot. Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/netlabel/netlabel_kapi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 28c56b95fb7f..ea7c67050792 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1502,10 +1502,7 @@ static int __init netlbl_init(void) printk(KERN_INFO "NetLabel: Initializing\n"); printk(KERN_INFO "NetLabel: domain hash size = %u\n", (1 << NETLBL_DOMHSH_BITSIZE)); - printk(KERN_INFO "NetLabel: protocols =" - " UNLABELED" - " CIPSOv4" - "\n"); + printk(KERN_INFO "NetLabel: protocols = UNLABELED CIPSOv4 CALIPSO\n"); ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE); if (ret_val != 0) From 1d0f110a2c6c4bca3dbcc4b0e27f1e3dc2d44a2c Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 6 Jan 2017 20:30:02 +0100 Subject: [PATCH 115/297] be2net: fix accesses to unicast list Commit 988d44b "be2net: Avoid redundant addition of mac address in HW" introduced be_dev_mac_add & be_uc_mac_add helpers that incorrectly access adapter->uc_list as an array of bytes instead of an array of be_eth_addr. Consequently NIC is not filled with valid data so unicast filtering is broken. Cc: Sathya Perla Cc: Ajit Khaparde Cc: Sriharsha Basavapatna Cc: Somnath Kotur Fixes: 988d44b be2net: Avoid redundant addition of mac address in HW Signed-off-by: Ivan Vecera Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 225e9a4877d7..3510352866c2 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -275,8 +275,7 @@ static int be_dev_mac_add(struct be_adapter *adapter, u8 *mac) /* Check if mac has already been added as part of uc-list */ for (i = 0; i < adapter->uc_macs; i++) { - if (ether_addr_equal((u8 *)&adapter->uc_list[i * ETH_ALEN], - mac)) { + if (ether_addr_equal(adapter->uc_list[i].mac, mac)) { /* mac already added, skip addition */ adapter->pmac_id[0] = adapter->pmac_id[i + 1]; return 0; @@ -1655,14 +1654,12 @@ static void be_clear_mc_list(struct be_adapter *adapter) static int be_uc_mac_add(struct be_adapter *adapter, int uc_idx) { - if (ether_addr_equal((u8 *)&adapter->uc_list[uc_idx * ETH_ALEN], - adapter->dev_mac)) { + if (ether_addr_equal(adapter->uc_list[uc_idx].mac, adapter->dev_mac)) { adapter->pmac_id[uc_idx + 1] = adapter->pmac_id[0]; return 0; } - return be_cmd_pmac_add(adapter, - (u8 *)&adapter->uc_list[uc_idx * ETH_ALEN], + return be_cmd_pmac_add(adapter, adapter->uc_list[uc_idx].mac, adapter->if_handle, &adapter->pmac_id[uc_idx + 1], 0); } From 20b1e22d01a4b0b11d3a1066e9feb04be38607ec Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 5 Jan 2017 13:51:29 +0100 Subject: [PATCH 116/297] x86/efi: Don't allocate memmap through memblock after mm_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the following commit: 4bc9f92e64c8 ("x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data") ... efi_bgrt_init() calls into the memblock allocator through efi_mem_reserve() => efi_arch_mem_reserve() *after* mm_init() has been called. Indeed, KASAN reports a bad read access later on in efi_free_boot_services(): BUG: KASAN: use-after-free in efi_free_boot_services+0xae/0x24c at addr ffff88022de12740 Read of size 4 by task swapper/0/0 page:ffffea0008b78480 count:0 mapcount:-127 mapping: (null) index:0x1 flags: 0x5fff8000000000() [...] Call Trace: dump_stack+0x68/0x9f kasan_report_error+0x4c8/0x500 kasan_report+0x58/0x60 __asan_load4+0x61/0x80 efi_free_boot_services+0xae/0x24c start_kernel+0x527/0x562 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x157/0x17a start_cpu+0x5/0x14 The instruction at the given address is the first read from the memmap's memory, i.e. the read of md->type in efi_free_boot_services(). Note that the writes earlier in efi_arch_mem_reserve() don't splat because they're done through early_memremap()ed addresses. So, after memblock is gone, allocations should be done through the "normal" page allocator. Introduce a helper, efi_memmap_alloc() for this. Use it from efi_arch_mem_reserve(), efi_free_boot_services() and, for the sake of consistency, from efi_fake_memmap() as well. Note that for the latter, the memmap allocations cease to be page aligned. This isn't needed though. Tested-by: Dan Williams Signed-off-by: Nicolai Stange Reviewed-by: Ard Biesheuvel Cc: # v4.9 Cc: Dave Young Cc: Linus Torvalds Cc: Matt Fleming Cc: Mika Penttilä Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Fixes: 4bc9f92e64c8 ("x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data") Link: http://lkml.kernel.org/r/20170105125130.2815-1-nicstange@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/quirks.c | 4 ++-- drivers/firmware/efi/fake_mem.c | 3 +-- drivers/firmware/efi/memmap.c | 38 +++++++++++++++++++++++++++++++++ include/linux/efi.h | 1 + 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 10aca63a50d7..30031d5293c4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -214,7 +214,7 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) new_size = efi.memmap.desc_size * num_entries; - new_phys = memblock_alloc(new_size, 0); + new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { pr_err("Could not allocate boot services memmap\n"); return; @@ -355,7 +355,7 @@ void __init efi_free_boot_services(void) } new_size = efi.memmap.desc_size * num_entries; - new_phys = memblock_alloc(new_size, 0); + new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { pr_err("Failed to allocate new EFI memmap\n"); return; diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 520a40e5e0e4..6c7d60c239b5 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -71,8 +71,7 @@ void __init efi_fake_memmap(void) } /* allocate memory for new EFI memmap */ - new_memmap_phy = memblock_alloc(efi.memmap.desc_size * new_nr_map, - PAGE_SIZE); + new_memmap_phy = efi_memmap_alloc(new_nr_map); if (!new_memmap_phy) return; diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index f03ddecd232b..78686443cb37 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -9,6 +9,44 @@ #include #include #include +#include +#include + +static phys_addr_t __init __efi_memmap_alloc_early(unsigned long size) +{ + return memblock_alloc(size, 0); +} + +static phys_addr_t __init __efi_memmap_alloc_late(unsigned long size) +{ + unsigned int order = get_order(size); + struct page *p = alloc_pages(GFP_KERNEL, order); + + if (!p) + return 0; + + return PFN_PHYS(page_to_pfn(p)); +} + +/** + * efi_memmap_alloc - Allocate memory for the EFI memory map + * @num_entries: Number of entries in the allocated map. + * + * Depending on whether mm_init() has already been invoked or not, + * either memblock or "normal" page allocation is used. + * + * Returns the physical address of the allocated memory map on + * success, zero on failure. + */ +phys_addr_t __init efi_memmap_alloc(unsigned int num_entries) +{ + unsigned long size = num_entries * efi.memmap.desc_size; + + if (slab_is_available()) + return __efi_memmap_alloc_late(size); + + return __efi_memmap_alloc_early(size); +} /** * __efi_memmap_init - Common code for mapping the EFI memory map diff --git a/include/linux/efi.h b/include/linux/efi.h index a07a476178cd..0c5420208c40 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -950,6 +950,7 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); +extern phys_addr_t __init efi_memmap_alloc(unsigned int num_entries); extern int __init efi_memmap_init_early(struct efi_memory_map_data *data); extern int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size); extern void __init efi_memmap_unmap(void); From 6052cd1af86f9833b6b0b60d5d4787c4a06d65ea Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 6 Jan 2017 21:59:30 +0100 Subject: [PATCH 117/297] be2net: fix unicast list filling The adapter->pmac_id[0] item is used for primary MAC address but this is not true for adapter->uc_list[0] as is assumed in be_set_uc_list(). There are N UC addresses copied first from net_device to adapter->uc_list[1..N] and then N UC addresses from adapter->uc_list[0..N-1] are sent to HW. So the last UC address is never stored into HW and address 00:00:00:00;00:00 (from uc_list[0]) is used instead. Cc: Sathya Perla Cc: Ajit Khaparde Cc: Sriharsha Basavapatna Cc: Somnath Kotur Fixes: b717241 be2net: replace polling with sleeping in the FW completion path Signed-off-by: Ivan Vecera Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 3510352866c2..ec010ced6c99 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1695,9 +1695,8 @@ static void be_set_uc_list(struct be_adapter *adapter) } if (adapter->update_uc_list) { - i = 1; /* First slot is claimed by the Primary MAC */ - /* cache the uc-list in adapter array */ + i = 0; netdev_for_each_uc_addr(ha, netdev) { ether_addr_copy(adapter->uc_list[i].mac, ha->addr); i++; From f5992b72ebe0dde488fa8f706b887194020c66fc Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 6 Jan 2017 16:18:53 -0500 Subject: [PATCH 118/297] tg3: Fix race condition in tg3_get_stats64(). The driver's ndo_get_stats64() method is not always called under RTNL. So it can race with driver close or ethtool reconfigurations. Fix the race condition by taking tp->lock spinlock in tg3_free_consistent() when freeing the tp->hw_stats memory block. tg3_get_stats64() is already taking tp->lock. Reported-by: Wang Yufen Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 185e9e047aa9..ae42de4fdddf 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -8720,11 +8720,14 @@ static void tg3_free_consistent(struct tg3 *tp) tg3_mem_rx_release(tp); tg3_mem_tx_release(tp); + /* Protect tg3_get_stats64() from reading freed tp->hw_stats. */ + tg3_full_lock(tp, 0); if (tp->hw_stats) { dma_free_coherent(&tp->pdev->dev, sizeof(struct tg3_hw_stats), tp->hw_stats, tp->stats_mapping); tp->hw_stats = NULL; } + tg3_full_unlock(tp); } /* From 9d5ecb09d525469abd1a10c096cb5a17206523f2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 7 Jan 2017 00:26:33 +0100 Subject: [PATCH 119/297] bpf: change back to orig prog on too many passes If after too many passes still no image could be emitted, then swap back to the original program as we do in all other cases and don't use the one with blinding. Fixes: 959a75791603 ("bpf, x86: add support for constant blinding") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e76d1af60f7a..bb660e53cbd6 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1172,6 +1172,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; prog->jited = 1; + } else { + prog = orig_prog; } out_addrs: From a2cd64f30140c5aebd9359f66c00c19d5c6bece6 Mon Sep 17 00:00:00 2001 From: "Kweh, Hock Leong" Date: Sat, 7 Jan 2017 17:32:03 +0800 Subject: [PATCH 120/297] net: stmmac: fix maxmtu assignment to be within valid range There is no checking valid value of maxmtu when getting it from device tree. This resolution added the checking condition to ensure the assignment is made within a valid range. Signed-off-by: Kweh, Hock Leong Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 10 +++++++++- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 39eb7a65bb9f..a276a32d57f2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3319,8 +3319,16 @@ int stmmac_dvr_probe(struct device *device, ndev->max_mtu = JUMBO_LEN; else ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD + NET_IP_ALIGN); - if (priv->plat->maxmtu < ndev->max_mtu) + /* Will not overwrite ndev->max_mtu if plat->maxmtu > ndev->max_mtu + * as well as plat->maxmtu < ndev->min_mtu which is a invalid range. + */ + if ((priv->plat->maxmtu < ndev->max_mtu) && + (priv->plat->maxmtu >= ndev->min_mtu)) ndev->max_mtu = priv->plat->maxmtu; + else if (priv->plat->maxmtu < ndev->min_mtu) + netdev_warn(priv->dev, + "%s: warning: maxmtu having invalid value (%d)\n", + __func__, priv->plat->maxmtu); if (flow_ctrl) priv->flow_ctrl = FLOW_AUTO; /* RX/TX pause on */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index a2831773431a..3da4737620cb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -89,6 +89,9 @@ static void stmmac_default_data(struct plat_stmmacenet_data *plat) /* Set default value for unicast filter entries */ plat->unicast_filter_entries = 1; + + /* Set the maxmtu to a default of JUMBO_LEN */ + plat->maxmtu = JUMBO_LEN; } static int quark_default_data(struct plat_stmmacenet_data *plat, @@ -126,6 +129,9 @@ static int quark_default_data(struct plat_stmmacenet_data *plat, /* Set default value for unicast filter entries */ plat->unicast_filter_entries = 1; + /* Set the maxmtu to a default of JUMBO_LEN */ + plat->maxmtu = JUMBO_LEN; + return 0; } From a4c61b92b3a4cbda35bb0251a5063a68f0861b2c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 7 Jan 2017 21:01:56 -0800 Subject: [PATCH 121/297] net: dsa: bcm_sf2: Do not clobber b53_switch_ops We make the bcm_sf2 driver override ds->ops which points to b53_switch_ops since b53_switch_alloc() did the assignent. This is all well and good until a second b53 switch comes in, and ends up using the bcm_sf2 operations. Make a proper local copy, substitute the ds->ops pointer and then override the operations. Fixes: f458995b9ad8 ("net: dsa: bcm_sf2: Utilize core B53 driver when possible") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 9ec33b51a0ed..2f9f910c0e40 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -982,6 +982,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) const char *reg_names[BCM_SF2_REGS_NUM] = BCM_SF2_REGS_NAME; struct device_node *dn = pdev->dev.of_node; struct b53_platform_data *pdata; + struct dsa_switch_ops *ops; struct bcm_sf2_priv *priv; struct b53_device *dev; struct dsa_switch *ds; @@ -995,6 +996,10 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; + ops = devm_kzalloc(&pdev->dev, sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + dev = b53_switch_alloc(&pdev->dev, &bcm_sf2_io_ops, priv); if (!dev) return -ENOMEM; @@ -1014,6 +1019,8 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) ds = dev->ds; /* Override the parts that are non-standard wrt. normal b53 devices */ + memcpy(ops, ds->ops, sizeof(*ops)); + ds->ops = ops; ds->ops->get_tag_protocol = bcm_sf2_sw_get_tag_protocol; ds->ops->setup = bcm_sf2_sw_setup; ds->ops->get_phy_flags = bcm_sf2_sw_get_phy_flags; From 2cfe8f8290bd28cf1ee67db914a6e76cf8e6437b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 7 Jan 2017 21:01:57 -0800 Subject: [PATCH 122/297] net: dsa: bcm_sf2: Utilize nested MDIO read/write We are implementing a MDIO bus which is behind another one, so use the nested version of the accessors to get lockdep annotations correct. Fixes: 461cd1b03e32 ("net: dsa: bcm_sf2: Register our slave MDIO bus") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 2f9f910c0e40..2ce7ae97ac91 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -393,7 +393,7 @@ static int bcm_sf2_sw_mdio_read(struct mii_bus *bus, int addr, int regnum) if (addr == BRCM_PSEUDO_PHY_ADDR && priv->indir_phy_mask & BIT(addr)) return bcm_sf2_sw_indir_rw(priv, 1, addr, regnum, 0); else - return mdiobus_read(priv->master_mii_bus, addr, regnum); + return mdiobus_read_nested(priv->master_mii_bus, addr, regnum); } static int bcm_sf2_sw_mdio_write(struct mii_bus *bus, int addr, int regnum, @@ -407,7 +407,7 @@ static int bcm_sf2_sw_mdio_write(struct mii_bus *bus, int addr, int regnum, if (addr == BRCM_PSEUDO_PHY_ADDR && priv->indir_phy_mask & BIT(addr)) bcm_sf2_sw_indir_rw(priv, 0, addr, regnum, val); else - mdiobus_write(priv->master_mii_bus, addr, regnum, val); + mdiobus_write_nested(priv->master_mii_bus, addr, regnum, val); return 0; } From ac0c7cf8be00f269f82964cf7b144ca3edc5dbc4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 6 Jan 2017 14:12:51 +0100 Subject: [PATCH 123/297] btrfs: fix crash when tracepoint arguments are freed by wq callbacks Enabling btrfs tracepoints leads to instant crash, as reported. The wq callbacks could free the memory and the tracepoints started to dereference the members to get to fs_info. The proposed fix https://marc.info/?l=linux-btrfs&m=148172436722606&w=2 removed the tracepoints but we could preserve them by passing only the required data in a safe way. Fixes: bc074524e123 ("btrfs: prefix fsid to all trace events") CC: stable@vger.kernel.org # 4.8+ Reported-by: Sebastian Andrzej Siewior Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 15 +++++++++++---- include/trace/events/btrfs.h | 22 +++++++++++++--------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 63d197724519..ff0b0be92d61 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -273,6 +273,8 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) unsigned long flags; while (1) { + void *wtag; + spin_lock_irqsave(lock, flags); if (list_empty(list)) break; @@ -299,11 +301,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) spin_unlock_irqrestore(lock, flags); /* - * we don't want to call the ordered free functions - * with the lock held though + * We don't want to call the ordered free functions with the + * lock held though. Save the work as tag for the trace event, + * because the callback could free the structure. */ + wtag = work; work->ordered_free(work); - trace_btrfs_all_work_done(work); + trace_btrfs_all_work_done(wq->fs_info, wtag); } spin_unlock_irqrestore(lock, flags); } @@ -311,6 +315,7 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) static void normal_work_helper(struct btrfs_work *work) { struct __btrfs_workqueue *wq; + void *wtag; int need_order = 0; /* @@ -324,6 +329,8 @@ static void normal_work_helper(struct btrfs_work *work) if (work->ordered_func) need_order = 1; wq = work->wq; + /* Safe for tracepoints in case work gets freed by the callback */ + wtag = work; trace_btrfs_work_sched(work); thresh_exec_hook(wq); @@ -333,7 +340,7 @@ static void normal_work_helper(struct btrfs_work *work) run_ordered_work(wq); } if (!need_order) - trace_btrfs_all_work_done(work); + trace_btrfs_all_work_done(wq->fs_info, wtag); } void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index c14bed4ab097..b09225c77676 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1157,22 +1157,26 @@ DECLARE_EVENT_CLASS(btrfs__work, __entry->func, __entry->ordered_func, __entry->ordered_free) ); -/* For situiations that the work is freed */ +/* + * For situiations when the work is freed, we pass fs_info and a tag that that + * matches address of the work structure so it can be paired with the + * scheduling event. + */ DECLARE_EVENT_CLASS(btrfs__work__done, - TP_PROTO(struct btrfs_work *work), + TP_PROTO(struct btrfs_fs_info *fs_info, void *wtag), - TP_ARGS(work), + TP_ARGS(fs_info, wtag), TP_STRUCT__entry_btrfs( - __field( void *, work ) + __field( void *, wtag ) ), - TP_fast_assign_btrfs(btrfs_work_owner(work), - __entry->work = work; + TP_fast_assign_btrfs(fs_info, + __entry->wtag = wtag; ), - TP_printk_btrfs("work->%p", __entry->work) + TP_printk_btrfs("work->%p", __entry->wtag) ); DEFINE_EVENT(btrfs__work, btrfs_work_queued, @@ -1191,9 +1195,9 @@ DEFINE_EVENT(btrfs__work, btrfs_work_sched, DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done, - TP_PROTO(struct btrfs_work *work), + TP_PROTO(struct btrfs_fs_info *fs_info, void *wtag), - TP_ARGS(work) + TP_ARGS(fs_info, wtag) ); DEFINE_EVENT(btrfs__work, btrfs_ordered_sched, From 92a1bf76a89ad338f00eb9a2c7689a3907fbcaad Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 17 Nov 2016 15:00:50 -0800 Subject: [PATCH 124/297] Btrfs: add 'inode' for extent map tracepoint 'inode' is an important field for btrfs_get_extent, lets trace it. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- include/trace/events/btrfs.h | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a713d9d324b0..fab189c67eff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7059,7 +7059,7 @@ insert: write_unlock(&em_tree->lock); out: - trace_btrfs_get_extent(root, em); + trace_btrfs_get_extent(root, inode, em); btrfs_free_path(path); if (trans) { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b09225c77676..3048f5205363 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -184,14 +184,16 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, TRACE_EVENT_CONDITION(btrfs_get_extent, - TP_PROTO(struct btrfs_root *root, struct extent_map *map), + TP_PROTO(struct btrfs_root *root, struct inode *inode, + struct extent_map *map), - TP_ARGS(root, map), + TP_ARGS(root, inode, map), TP_CONDITION(map), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) + __field( u64, ino ) __field( u64, start ) __field( u64, len ) __field( u64, orig_start ) @@ -204,7 +206,8 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; - __entry->start = map->start; + __entry->ino = btrfs_ino(inode); + __entry->start = map->start; __entry->len = map->len; __entry->orig_start = map->orig_start; __entry->block_start = map->block_start; @@ -214,11 +217,12 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->compress_type = map->compress_type; ), - TP_printk_btrfs("root = %llu(%s), start = %llu, len = %llu, " + TP_printk_btrfs("root = %llu(%s), ino = %llu start = %llu, len = %llu, " "orig_start = %llu, block_start = %llu(%s), " "block_len = %llu, flags = %s, refs = %u, " "compress_type = %u", show_root_type(__entry->root_objectid), + (unsigned long long)__entry->ino, (unsigned long long)__entry->start, (unsigned long long)__entry->len, (unsigned long long)__entry->orig_start, From 7856654842bdbebc0fbcbf51573da5d70a787aba Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 30 Nov 2016 16:10:10 -0800 Subject: [PATCH 125/297] Btrfs: add truncated_len for ordered extent tracepoints This can help us monitor truncated ordered extents. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 3048f5205363..2026a89786b0 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -263,6 +263,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __field( int, compress_type ) __field( int, refs ) __field( u64, root_objectid ) + __field( u64, truncated_len ) ), TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), @@ -277,10 +278,12 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __entry->refs = atomic_read(&ordered->refs); __entry->root_objectid = BTRFS_I(inode)->root->root_key.objectid; + __entry->truncated_len = ordered->truncated_len; ), TP_printk_btrfs("root = %llu(%s), ino = %llu, file_offset = %llu, " "start = %llu, len = %llu, disk_len = %llu, " + "truncated_len = %llu, " "bytes_left = %llu, flags = %s, compress_type = %d, " "refs = %d", show_root_type(__entry->root_objectid), @@ -289,6 +292,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, (unsigned long long)__entry->start, (unsigned long long)__entry->len, (unsigned long long)__entry->disk_len, + (unsigned long long)__entry->truncated_len, (unsigned long long)__entry->bytes_left, show_ordered_flags(__entry->flags), __entry->compress_type, __entry->refs) From 562a7a07bf61e2949f7cbdb6ac7537ad9e2794d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 6 Jan 2017 15:51:36 +0100 Subject: [PATCH 126/297] btrfs: make tracepoint format strings more compact We've recently added the fsid to trace events, this makes the line quite long. To reduce the it again, remove extra spaces around = and remove ",". Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 112 +++++++++++++++++------------------ 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 2026a89786b0..88d18a8ceb59 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -130,8 +130,8 @@ DECLARE_EVENT_CLASS(btrfs__inode, BTRFS_I(inode)->root->root_key.objectid; ), - TP_printk_btrfs("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, " - "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu", + TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%lu blocks=%llu " + "disk_i_size=%llu last_trans=%llu logged_trans=%llu", show_root_type(__entry->root_objectid), (unsigned long long)__entry->generation, (unsigned long)__entry->ino, @@ -217,10 +217,10 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->compress_type = map->compress_type; ), - TP_printk_btrfs("root = %llu(%s), ino = %llu start = %llu, len = %llu, " - "orig_start = %llu, block_start = %llu(%s), " - "block_len = %llu, flags = %s, refs = %u, " - "compress_type = %u", + TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu " + "orig_start=%llu block_start=%llu(%s) " + "block_len=%llu flags=%s refs=%u " + "compress_type=%u", show_root_type(__entry->root_objectid), (unsigned long long)__entry->ino, (unsigned long long)__entry->start, @@ -281,11 +281,11 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __entry->truncated_len = ordered->truncated_len; ), - TP_printk_btrfs("root = %llu(%s), ino = %llu, file_offset = %llu, " - "start = %llu, len = %llu, disk_len = %llu, " - "truncated_len = %llu, " - "bytes_left = %llu, flags = %s, compress_type = %d, " - "refs = %d", + TP_printk_btrfs("root=%llu(%s) ino=%llu file_offset=%llu " + "start=%llu len=%llu disk_len=%llu " + "truncated_len=%llu " + "bytes_left=%llu flags=%s compress_type=%d " + "refs=%d", show_root_type(__entry->root_objectid), (unsigned long long)__entry->ino, (unsigned long long)__entry->file_offset, @@ -362,10 +362,10 @@ DECLARE_EVENT_CLASS(btrfs__writepage, BTRFS_I(inode)->root->root_key.objectid; ), - TP_printk_btrfs("root = %llu(%s), ino = %lu, page_index = %lu, " - "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " - "range_end = %llu, for_kupdate = %d, " - "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", + TP_printk_btrfs("root=%llu(%s) ino=%lu page_index=%lu " + "nr_to_write=%ld pages_skipped=%ld range_start=%llu " + "range_end=%llu for_kupdate=%d " + "for_reclaim=%d range_cyclic=%d writeback_index=%lu", show_root_type(__entry->root_objectid), (unsigned long)__entry->ino, __entry->index, __entry->nr_to_write, __entry->pages_skipped, @@ -408,8 +408,8 @@ TRACE_EVENT(btrfs_writepage_end_io_hook, BTRFS_I(page->mapping->host)->root->root_key.objectid; ), - TP_printk_btrfs("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, " - "end = %llu, uptodate = %d", + TP_printk_btrfs("root=%llu(%s) ino=%lu page_index=%lu start=%llu " + "end=%llu uptodate=%d", show_root_type(__entry->root_objectid), (unsigned long)__entry->ino, (unsigned long)__entry->index, (unsigned long long)__entry->start, @@ -441,7 +441,7 @@ TRACE_EVENT(btrfs_sync_file, BTRFS_I(inode)->root->root_key.objectid; ), - TP_printk_btrfs("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d", + TP_printk_btrfs("root=%llu(%s) ino=%ld parent=%ld datasync=%d", show_root_type(__entry->root_objectid), (unsigned long)__entry->ino, (unsigned long)__entry->parent, __entry->datasync) @@ -492,9 +492,9 @@ TRACE_EVENT(btrfs_add_block_group, __entry->create = create; ), - TP_printk("%pU: block_group offset = %llu, size = %llu, " - "flags = %llu(%s), bytes_used = %llu, bytes_super = %llu, " - "create = %d", __entry->fsid, + TP_printk("%pU: block_group offset=%llu size=%llu " + "flags=%llu(%s) bytes_used=%llu bytes_super=%llu " + "create=%d", __entry->fsid, (unsigned long long)__entry->offset, (unsigned long long)__entry->size, (unsigned long long)__entry->flags, @@ -543,9 +543,9 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, __entry->seq = ref->seq; ), - TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, " - "parent = %llu(%s), ref_root = %llu(%s), level = %d, " - "type = %s, seq = %llu", + TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s " + "parent=%llu(%s) ref_root=%llu(%s) level=%d " + "type=%s seq=%llu", (unsigned long long)__entry->bytenr, (unsigned long long)__entry->num_bytes, show_ref_action(__entry->action), @@ -608,9 +608,9 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, __entry->seq = ref->seq; ), - TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, " - "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, " - "offset = %llu, type = %s, seq = %llu", + TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s " + "parent=%llu(%s) ref_root=%llu(%s) owner=%llu " + "offset=%llu type=%s seq=%llu", (unsigned long long)__entry->bytenr, (unsigned long long)__entry->num_bytes, show_ref_action(__entry->action), @@ -665,7 +665,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, __entry->is_data = head_ref->is_data; ), - TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d", + TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s is_data=%d", (unsigned long long)__entry->bytenr, (unsigned long long)__entry->num_bytes, show_ref_action(__entry->action), @@ -729,8 +729,8 @@ DECLARE_EVENT_CLASS(btrfs__chunk, __entry->root_objectid = fs_info->chunk_root->root_key.objectid; ), - TP_printk_btrfs("root = %llu(%s), offset = %llu, size = %llu, " - "num_stripes = %d, sub_stripes = %d, type = %s", + TP_printk_btrfs("root=%llu(%s) offset=%llu size=%llu " + "num_stripes=%d sub_stripes=%d type=%s", show_root_type(__entry->root_objectid), (unsigned long long)__entry->offset, (unsigned long long)__entry->size, @@ -779,8 +779,8 @@ TRACE_EVENT(btrfs_cow_block, __entry->cow_level = btrfs_header_level(cow); ), - TP_printk_btrfs("root = %llu(%s), refs = %d, orig_buf = %llu " - "(orig_level = %d), cow_buf = %llu (cow_level = %d)", + TP_printk_btrfs("root=%llu(%s) refs=%d orig_buf=%llu " + "(orig_level=%d) cow_buf=%llu (cow_level=%d)", show_root_type(__entry->root_objectid), __entry->refs, (unsigned long long)__entry->buf_start, @@ -844,7 +844,7 @@ TRACE_EVENT(btrfs_trigger_flush, __assign_str(reason, reason) ), - TP_printk("%pU: %s: flush = %d(%s), flags = %llu(%s), bytes = %llu", + TP_printk("%pU: %s: flush=%d(%s) flags=%llu(%s) bytes=%llu", __entry->fsid, __get_str(reason), __entry->flush, show_flush_action(__entry->flush), (unsigned long long)__entry->flags, @@ -887,8 +887,8 @@ TRACE_EVENT(btrfs_flush_space, __entry->ret = ret; ), - TP_printk("%pU: state = %d(%s), flags = %llu(%s), num_bytes = %llu, " - "orig_bytes = %llu, ret = %d", __entry->fsid, __entry->state, + TP_printk("%pU: state=%d(%s) flags=%llu(%s) num_bytes=%llu " + "orig_bytes=%llu ret=%d", __entry->fsid, __entry->state, show_flush_state(__entry->state), (unsigned long long)__entry->flags, __print_flags((unsigned long)__entry->flags, "|", @@ -913,7 +913,7 @@ DECLARE_EVENT_CLASS(btrfs__reserved_extent, __entry->len = len; ), - TP_printk_btrfs("root = %llu(%s), start = %llu, len = %llu", + TP_printk_btrfs("root=%llu(%s) start=%llu len=%llu", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), (unsigned long long)__entry->start, (unsigned long long)__entry->len) @@ -952,7 +952,7 @@ TRACE_EVENT(find_free_extent, __entry->data = data; ), - TP_printk_btrfs("root = %Lu(%s), len = %Lu, empty_size = %Lu, flags = %Lu(%s)", + TP_printk_btrfs("root=%Lu(%s) len=%Lu empty_size=%Lu flags=%Lu(%s)", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->num_bytes, __entry->empty_size, __entry->data, __print_flags((unsigned long)__entry->data, "|", @@ -981,8 +981,8 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, __entry->len = len; ), - TP_printk_btrfs("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " - "start = %Lu, len = %Lu", + TP_printk_btrfs("root=%Lu(%s) block_group=%Lu flags=%Lu(%s) " + "start=%Lu len=%Lu", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, @@ -1033,8 +1033,8 @@ TRACE_EVENT(btrfs_find_cluster, __entry->min_bytes = min_bytes; ), - TP_printk_btrfs("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," - " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid, + TP_printk_btrfs("block_group=%Lu flags=%Lu(%s) start=%Lu len=%Lu " + "empty_size=%Lu min_bytes=%Lu", __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->start, @@ -1055,7 +1055,7 @@ TRACE_EVENT(btrfs_failed_cluster_setup, __entry->bg_objectid = block_group->key.objectid; ), - TP_printk_btrfs("block_group = %Lu", __entry->bg_objectid) + TP_printk_btrfs("block_group=%Lu", __entry->bg_objectid) ); TRACE_EVENT(btrfs_setup_cluster, @@ -1083,8 +1083,8 @@ TRACE_EVENT(btrfs_setup_cluster, __entry->bitmap = bitmap; ), - TP_printk_btrfs("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " - "size = %Lu, max_size = %Lu, bitmap = %d", + TP_printk_btrfs("block_group=%Lu flags=%Lu(%s) window_start=%Lu " + "size=%Lu max_size=%Lu bitmap=%d", __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", @@ -1111,7 +1111,7 @@ TRACE_EVENT(alloc_extent_state, __entry->ip = IP ), - TP_printk("state=%p; mask = %s; caller = %pS", __entry->state, + TP_printk("state=%p mask=%s caller=%pS", __entry->state, show_gfp_flags(__entry->mask), (void *)__entry->ip) ); @@ -1131,7 +1131,7 @@ TRACE_EVENT(free_extent_state, __entry->ip = IP ), - TP_printk(" state=%p; caller = %pS", __entry->state, + TP_printk("state=%p caller=%pS", __entry->state, (void *)__entry->ip) ); @@ -1159,8 +1159,8 @@ DECLARE_EVENT_CLASS(btrfs__work, __entry->normal_work = &work->normal_work; ), - TP_printk_btrfs("work=%p (normal_work=%p), wq=%p, func=%pf, ordered_func=%p," - " ordered_free=%p", + TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%pf ordered_func=%p " + "ordered_free=%p", __entry->work, __entry->normal_work, __entry->wq, __entry->func, __entry->ordered_func, __entry->ordered_free) ); @@ -1233,7 +1233,7 @@ DECLARE_EVENT_CLASS(btrfs__workqueue, __entry->high = high; ), - TP_printk_btrfs("name=%s%s, wq=%p", __get_str(name), + TP_printk_btrfs("name=%s%s wq=%p", __get_str(name), __print_flags(__entry->high, "", {(WQ_HIGHPRI), "-high"}), __entry->wq) @@ -1288,7 +1288,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_data_map, __entry->free_reserved = free_reserved; ), - TP_printk_btrfs("rootid=%llu, ino=%lu, free_reserved=%llu", + TP_printk_btrfs("rootid=%llu ino=%lu free_reserved=%llu", __entry->rootid, __entry->ino, __entry->free_reserved) ); @@ -1335,7 +1335,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, __entry->op = op; ), - TP_printk_btrfs("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s", + TP_printk_btrfs("root=%llu ino=%lu start=%llu len=%llu reserved=%llu op=%s", __entry->rootid, __entry->ino, __entry->start, __entry->len, __entry->reserved, __print_flags((unsigned long)__entry->op, "", @@ -1373,7 +1373,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref, __entry->reserved = reserved; ), - TP_printk_btrfs("root=%llu, reserved=%llu, op=free", + TP_printk_btrfs("root=%llu reserved=%llu op=free", __entry->ref_root, __entry->reserved) ); @@ -1400,7 +1400,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, __entry->num_bytes = rec->num_bytes; ), - TP_printk_btrfs("bytenr = %llu, num_bytes = %llu", + TP_printk_btrfs("bytenr=%llu num_bytes=%llu", (unsigned long long)__entry->bytenr, (unsigned long long)__entry->num_bytes) ); @@ -1442,8 +1442,8 @@ TRACE_EVENT(btrfs_qgroup_account_extent, __entry->nr_new_roots = nr_new_roots; ), - TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, " - "nr_new_roots = %llu", + TP_printk_btrfs("bytenr=%llu num_bytes=%llu nr_old_roots=%llu " + "nr_new_roots=%llu", __entry->bytenr, __entry->num_bytes, __entry->nr_old_roots, @@ -1469,7 +1469,7 @@ TRACE_EVENT(qgroup_update_counters, __entry->cur_new_count = cur_new_count; ), - TP_printk_btrfs("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu", + TP_printk_btrfs("qgid=%llu cur_old_count=%llu cur_new_count=%llu", __entry->qgid, __entry->cur_old_count, __entry->cur_new_count) From fac69d0efad08fc15e4dbfc116830782acc0dc9a Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 7 Jan 2017 10:38:31 +0100 Subject: [PATCH 127/297] x86/boot: Add missing declaration of string functions Add the missing declarations of basic string functions to string.h to allow a clean build. Fixes: 5be865661516 ("String-handling functions for the new x86 setup code.") Signed-off-by: Nicholas Mc Guire Link: http://lkml.kernel.org/r/1483781911-21399-1-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- arch/x86/boot/string.c | 1 + arch/x86/boot/string.h | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index cc3bd583dce1..9e240fcba784 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -14,6 +14,7 @@ #include #include "ctype.h" +#include "string.h" int memcmp(const void *s1, const void *s2, size_t len) { diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 725e820602b1..113588ddb43f 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -18,4 +18,13 @@ int memcmp(const void *s1, const void *s2, size_t len); #define memset(d,c,l) __builtin_memset(d,c,l) #define memcmp __builtin_memcmp +extern int strcmp(const char *str1, const char *str2); +extern int strncmp(const char *cs, const char *ct, size_t count); +extern size_t strlen(const char *s); +extern char *strstr(const char *s1, const char *s2); +extern size_t strnlen(const char *s, size_t maxlen); +extern unsigned int atou(const char *s); +extern unsigned long long simple_strtoull(const char *cp, char **endp, + unsigned int base); + #endif /* BOOT_STRING_H */ From 02c5c03283c52157d336abf5e44ffcda10579fbf Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 27 Dec 2016 12:05:05 +0800 Subject: [PATCH 128/297] ASoC: rt5645: set sel_i2s_pre_div1 to 2 The i2s clock pre-divider 1 is used for both i2s1 and sysclk. The i2s1 is usually used for the main i2s and the pre-divider will be set in hw_params function. However, if i2s2 is used, the pre-divider is not set in the hw_params function and the default value of i2s clock pre-divider 1 is too high for sysclk and DMIC usage. Fix by overriding default divider value to 2. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=95681 Tested-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Signed-off-by: Mark Brown --- sound/soc/codecs/rt5645.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/codecs/rt5645.c b/sound/soc/codecs/rt5645.c index 10c2a564a715..1ac96ef9ee20 100644 --- a/sound/soc/codecs/rt5645.c +++ b/sound/soc/codecs/rt5645.c @@ -3833,6 +3833,9 @@ static int rt5645_i2c_probe(struct i2c_client *i2c, } } + regmap_update_bits(rt5645->regmap, RT5645_ADDA_CLK1, + RT5645_I2S_PD1_MASK, RT5645_I2S_PD1_2); + if (rt5645->pdata.jd_invert) { regmap_update_bits(rt5645->regmap, RT5645_IRQ_CTRL2, RT5645_JD_1_1_MASK, RT5645_JD_1_1_INV); From 4e2da44691cffbfffb1535f478d19bc2dca3e62b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Jan 2017 19:15:10 +0100 Subject: [PATCH 129/297] USB: serial: ch341: fix initial modem-control state DTR and RTS will be asserted by the tty-layer when the port is opened and deasserted on close (if HUPCL is set). Make sure the initial state is not-asserted before the port is first opened as well. Fixes: 664d5df92e88 ("USB: usb-serial ch341: support for DTR/RTS/CTS") Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 2597b83a8ae2..d133e72fe888 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -258,7 +258,6 @@ static int ch341_port_probe(struct usb_serial_port *port) spin_lock_init(&priv->lock); priv->baud_rate = DEFAULT_BAUD_RATE; - priv->line_control = CH341_BIT_RTS | CH341_BIT_DTR; r = ch341_configure(port->serial->dev, priv); if (r < 0) From a20047f36e2f6a1eea4f1fd261aaa55882369868 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Jan 2017 19:15:11 +0100 Subject: [PATCH 130/297] USB: serial: ch341: fix open and resume after B0 The private baud_rate variable is used to configure the port at open and reset-resume and must never be set to (and left at) zero or reset-resume and all further open attempts will fail. Fixes: aa91def41a7b ("USB: ch341: set tty baud speed according to tty struct") Fixes: 664d5df92e88 ("USB: usb-serial ch341: support for DTR/RTS/CTS") Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index d133e72fe888..6279df905c14 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -355,7 +355,6 @@ static void ch341_set_termios(struct tty_struct *tty, baud_rate = tty_get_baud_rate(tty); - priv->baud_rate = baud_rate; ctrl = CH341_LCR_ENABLE_RX | CH341_LCR_ENABLE_TX; switch (C_CSIZE(tty)) { @@ -388,6 +387,9 @@ static void ch341_set_termios(struct tty_struct *tty, spin_lock_irqsave(&priv->lock, flags); priv->line_control |= (CH341_BIT_DTR | CH341_BIT_RTS); spin_unlock_irqrestore(&priv->lock, flags); + + priv->baud_rate = baud_rate; + r = ch341_init_set_baudrate(port->serial->dev, priv, ctrl); if (r < 0 && old_termios) { priv->baud_rate = tty_termios_baud_rate(old_termios); From 030ee7ae52a46a2be52ccc8242c4a330aba8d38e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Jan 2017 19:15:12 +0100 Subject: [PATCH 131/297] USB: serial: ch341: fix modem-control and B0 handling The modem-control signals are managed by the tty-layer during open and should not be asserted prematurely when set_termios is called from driver open. Also make sure that the signals are asserted only when changing speed from B0. Fixes: 664d5df92e88 ("USB: usb-serial ch341: support for DTR/RTS/CTS") Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 6279df905c14..0cc5056b304d 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -384,10 +384,6 @@ static void ch341_set_termios(struct tty_struct *tty, ctrl |= CH341_LCR_STOP_BITS_2; if (baud_rate) { - spin_lock_irqsave(&priv->lock, flags); - priv->line_control |= (CH341_BIT_DTR | CH341_BIT_RTS); - spin_unlock_irqrestore(&priv->lock, flags); - priv->baud_rate = baud_rate; r = ch341_init_set_baudrate(port->serial->dev, priv, ctrl); @@ -395,14 +391,16 @@ static void ch341_set_termios(struct tty_struct *tty, priv->baud_rate = tty_termios_baud_rate(old_termios); tty_termios_copy_hw(&tty->termios, old_termios); } - } else { - spin_lock_irqsave(&priv->lock, flags); - priv->line_control &= ~(CH341_BIT_DTR | CH341_BIT_RTS); - spin_unlock_irqrestore(&priv->lock, flags); } - ch341_set_handshake(port->serial->dev, priv->line_control); + spin_lock_irqsave(&priv->lock, flags); + if (C_BAUD(tty) == B0) + priv->line_control &= ~(CH341_BIT_DTR | CH341_BIT_RTS); + else if (old_termios && (old_termios->c_cflag & CBAUD) == B0) + priv->line_control |= (CH341_BIT_DTR | CH341_BIT_RTS); + spin_unlock_irqrestore(&priv->lock, flags); + ch341_set_handshake(port->serial->dev, priv->line_control); } static void ch341_break_ctl(struct tty_struct *tty, int break_state) From f2950b78547ffb8475297ada6b92bc2d774d5461 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Jan 2017 19:15:13 +0100 Subject: [PATCH 132/297] USB: serial: ch341: fix open error handling Make sure to stop the interrupt URB before returning on errors during open. Fixes: 664d5df92e88 ("USB: usb-serial ch341: support for DTR/RTS/CTS") Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 0cc5056b304d..8f41d4385f1c 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -319,7 +319,7 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port) r = ch341_configure(serial->dev, priv); if (r) - goto out; + return r; if (tty) ch341_set_termios(tty, port, NULL); @@ -329,12 +329,19 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port) if (r) { dev_err(&port->dev, "%s - failed to submit interrupt urb: %d\n", __func__, r); - goto out; + return r; } r = usb_serial_generic_open(tty, port); + if (r) + goto err_kill_interrupt_urb; -out: return r; + return 0; + +err_kill_interrupt_urb: + usb_kill_urb(port->interrupt_in_urb); + + return r; } /* Old_termios contains the original termios settings and From ce5e292828117d1b71cbd3edf9e9137cf31acd30 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Jan 2017 19:15:14 +0100 Subject: [PATCH 133/297] USB: serial: ch341: fix resume after reset Fix reset-resume handling which failed to resubmit the read and interrupt URBs, thereby leaving a port that was open before suspend in a broken state until closed and reopened. Fixes: 1ded7ea47b88 ("USB: ch341 serial: fix port number changed after resume") Fixes: 2bfd1c96a9fb ("USB: serial: ch341: remove reset_resume callback") Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 8f41d4385f1c..5343d65f3b52 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -582,14 +582,23 @@ static int ch341_tiocmget(struct tty_struct *tty) static int ch341_reset_resume(struct usb_serial *serial) { - struct ch341_private *priv; - - priv = usb_get_serial_port_data(serial->port[0]); + struct usb_serial_port *port = serial->port[0]; + struct ch341_private *priv = usb_get_serial_port_data(port); + int ret; /* reconfigure ch341 serial port after bus-reset */ ch341_configure(serial->dev, priv); - return 0; + if (tty_port_initialized(&port->port)) { + ret = usb_submit_urb(port->interrupt_in_urb, GFP_NOIO); + if (ret) { + dev_err(&port->dev, "failed to submit interrupt urb: %d\n", + ret); + return ret; + } + } + + return usb_serial_generic_resume(serial); } static struct usb_serial_driver ch341_device = { From 3cca8624b6624e7ffb87dcd8a0a05bef9b50e97b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Jan 2017 19:15:15 +0100 Subject: [PATCH 134/297] USB: serial: ch341: fix line settings after reset-resume A recent change added support for modifying the default line-control settings, but did not make sure that the modified settings were used as part of reconfiguration after a device has been reset during resume. This caused a port that was open before suspend to be unusable until being closed and reopened. Fixes: ba781bdf8662 ("USB: serial: ch341: add support for parity, frame length, stop bits") Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 5343d65f3b52..eabdd05a2147 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -95,6 +95,7 @@ struct ch341_private { unsigned baud_rate; /* set baud rate */ u8 line_control; /* set line control value RTS/DTR */ u8 line_status; /* active status of modem control inputs */ + u8 lcr; }; static void ch341_set_termios(struct tty_struct *tty, @@ -232,7 +233,7 @@ static int ch341_configure(struct usb_device *dev, struct ch341_private *priv) if (r < 0) goto out; - r = ch341_init_set_baudrate(dev, priv, 0); + r = ch341_init_set_baudrate(dev, priv, priv->lcr); if (r < 0) goto out; @@ -397,6 +398,8 @@ static void ch341_set_termios(struct tty_struct *tty, if (r < 0 && old_termios) { priv->baud_rate = tty_termios_baud_rate(old_termios); tty_termios_copy_hw(&tty->termios, old_termios); + } else if (r == 0) { + priv->lcr = ctrl; } } From 55fa15b5987db22b4f35d3f0798928c126be5f1c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Jan 2017 19:15:16 +0100 Subject: [PATCH 135/297] USB: serial: ch341: fix baud rate and line-control handling Revert to using direct register writes to set the divisor and line-control registers. A recent change switched to using the init vendor command to update these registers, something which also enabled support for CH341A devices. It turns out that simply setting bit 7 in the divisor register is sufficient to support CH341A and specifically prevent data from being buffered until a full endpoint-size packet (32 bytes) has been received. Using the init command also had the side-effect of temporarily deasserting the DTR/RTS signals on every termios change (including initialisation on open) something which for example could cause problems in setups where DTR is used to trigger a reset. Fixes: 4e46c410e050 ("USB: serial: ch341: reinitialize chip on reconfiguration") Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index eabdd05a2147..8d7b0847109b 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -133,8 +133,8 @@ static int ch341_control_in(struct usb_device *dev, return r; } -static int ch341_init_set_baudrate(struct usb_device *dev, - struct ch341_private *priv, unsigned ctrl) +static int ch341_set_baudrate_lcr(struct usb_device *dev, + struct ch341_private *priv, u8 lcr) { short a; int r; @@ -157,9 +157,19 @@ static int ch341_init_set_baudrate(struct usb_device *dev, factor = 0x10000 - factor; a = (factor & 0xff00) | divisor; - /* 0x9c is "enable SFR_UART Control register and timer" */ - r = ch341_control_out(dev, CH341_REQ_SERIAL_INIT, - 0x9c | (ctrl << 8), a | 0x80); + /* + * CH341A buffers data until a full endpoint-size packet (32 bytes) + * has been received unless bit 7 is set. + */ + a |= BIT(7); + + r = ch341_control_out(dev, CH341_REQ_WRITE_REG, 0x1312, a); + if (r) + return r; + + r = ch341_control_out(dev, CH341_REQ_WRITE_REG, 0x2518, lcr); + if (r) + return r; return r; } @@ -233,7 +243,7 @@ static int ch341_configure(struct usb_device *dev, struct ch341_private *priv) if (r < 0) goto out; - r = ch341_init_set_baudrate(dev, priv, priv->lcr); + r = ch341_set_baudrate_lcr(dev, priv, priv->lcr); if (r < 0) goto out; @@ -394,7 +404,7 @@ static void ch341_set_termios(struct tty_struct *tty, if (baud_rate) { priv->baud_rate = baud_rate; - r = ch341_init_set_baudrate(port->serial->dev, priv, ctrl); + r = ch341_set_baudrate_lcr(port->serial->dev, priv, ctrl); if (r < 0 && old_termios) { priv->baud_rate = tty_termios_baud_rate(old_termios); tty_termios_copy_hw(&tty->termios, old_termios); From 811a919135b980bac8009d042acdccf10dc1ef5e Mon Sep 17 00:00:00 2001 From: Zefir Kurtisi Date: Fri, 6 Jan 2017 12:14:48 +0100 Subject: [PATCH 136/297] phy state machine: failsafe leave invalid RUNNING state While in RUNNING state, phy_state_machine() checks for link changes by comparing phydev->link before and after calling phy_read_status(). This works as long as it is guaranteed that phydev->link is never changed outside the phy_state_machine(). If in some setups this happens, it causes the state machine to miss a link loss and remain RUNNING despite phydev->link being 0. This has been observed running a dsa setup with a process continuously polling the link states over ethtool each second (SNMPD RFC-1213 agent). Disconnecting the link on a phy followed by a ETHTOOL_GSET causes dsa_slave_get_settings() / dsa_slave_get_link_ksettings() to call phy_read_status() and with that modify the link status - and with that bricking the phy state machine. This patch adds a fail-safe check while in RUNNING, which causes to move to CHANGELINK when the link is gone and we are still RUNNING. Signed-off-by: Zefir Kurtisi Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 25f93a98863b..48da6e93c3f7 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1065,6 +1065,15 @@ void phy_state_machine(struct work_struct *work) if (old_link != phydev->link) phydev->state = PHY_CHANGELINK; } + /* + * Failsafe: check that nobody set phydev->link=0 between two + * poll cycles, otherwise we won't leave RUNNING state as long + * as link remains down. + */ + if (!phydev->link && phydev->state == PHY_RUNNING) { + phydev->state = PHY_CHANGELINK; + phydev_err(phydev, "no link in PHY_RUNNING\n"); + } break; case PHY_CHANGELINK: err = phy_read_status(phydev); From 67c408cfa8862fe7e45b3a1f762f7140e03b7217 Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Sat, 7 Jan 2017 23:53:00 +0100 Subject: [PATCH 137/297] ipv6: fix typos o s/approriate/appropriate o s/discouvery/discovery Signed-off-by: Alexander Alemayhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8417c41d8ec8..ce5aaf448c54 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1464,7 +1464,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_node *fn; /* Get the "current" route for this destination and - * check if the redirect has come from approriate router. + * check if the redirect has come from appropriate router. * * RFC 4861 specifies that redirects should only be * accepted if they come from the nexthop to the target. @@ -2768,7 +2768,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) old MTU is the lowest MTU in the path, update the route PMTU to reflect the increase. In this case if the other nodes' MTU also have the lowest MTU, TOO BIG MESSAGE will be lead to - PMTU discouvery. + PMTU discovery. */ if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && From b007f09072ca8afa118ade333e717ba443e8d807 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 9 Jan 2017 10:45:49 +0300 Subject: [PATCH 138/297] ipv4: make tcp_notsent_lowat sysctl knob behave as true unsigned int > cat /proc/sys/net/ipv4/tcp_notsent_lowat -1 > echo 4294967295 > /proc/sys/net/ipv4/tcp_notsent_lowat -bash: echo: write error: Invalid argument > echo -2147483648 > /proc/sys/net/ipv4/tcp_notsent_lowat > cat /proc/sys/net/ipv4/tcp_notsent_lowat -2147483648 but in documentation we have "tcp_notsent_lowat - UNSIGNED INTEGER" v2: simplify to just proc_douintvec Signed-off-by: Pavel Tikhomirov Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 22cbd61079b5..b2fa498b15d1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -951,7 +951,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec, }, { .procname = "tcp_tw_reuse", From ce7e40c432ba84da104438f6799d460a4cad41bc Mon Sep 17 00:00:00 2001 From: Vlad Tsyrklevich Date: Mon, 9 Jan 2017 20:57:48 +0700 Subject: [PATCH 139/297] net/appletalk: Fix kernel memory disclosure ipddp_route structs contain alignment padding so kernel heap memory is leaked when they are copied to user space in ipddp_ioctl(SIOCFINDIPDDPRT). Change kmalloc() to kzalloc() to clear that memory. Signed-off-by: Vlad Tsyrklevich Signed-off-by: David S. Miller --- drivers/net/appletalk/ipddp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index b8c293373ecc..a306de4318d7 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -190,7 +190,7 @@ static netdev_tx_t ipddp_xmit(struct sk_buff *skb, struct net_device *dev) */ static int ipddp_create(struct ipddp_route *new_rt) { - struct ipddp_route *rt = kmalloc(sizeof(*rt), GFP_KERNEL); + struct ipddp_route *rt = kzalloc(sizeof(*rt), GFP_KERNEL); if (rt == NULL) return -ENOMEM; From 2ebae8bd60188f57e26e95e7fde6b8943297d348 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 9 Jan 2017 15:17:27 +0100 Subject: [PATCH 140/297] net: phy: Add Meson GXL PHY hardware dependency As I understand it the Meson GXL PHY driver is only useful on one architecture so only make it visible on that architecture. Signed-off-by: Jean Delvare Fixes: 7334b3e47aee ("net: phy: Add Meson GXL Internal PHY driver") Cc: Neil Armstrong Cc: Florian Fainelli Cc: Andrew Lunn Cc: David S. Miller Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index d361835b315d..8dbd59baa34d 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -279,6 +279,7 @@ config MARVELL_PHY config MESON_GXL_PHY tristate "Amlogic Meson GXL Internal PHY" + depends on ARCH_MESON || COMPILE_TEST ---help--- Currently has a driver for the Amlogic Meson GXL Internal PHY From 6bb629db5e7daa619f5242b6ad93e4dd9bf7432c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Jan 2017 08:51:32 -0800 Subject: [PATCH 141/297] tcp: do not export tcp_peer_is_proven() After commit 1fb6f159fd21 ("tcp: add tcp_conn_request"), tcp_peer_is_proven() no longer needs to be exported. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d46f4d5b1c62..ba8f02d0f283 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -606,7 +606,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, return ret; } -EXPORT_SYMBOL_GPL(tcp_peer_is_proven); void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) { From 5149fd327f16e393c1d04fa5325ab072c32472bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 13:36:30 -0800 Subject: [PATCH 142/297] xfs: bump up reserved blocks in xfs_alloc_set_aside Setting aside 4 blocks globally for bmbt splits isn't all that useful, as different threads can allocate space in parallel. Bump it to 4 blocks per AG to allow each thread that is currently doing an allocation to dip into it separately. Without that we may no have enough reserved blocks if there are enough parallel transactions in an almost out space file system that all run into bmap btree splits. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 5050056a0b06..0a46f8488b8d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -95,10 +95,7 @@ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - unsigned int blocks; - - blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - return blocks; + return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); } /* From 255c516278175a6dc7037d1406307f35237d8688 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 13:36:19 -0800 Subject: [PATCH 143/297] xfs: fix bogus minleft manipulations We can't just set minleft to 0 when we're low on space - that's exactly what we need minleft for: to protect space in the AG for btree block allocations when we are low on free space. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 24 +++++++----------------- fs/xfs/libxfs/xfs_bmap.c | 3 --- fs/xfs/libxfs/xfs_bmap_btree.c | 3 +-- 3 files changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 0a46f8488b8d..fe925702c955 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2635,12 +2635,10 @@ xfs_alloc_vextent( xfs_agblock_t agsize; /* allocation group size */ int error; int flags; /* XFS_ALLOC_FLAG_... locking flags */ - xfs_extlen_t minleft;/* minimum left value, temp copy */ xfs_mount_t *mp; /* mount structure pointer */ xfs_agnumber_t sagno; /* starting allocation group number */ xfs_alloctype_t type; /* input allocation type */ int bump_rotor = 0; - int no_min = 0; xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ mp = args->mp; @@ -2669,7 +2667,6 @@ xfs_alloc_vextent( trace_xfs_alloc_vextent_badargs(args); return 0; } - minleft = args->minleft; switch (type) { case XFS_ALLOCTYPE_THIS_AG: @@ -2680,9 +2677,7 @@ xfs_alloc_vextent( */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); args->pag = xfs_perag_get(mp, args->agno); - args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2747,9 +2742,7 @@ xfs_alloc_vextent( */ for (;;) { args->pag = xfs_perag_get(mp, args->agno); - if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2789,20 +2782,17 @@ xfs_alloc_vextent( * or switch to non-trylock mode. */ if (args->agno == sagno) { - if (no_min == 1) { + if (flags == 0) { args->agbno = NULLAGBLOCK; trace_xfs_alloc_vextent_allfailed(args); break; } - if (flags == 0) { - no_min = 1; - } else { - flags = 0; - if (type == XFS_ALLOCTYPE_START_BNO) { - args->agbno = XFS_FSB_TO_AGBNO(mp, - args->fsbno); - args->type = XFS_ALLOCTYPE_NEAR_BNO; - } + + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; } } xfs_perag_put(args->pag); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2760bc3b2536..44773c9eb957 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3812,7 +3812,6 @@ xfs_bmap_btalloc( args.fsbno = 0; args.type = XFS_ALLOCTYPE_FIRST_AG; args.total = ap->minlen; - args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; ap->dfops->dop_low = true; @@ -4344,8 +4343,6 @@ xfs_bmapi_allocate( if (error) return error; - if (bma->dfops->dop_low) - bma->minleft = 0; if (bma->cur) bma->cur->bc_private.b.firstblock = *bma->firstblock; if (bma->blkno == NULLFSBLOCK) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index d6330c297ca0..d9be241fc86f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -502,12 +502,11 @@ try_another_ag: if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy - * a full btree split. Try again without minleft and if + * a full btree split. Try again and if * successful activate the lowspace algorithm. */ args.fsbno = 0; args.type = XFS_ALLOCTYPE_FIRST_AG; - args.minleft = 0; error = xfs_alloc_vextent(&args); if (error) goto error0; From 54fee133ad59c87ab01dd84ab3e9397134b32acb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 13:44:30 -0800 Subject: [PATCH 144/297] xfs: adjust allocation length in xfs_alloc_space_available We must decide in xfs_alloc_fix_freelist if we can perform an allocation from a given AG is possible or not based on the available space, and should not fail the allocation past that point on a healthy file system. But currently we have two additional places that second-guess xfs_alloc_fix_freelist: xfs_alloc_ag_vextent tries to adjust the maxlen parameter to remove the reservation before doing the allocation (but ignores the various minium freespace requirements), and xfs_alloc_fix_minleft tries to fix up the allocated length after we've found an extent, but ignores the reservations and also doesn't take the AGFL into account (and thus fails allocations for not matching minlen in some cases). Remove all these later fixups and just correct the maxlen argument inside xfs_alloc_fix_freelist once we have the AGF buffer locked. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 81 ++++++++------------------------------- fs/xfs/libxfs/xfs_alloc.h | 2 +- 2 files changed, 18 insertions(+), 65 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index fe925702c955..f2e7eb6e5243 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -362,35 +362,11 @@ xfs_alloc_fix_len( return; ASSERT(rlen >= args->minlen && rlen <= args->maxlen); ASSERT(rlen % args->prod == args->mod); + ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= + rlen + args->minleft); args->len = rlen; } -/* - * Fix up length if there is too little space left in the a.g. - * Return 1 if ok, 0 if too little, should give up. - */ -STATIC int -xfs_alloc_fix_minleft( - xfs_alloc_arg_t *args) /* allocation argument structure */ -{ - xfs_agf_t *agf; /* a.g. freelist header */ - int diff; /* free space difference */ - - if (args->minleft == 0) - return 1; - agf = XFS_BUF_TO_AGF(args->agbp); - diff = be32_to_cpu(agf->agf_freeblks) - - args->len - args->minleft; - if (diff >= 0) - return 1; - args->len += diff; /* shrink the allocated space */ - /* casts to (int) catch length underflows */ - if ((int)args->len >= (int)args->minlen) - return 1; - args->agbno = NULLAGBLOCK; - return 0; -} - /* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the @@ -686,8 +662,6 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; - xfs_extlen_t reservation; - xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); @@ -695,20 +669,6 @@ xfs_alloc_ag_vextent( ASSERT(args->mod < args->prod); ASSERT(args->alignment > 0); - /* - * Clamp maxlen to the amount of free space minus any reservations - * that have been made. - */ - oldmax = args->maxlen; - reservation = xfs_ag_resv_needed(args->pag, args->resv); - if (args->maxlen > args->pag->pagf_freeblks - reservation) - args->maxlen = args->pag->pagf_freeblks - reservation; - if (args->maxlen == 0) { - args->agbno = NULLAGBLOCK; - args->maxlen = oldmax; - return 0; - } - /* * Branch to correct routine based on the type. */ @@ -728,8 +688,6 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } - args->maxlen = oldmax; - if (error || args->agbno == NULLAGBLOCK) return error; @@ -838,9 +796,6 @@ xfs_alloc_ag_vextent_exact( args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto not_found; - ASSERT(args->agbno + args->len <= tend); /* @@ -1146,12 +1101,7 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_near_nominleft(args); - return 0; - } - blen = args->len; + /* * We are allocating starting at bnew for blen blocks. */ @@ -1343,12 +1293,6 @@ restart: */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - trace_xfs_alloc_near_nominleft(args); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - return 0; - } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, args->datatype, ltbnoa, ltlena, <new); @@ -1550,8 +1494,6 @@ restart: } xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto out_nominleft; rlen = args->len; XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* @@ -2070,10 +2012,20 @@ xfs_alloc_space_available( /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - reservation - min_free - args->total); - if (available < (int)args->minleft || available <= 0) + reservation - min_free - args->minleft); + if (available < (int)args->total) return false; + /* + * Clamp maxlen to the amount of free space available for the actual + * extent allocation. + */ + if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { + args->maxlen = available; + ASSERT(args->maxlen > 0); + ASSERT(args->maxlen >= args->minlen); + } + return true; } @@ -2119,7 +2071,8 @@ xfs_alloc_fix_freelist( } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, flags | + XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7c404a6b0ae3..1d0f48a501a3 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -56,7 +56,7 @@ typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ #define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ #define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ - +#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ /* * Argument structure for xfs_alloc routines. From 12ef830198b0d71668eb9b59f9ba69d32951a48a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 13:39:35 -0800 Subject: [PATCH 145/297] xfs: don't rely on ->total in xfs_alloc_space_available ->total is a bit of an odd parameter passed down to the low-level allocator all the way from the high-level callers. It's supposed to contain the maximum number of blocks to be allocated for the whole transaction [1]. But in xfs_iomap_write_allocate we only convert existing delayed allocations and thus only have a minimal block reservation for the current transaction, so xfs_alloc_space_available can't use it for the allocation decisions. Use the maximum of args->total and the calculated block requirement to make a decision. We probably should get rid of args->total eventually and instead apply ->minleft more broadly, but that will require some extensive changes all over. [1] which creates lots of confusion as most callers don't decrement it once doing a first allocation. But that's for a separate series. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f2e7eb6e5243..9f06a211e157 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1995,7 +1995,7 @@ xfs_alloc_space_available( int flags) { struct xfs_perag *pag = args->pag; - xfs_extlen_t longest; + xfs_extlen_t alloc_len, longest; xfs_extlen_t reservation; /* blocks that are still reserved */ int available; @@ -2005,15 +2005,16 @@ xfs_alloc_space_available( reservation = xfs_ag_resv_needed(pag, args->resv); /* do we have enough contiguous free space for the allocation? */ + alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, reservation); - if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + if (longest < alloc_len) return false; /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - reservation - min_free - args->minleft); - if (available < (int)args->total) + if (available < (int)max(args->total, alloc_len)) return false; /* From 84a4620cfe97c9d57e39b2369bfb77faff55063d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 13:41:33 -0800 Subject: [PATCH 146/297] xfs: don't print warnings when xfs_log_force fails There are only two reasons for xfs_log_force / xfs_log_force_lsn to fail: one is an I/O error, for which xlog_bdstrat already logs a warning, and the second is an already shutdown log due to a previous I/O errors. In the latter case we'll already have a previous indication for the actual error, but the large stream of misleading warnings from xfs_log_force will probably scroll it out of the message buffer. Simply removing the warnings thus makes the XFS log reporting significantly better. Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c39ac14ff540..b1469f0a91a6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3317,12 +3317,8 @@ xfs_log_force( xfs_mount_t *mp, uint flags) { - int error; - trace_xfs_log_force(mp, 0, _RET_IP_); - error = _xfs_log_force(mp, flags, NULL); - if (error) - xfs_warn(mp, "%s: error %d returned.", __func__, error); + _xfs_log_force(mp, flags, NULL); } /* @@ -3466,12 +3462,8 @@ xfs_log_force_lsn( xfs_lsn_t lsn, uint flags) { - int error; - trace_xfs_log_force(mp, lsn, _RET_IP_); - error = _xfs_log_force_lsn(mp, lsn, flags, NULL); - if (error) - xfs_warn(mp, "%s: error %d returned.", __func__, error); + _xfs_log_force_lsn(mp, lsn, flags, NULL); } /* From 32cd7cbbacf700885a2316275f188f2d5739b5f4 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Fri, 6 Jan 2017 19:31:35 -0500 Subject: [PATCH 147/297] md/raid5: Use correct IS_ERR() variation on pointer check This fixes a build error on certain architectures, such as ppc64. Fixes: 6995f0b247e("md: takeover should clear unrelated bits") Signed-off-by: Jes Sorensen Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7b1da6e95a56..36c13e4be9c9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7831,7 +7831,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev) mddev->new_chunk_sectors = chunksect; ret = setup_conf(mddev); - if (!IS_ERR_VALUE(ret)) + if (!IS_ERR(ret)) mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); return ret; From 5dedade6dfa243c130b85d1e4daba6f027805033 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 9 Jan 2017 12:41:43 +0100 Subject: [PATCH 148/297] x86/CPU: Add native CPUID variants returning a single datum ... similarly to the cpuid_() variants. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170109114147.5082-2-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/processor.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eaf100508c36..1be64da0384e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -219,6 +219,24 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, : "memory"); } +#define native_cpuid_reg(reg) \ +static inline unsigned int native_cpuid_##reg(unsigned int op) \ +{ \ + unsigned int eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum. + */ +native_cpuid_reg(eax) +native_cpuid_reg(ebx) +native_cpuid_reg(ecx) +native_cpuid_reg(edx) + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__pa(pgdir)); From f3e2a51f568d9f33370f4e8bb05669a34223241a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 9 Jan 2017 12:41:44 +0100 Subject: [PATCH 149/297] x86/microcode: Use native CPUID to tickle out microcode revision Intel supplies the microcode revision value in MSR 0x8b (IA32_BIOS_SIGN_ID) after CPUID(1) has been executed. Execute it each time before reading that MSR. It used to do sync_core() which did do CPUID but c198b121b1a1 ("x86/asm: Rewrite sync_core() to use IRET-to-self") changed the sync_core() implementation so we better make the microcode loading case explicit, as the SDM documents it. Reported-and-tested-by: Jun'ichi Nomura Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170109114147.5082-3-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/microcode/intel.c | 26 +++----------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fcd484d2bb03..2d49aa949fa1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -83,7 +83,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* Required by the SDM */ - sync_core(); + native_cpuid_eax(1); rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index b624b54912e1..f79249fab389 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -368,26 +368,6 @@ next: return patch; } -static void cpuid_1(void) -{ - /* - * According to the Intel SDM, Volume 3, 9.11.7: - * - * CPUID returns a value in a model specific register in - * addition to its usual register return values. The - * semantics of CPUID cause it to deposit an update ID value - * in the 64-bit model-specific register at address 08BH - * (IA32_BIOS_SIGN_ID). If no update is present in the - * processor, the value in the MSR remains unmodified. - * - * Use native_cpuid -- this code runs very early and we don't - * want to mess with paravirt. - */ - unsigned int eax = 1, ebx, ecx = 0, edx; - - native_cpuid(&eax, &ebx, &ecx, &edx); -} - static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; @@ -413,7 +393,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ - cpuid_1(); + native_cpuid_eax(1); /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); @@ -613,7 +593,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ - cpuid_1(); + native_cpuid_eax(1); /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); @@ -825,7 +805,7 @@ static int apply_microcode_intel(int cpu) wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ - cpuid_1(); + native_cpuid_eax(1); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); From 4167709bbf826512a52ebd6aafda2be104adaec9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 9 Jan 2017 12:41:45 +0100 Subject: [PATCH 150/297] x86/microcode/intel: Add a helper which gives the microcode revision Since on Intel we're required to do CPUID(1) first, before reading the microcode revision MSR, let's add a special helper which does the required steps so that we don't forget to do them next time, when we want to read the microcode revision. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170109114147.5082-4-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/microcode_intel.h | 15 +++++++++ arch/x86/kernel/cpu/intel.c | 11 ++----- arch/x86/kernel/cpu/microcode/intel.c | 43 ++++++++------------------ 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 195becc6f780..e793fc9a9b20 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -52,6 +52,21 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) +static inline u32 intel_get_microcode_revision(void) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_IA32_UCODE_REV, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + native_cpuid_eax(1); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); + + return rev; +} + #ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 2d49aa949fa1..203f860d2ab3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -78,14 +79,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { - unsigned lower_word; - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* Required by the SDM */ - native_cpuid_eax(1); - rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); - } + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) + c->microcode = intel_get_microcode_revision(); /* * Atom erratum AAE44/AAF40/AAG38/AAH41: diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f79249fab389..faec8fa68ffd 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -390,15 +390,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - /* As documented in the SDM: Do a CPUID 1 here */ - native_cpuid_eax(1); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - csig.rev = val[1]; + csig.rev = intel_get_microcode_revision(); uci->cpu_sig = csig; uci->valid = 1; @@ -582,7 +575,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci) static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc; - unsigned int val[2]; + u32 rev; mc = uci->mc; if (!mc) @@ -590,21 +583,16 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - /* As documented in the SDM: Do a CPUID 1 here */ - native_cpuid_eax(1); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc->hdr.rev) + rev = intel_get_microcode_revision(); + if (rev != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 /* Flush global tlb. This is precaution. */ flush_tlb_early(); #endif - uci->cpu_sig.rev = val[1]; + uci->cpu_sig.rev = rev; if (early) print_ucode(uci); @@ -784,8 +772,8 @@ static int apply_microcode_intel(int cpu) struct microcode_intel *mc; struct ucode_cpu_info *uci; struct cpuinfo_x86 *c; - unsigned int val[2]; static int prev_rev; + u32 rev; /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) @@ -802,33 +790,28 @@ static int apply_microcode_intel(int cpu) /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); - wrmsrl(MSR_IA32_UCODE_REV, 0); - /* As documented in the SDM: Do a CPUID 1 here */ - native_cpuid_eax(1); + rev = intel_get_microcode_revision(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - if (val[1] != mc->hdr.rev) { + if (rev != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", cpu, mc->hdr.rev); return -1; } - if (val[1] != prev_rev) { + if (rev != prev_rev) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", - val[1], + rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24, (mc->hdr.date >> 16) & 0xff); - prev_rev = val[1]; + prev_rev = rev; } c = &cpu_data(cpu); - uci->cpu_sig.rev = val[1]; - c->microcode = val[1]; + uci->cpu_sig.rev = rev; + c->microcode = rev; return 0; } From 9fcf5ba2ef908af916e9002891fbbca20ce4dc98 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Mon, 9 Jan 2017 12:41:46 +0100 Subject: [PATCH 151/297] x86/microcode/intel: Fix allocation size of struct ucode_patch We allocate struct ucode_patch here. @size is the size of microcode data and used for kmemdup() later in this function. Fixes: 06b8534cb728 ("x86/microcode: Rework microcode loading") Signed-off-by: Jun'ichi Nomura Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/7a730dc9-ac17-35c4-fe76-dfc94e5ecd95@ce.jp.nec.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index faec8fa68ffd..943486589757 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -150,7 +150,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) { struct ucode_patch *p; - p = kzalloc(size, GFP_KERNEL); + p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); From 2e86222c67bb5d942da68e8415749b32db208534 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Mon, 9 Jan 2017 12:41:47 +0100 Subject: [PATCH 152/297] x86/microcode/intel: Use correct buffer size for saving microcode data In generic_load_microcode(), curr_mc_size is the size of the last allocated buffer and since we have this performance "optimization" there to vmalloc a new buffer only when the current one is bigger, curr_mc_size ends up becoming the size of the biggest buffer we've seen so far. However, we end up saving the microcode patch which matches our CPU and its size is not curr_mc_size but the respective mc_size during the iteration while we're staring at it. So save that mc_size into a separate variable and use it to store the previously found microcode buffer. Without this fix, we could get oops like this: BUG: unable to handle kernel paging request at ffffc9000e30f000 IP: __memcpy+0x12/0x20 ... Call Trace: ? kmemdup+0x43/0x60 __alloc_microcode_buf+0x44/0x70 save_microcode_patch+0xd4/0x150 generic_load_microcode+0x1b8/0x260 request_microcode_user+0x15/0x20 microcode_write+0x91/0x100 __vfs_write+0x34/0x120 vfs_write+0xc1/0x130 SyS_write+0x56/0xc0 do_syscall_64+0x6c/0x160 entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 06b8534cb728 ("x86/microcode: Rework microcode loading") Signed-off-by: Jun'ichi Nomura Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/4f33cbfd-44f2-9bed-3b66-7446cd14256f@ce.jp.nec.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 943486589757..3f329b74e040 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -823,7 +823,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; - unsigned int curr_mc_size = 0; + unsigned int curr_mc_size = 0, new_mc_size = 0; unsigned int csig, cpf; while (leftover) { @@ -864,6 +864,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; + new_mc_size = mc_size; mc = NULL; /* trigger new vmalloc */ } @@ -889,7 +890,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. */ - save_mc_for_early(new_mc, curr_mc_size); + save_mc_for_early(new_mc, new_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); From 3895dbf8985f656675b5bde610723a29cbce3fa7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 3 Jan 2017 14:18:43 +1300 Subject: [PATCH 153/297] mnt: Protect the mountpoint hashtable with mount_lock Protecting the mountpoint hashtable with namespace_sem was sufficient until a call to umount_mnt was added to mntput_no_expire. At which point it became possible for multiple calls of put_mountpoint on the same hash chain to happen on the same time. Kristen Johansen reported: > This can cause a panic when simultaneous callers of put_mountpoint > attempt to free the same mountpoint. This occurs because some callers > hold the mount_hash_lock, while others hold the namespace lock. Some > even hold both. > > In this submitter's case, the panic manifested itself as a GP fault in > put_mountpoint() when it called hlist_del() and attempted to dereference > a m_hash.pprev that had been poisioned by another thread. Al Viro observed that the simple fix is to switch from using the namespace_sem to the mount_lock to protect the mountpoint hash table. I have taken Al's suggested patch moved put_mountpoint in pivot_root (instead of taking mount_lock an additional time), and have replaced new_mountpoint with get_mountpoint a function that does the hash table lookup and addition under the mount_lock. The introduction of get_mounptoint ensures that only the mount_lock is needed to manipulate the mountpoint hashtable. d_set_mounted is modified to only set DCACHE_MOUNTED if it is not already set. This allows get_mountpoint to use the setting of DCACHE_MOUNTED to ensure adding a struct mountpoint for a dentry happens exactly once. Cc: stable@vger.kernel.org Fixes: ce07d891a089 ("mnt: Honor MNT_LOCKED when detaching mounts") Reported-by: Krister Johansen Suggested-by: Al Viro Acked-by: Al Viro Signed-off-by: "Eric W. Biederman" --- fs/dcache.c | 7 ++++-- fs/namespace.c | 68 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 769903dbc19d..95d71eda8142 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1336,8 +1336,11 @@ int d_set_mounted(struct dentry *dentry) } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { - dentry->d_flags |= DCACHE_MOUNTED; - ret = 0; + ret = -EBUSY; + if (!d_mountpoint(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } } spin_unlock(&dentry->d_lock); out: diff --git a/fs/namespace.c b/fs/namespace.c index b5b1259e064f..487ba30bb5c6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -742,26 +742,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry) return NULL; } -static struct mountpoint *new_mountpoint(struct dentry *dentry) +static struct mountpoint *get_mountpoint(struct dentry *dentry) { - struct hlist_head *chain = mp_hash(dentry); - struct mountpoint *mp; + struct mountpoint *mp, *new = NULL; int ret; - mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!mp) - return ERR_PTR(-ENOMEM); - - ret = d_set_mounted(dentry); - if (ret) { - kfree(mp); - return ERR_PTR(ret); + if (d_mountpoint(dentry)) { +mountpoint: + read_seqlock_excl(&mount_lock); + mp = lookup_mountpoint(dentry); + read_sequnlock_excl(&mount_lock); + if (mp) + goto done; } - mp->m_dentry = dentry; - mp->m_count = 1; - hlist_add_head(&mp->m_hash, chain); - INIT_HLIST_HEAD(&mp->m_list); + if (!new) + new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + + /* Exactly one processes may set d_mounted */ + ret = d_set_mounted(dentry); + + /* Someone else set d_mounted? */ + if (ret == -EBUSY) + goto mountpoint; + + /* The dentry is not available as a mountpoint? */ + mp = ERR_PTR(ret); + if (ret) + goto done; + + /* Add the new mountpoint to the hash table */ + read_seqlock_excl(&mount_lock); + new->m_dentry = dentry; + new->m_count = 1; + hlist_add_head(&new->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&new->m_list); + read_sequnlock_excl(&mount_lock); + + mp = new; + new = NULL; +done: + kfree(new); return mp; } @@ -1595,11 +1619,11 @@ void __detach_mounts(struct dentry *dentry) struct mount *mnt; namespace_lock(); + lock_mount_hash(); mp = lookup_mountpoint(dentry); if (IS_ERR_OR_NULL(mp)) goto out_unlock; - lock_mount_hash(); event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); @@ -1609,9 +1633,9 @@ void __detach_mounts(struct dentry *dentry) } else umount_tree(mnt, UMOUNT_CONNECTED); } - unlock_mount_hash(); put_mountpoint(mp); out_unlock: + unlock_mount_hash(); namespace_unlock(); } @@ -2038,9 +2062,7 @@ retry: namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = lookup_mountpoint(dentry); - if (!mp) - mp = new_mountpoint(dentry); + struct mountpoint *mp = get_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); inode_unlock(dentry->d_inode); @@ -2059,7 +2081,11 @@ retry: static void unlock_mount(struct mountpoint *where) { struct dentry *dentry = where->m_dentry; + + read_seqlock_excl(&mount_lock); put_mountpoint(where); + read_sequnlock_excl(&mount_lock); + namespace_unlock(); inode_unlock(dentry->d_inode); } @@ -3135,9 +3161,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); + put_mountpoint(root_mp); unlock_mount_hash(); chroot_fs_refs(&root, &new); - put_mountpoint(root_mp); error = 0; out4: unlock_mount(old_mp); From 75422726b0f717d67db3283c2eb5bc14fa2619c5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Jan 2017 17:37:27 +1300 Subject: [PATCH 154/297] libfs: Modify mount_pseudo_xattr to be clear it is not a userspace mount Add MS_KERNMOUNT to the flags that are passed. Use sget_userns and force &init_user_ns instead of calling sget so that even if called from a weird context the internal filesystem will be considered to be in the intial user namespace. Luis Ressel reported that the the failure to pass MS_KERNMOUNT into mount_pseudo broke his in development graphics driver that uses the generic drm infrastructure. I am not certain the deriver was bug free in it's usage of that infrastructure but since mount_pseudo_xattr can never be triggered by userspace it is clearer and less error prone, and less problematic for the code to be explicit. Reported-by: Luis Ressel Tested-by: Luis Ressel Acked-by: Al Viro Signed-off-by: "Eric W. Biederman" --- fs/libfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/libfs.c b/fs/libfs.c index e973cd51f126..28d6f35feed6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -245,7 +245,8 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); - s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); + s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER, + &init_user_ns, NULL); if (IS_ERR(s)) return ERR_CAST(s); From add7c65ca426b7a37184dd3d2172394e23d585d6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 4 Jan 2017 19:28:14 -0800 Subject: [PATCH 155/297] pid: fix lockdep deadlock warning due to ucount_lock ========================================================= [ INFO: possible irq lock inversion dependency detected ] 4.10.0-rc2-00024-g4aecec9-dirty #118 Tainted: G W --------------------------------------------------------- swapper/1/0 just changed the state of lock: (&(&sighand->siglock)->rlock){-.....}, at: [] __lock_task_sighand+0xb6/0x2c0 but this lock took another, HARDIRQ-unsafe lock in the past: (ucounts_lock){+.+...} and interrupts could create inverse lock ordering between them. other info that might help us debug this: Chain exists of: &(&sighand->siglock)->rlock --> &(&tty->ctrl_lock)->rlock --> ucounts_lock Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(ucounts_lock); local_irq_disable(); lock(&(&sighand->siglock)->rlock); lock(&(&tty->ctrl_lock)->rlock); lock(&(&sighand->siglock)->rlock); *** DEADLOCK *** This patch removes a dependency between rlock and ucount_lock. Fixes: f333c700c610 ("pidns: Add a limit on the number of pid namespaces") Cc: stable@vger.kernel.org Signed-off-by: Andrei Vagin Acked-by: Al Viro Signed-off-by: Eric W. Biederman --- kernel/pid_namespace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index df9e8e9e0be7..eef2ce968636 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -151,8 +151,12 @@ out: static void delayed_free_pidns(struct rcu_head *p) { - kmem_cache_free(pid_ns_cachep, - container_of(p, struct pid_namespace, rcu)); + struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu); + + dec_pid_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + + kmem_cache_free(pid_ns_cachep, ns); } static void destroy_pid_namespace(struct pid_namespace *ns) @@ -162,8 +166,6 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); - dec_pid_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); call_rcu(&ns->rcu, delayed_free_pidns); } From 93362fa47fe98b62e4a34ab408c4a418432e7939 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Fri, 6 Jan 2017 09:32:32 +0800 Subject: [PATCH 156/297] sysctl: Drop reference added by grab_header in proc_sys_readdir Fixes CVE-2016-9191, proc_sys_readdir doesn't drop reference added by grab_header when return from !dir_emit_dots path. It can cause any path called unregister_sysctl_table will wait forever. The calltrace of CVE-2016-9191: [ 5535.960522] Call Trace: [ 5535.963265] [] schedule+0x3f/0xa0 [ 5535.968817] [] schedule_timeout+0x3db/0x6f0 [ 5535.975346] [] ? wait_for_completion+0x45/0x130 [ 5535.982256] [] wait_for_completion+0xc3/0x130 [ 5535.988972] [] ? wake_up_q+0x80/0x80 [ 5535.994804] [] drop_sysctl_table+0xc4/0xe0 [ 5536.001227] [] drop_sysctl_table+0x77/0xe0 [ 5536.007648] [] unregister_sysctl_table+0x4d/0xa0 [ 5536.014654] [] unregister_sysctl_table+0x7f/0xa0 [ 5536.021657] [] unregister_sched_domain_sysctl+0x15/0x40 [ 5536.029344] [] partition_sched_domains+0x44/0x450 [ 5536.036447] [] ? __mutex_unlock_slowpath+0x111/0x1f0 [ 5536.043844] [] rebuild_sched_domains_locked+0x64/0xb0 [ 5536.051336] [] update_flag+0x11d/0x210 [ 5536.057373] [] ? mutex_lock_nested+0x2df/0x450 [ 5536.064186] [] ? cpuset_css_offline+0x1b/0x60 [ 5536.070899] [] ? trace_hardirqs_on+0xd/0x10 [ 5536.077420] [] ? mutex_lock_nested+0x2df/0x450 [ 5536.084234] [] ? css_killed_work_fn+0x25/0x220 [ 5536.091049] [] cpuset_css_offline+0x35/0x60 [ 5536.097571] [] css_killed_work_fn+0x5c/0x220 [ 5536.104207] [] process_one_work+0x1df/0x710 [ 5536.110736] [] ? process_one_work+0x160/0x710 [ 5536.117461] [] worker_thread+0x12b/0x4a0 [ 5536.123697] [] ? process_one_work+0x710/0x710 [ 5536.130426] [] kthread+0xfe/0x120 [ 5536.135991] [] ret_from_fork+0x1f/0x40 [ 5536.142041] [] ? kthread_create_on_node+0x230/0x230 One cgroup maintainer mentioned that "cgroup is trying to offline a cpuset css, which takes place under cgroup_mutex. The offlining ends up trying to drain active usages of a sysctl table which apprently is not happening." The real reason is that proc_sys_readdir doesn't drop reference added by grab_header when return from !dir_emit_dots path. So this cpuset offline path will wait here forever. See here for details: http://www.openwall.com/lists/oss-security/2016/11/04/13 Fixes: f0c3b5093add ("[readdir] convert procfs") Cc: stable@vger.kernel.org Reported-by: CAI Qian Tested-by: Yang Shukui Signed-off-by: Zhou Chengming Acked-by: Al Viro Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 55313d994895..d4e37acd4821 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -709,7 +709,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) - return 0; + goto out; pos = 2; @@ -719,6 +719,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) break; } } +out: sysctl_head_finish(head); return 0; } From 21d25f6a4217e755906cb548b55ddab39d0e88b9 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Wed, 4 Jan 2017 01:22:52 -0800 Subject: [PATCH 157/297] dmaengine: iota: ioat_alloc_chan_resources should not perform sleeping allocations. On a kernel with DEBUG_LOCKS, ioat_free_chan_resources triggers an in_interrupt() warning. With PROVE_LOCKING, it reports detecting a SOFTIRQ-safe to SOFTIRQ-unsafe lock ordering in the same code path. This is because dma_generic_alloc_coherent() checks if the GFP flags permit blocking. It allocates from different subsystems if blocking is permitted. The free path knows how to return the memory to the correct allocator. If GFP_KERNEL is specified then the alloc and free end up going through cma_alloc(), which uses mutexes. Given that ioat_free_chan_resources() can be called in interrupt context, ioat_alloc_chan_resources() must specify GFP_NOWAIT so that the allocations do not block and instead use an allocator that uses spinlocks. Signed-off-by: Krister Johansen Acked-by: Dave Jiang Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index ace5cb2cb12f..cc5259b881d4 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -700,7 +700,7 @@ static int ioat_alloc_chan_resources(struct dma_chan *c) /* doing 2 32bit writes to mmio since 1 64b write doesn't work */ ioat_chan->completion = dma_pool_zalloc(ioat_chan->ioat_dma->completion_pool, - GFP_KERNEL, &ioat_chan->completion_dma); + GFP_NOWAIT, &ioat_chan->completion_dma); if (!ioat_chan->completion) return -ENOMEM; @@ -710,7 +710,7 @@ static int ioat_alloc_chan_resources(struct dma_chan *c) ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH); order = IOAT_MAX_ORDER; - ring = ioat_alloc_ring(c, order, GFP_KERNEL); + ring = ioat_alloc_ring(c, order, GFP_NOWAIT); if (!ring) return -ENOMEM; From 527a27591312e4b3a0f8179f321f9e85c0850df0 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 9 Jan 2017 16:50:52 +0200 Subject: [PATCH 158/297] dmaengine: omap-dma: Fix the port_window support We do not yet have users of port_window. The following errors were found when converting the tusb6010_omap.c musb driver: - The peripheral side must have SRC_/DST_PACKED disabled - when configuring the burst for the peripheral side the memory side configuration were overwritten: d->csdp = ... -> d->csdp |= ... - The EI and FI were configured for the wrong sides of the transfers. With these changes and the converted tus6010_omap.c I was able to verify that things are working as they expected to work. Fixes: 201ac4861c19 ("dmaengine: omap-dma: Support for slave devices with data port window") Signed-off-by: Peter Ujfalusi Signed-off-by: Vinod Koul --- drivers/dma/omap-dma.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/drivers/dma/omap-dma.c b/drivers/dma/omap-dma.c index 4ad101a47e0a..daf479cce691 100644 --- a/drivers/dma/omap-dma.c +++ b/drivers/dma/omap-dma.c @@ -938,6 +938,23 @@ static struct dma_async_tx_descriptor *omap_dma_prep_slave_sg( d->ccr |= CCR_DST_AMODE_POSTINC; if (port_window) { d->ccr |= CCR_SRC_AMODE_DBLIDX; + + if (port_window_bytes >= 64) + d->csdp |= CSDP_SRC_BURST_64; + else if (port_window_bytes >= 32) + d->csdp |= CSDP_SRC_BURST_32; + else if (port_window_bytes >= 16) + d->csdp |= CSDP_SRC_BURST_16; + + } else { + d->ccr |= CCR_SRC_AMODE_CONSTANT; + } + } else { + d->csdp = CSDP_SRC_BURST_64 | CSDP_SRC_PACKED; + + d->ccr |= CCR_SRC_AMODE_POSTINC; + if (port_window) { + d->ccr |= CCR_DST_AMODE_DBLIDX; d->ei = 1; /* * One frame covers the port_window and by configure @@ -948,27 +965,11 @@ static struct dma_async_tx_descriptor *omap_dma_prep_slave_sg( d->fi = -(port_window_bytes - 1); if (port_window_bytes >= 64) - d->csdp = CSDP_SRC_BURST_64 | CSDP_SRC_PACKED; + d->csdp |= CSDP_DST_BURST_64; else if (port_window_bytes >= 32) - d->csdp = CSDP_SRC_BURST_32 | CSDP_SRC_PACKED; + d->csdp |= CSDP_DST_BURST_32; else if (port_window_bytes >= 16) - d->csdp = CSDP_SRC_BURST_16 | CSDP_SRC_PACKED; - } else { - d->ccr |= CCR_SRC_AMODE_CONSTANT; - } - } else { - d->csdp = CSDP_SRC_BURST_64 | CSDP_SRC_PACKED; - - d->ccr |= CCR_SRC_AMODE_POSTINC; - if (port_window) { - d->ccr |= CCR_DST_AMODE_DBLIDX; - - if (port_window_bytes >= 64) - d->csdp = CSDP_DST_BURST_64 | CSDP_DST_PACKED; - else if (port_window_bytes >= 32) - d->csdp = CSDP_DST_BURST_32 | CSDP_DST_PACKED; - else if (port_window_bytes >= 16) - d->csdp = CSDP_DST_BURST_16 | CSDP_DST_PACKED; + d->csdp |= CSDP_DST_BURST_16; } else { d->ccr |= CCR_DST_AMODE_CONSTANT; } @@ -1017,7 +1018,7 @@ static struct dma_async_tx_descriptor *omap_dma_prep_slave_sg( osg->addr = sg_dma_address(sgent); osg->en = en; osg->fn = sg_dma_len(sgent) / frame_bytes; - if (port_window && dir == DMA_MEM_TO_DEV) { + if (port_window && dir == DMA_DEV_TO_MEM) { osg->ei = 1; /* * One frame covers the port_window and by configure From 497de07d89c1410d76a15bec2bb41f24a2a89f31 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 9 Jan 2017 09:34:48 +0800 Subject: [PATCH 159/297] tmpfs: clear S_ISGID when setting posix ACLs This change was missed the tmpfs modification in In CVE-2016-7097 commit 073931017b49 ("posix_acl: Clear SGID bit when setting file permissions") It can test by xfstest generic/375, which failed to clear setgid bit in the following test case on tmpfs: touch $testfile chown 100:100 $testfile chmod 2755 $testfile _runas -u 100 -g 101 -- setfacl -m u::rwx,g::rwx,o::rwx $testfile Signed-off-by: Gu Zheng Signed-off-by: Al Viro --- fs/posix_acl.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 595522022aca..c9d48dc78495 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -922,11 +922,10 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) int error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - return 0; - if (error == 0) - acl = NULL; + error = posix_acl_update_mode(inode, + &inode->i_mode, &acl); + if (error) + return error; } inode->i_ctime = current_time(inode); From 2e40795c3bf344cfb5220d94566205796e3ef19a Mon Sep 17 00:00:00 2001 From: Dennis Kadioglu Date: Mon, 9 Jan 2017 17:10:46 +0100 Subject: [PATCH 160/297] ALSA: usb-audio: Add a quirk for Plantronics BT600 Plantronics BT600 does not support reading the sample rate which leads to many lines of "cannot get freq at ep 0x1" and "cannot get freq at ep 0x82". This patch adds the USB ID of the BT600 to quirks.c and avoids those error messages. Signed-off-by: Dennis Kadioglu Cc: Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index b3fd2382fdd9..eb4b9f7a571e 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1135,6 +1135,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x045E, 0x076F): /* MS Lifecam HD-6000 */ case USB_ID(0x045E, 0x0772): /* MS Lifecam Studio */ case USB_ID(0x045E, 0x0779): /* MS Lifecam HD-3000 */ + case USB_ID(0x047F, 0x02F7): /* Plantronics BT-600 */ case USB_ID(0x047F, 0x0415): /* Plantronics BT-300 */ case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ From 146cc8a17a3b4996f6805ee5c080e7101277c410 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 10 Jan 2017 12:05:37 +0100 Subject: [PATCH 161/297] USB: serial: kl5kusb105: fix line-state error handling The current implementation failed to detect short transfers when attempting to read the line state, and also, to make things worse, logged the content of the uninitialised heap transfer buffer. Fixes: abf492e7b3ae ("USB: kl5kusb105: fix DMA buffers on stack") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/kl5kusb105.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c index 0ee190fc1bf8..6cb45757818f 100644 --- a/drivers/usb/serial/kl5kusb105.c +++ b/drivers/usb/serial/kl5kusb105.c @@ -192,10 +192,11 @@ static int klsi_105_get_line_state(struct usb_serial_port *port, status_buf, KLSI_STATUSBUF_LEN, 10000 ); - if (rc < 0) - dev_err(&port->dev, "Reading line status failed (error = %d)\n", - rc); - else { + if (rc != KLSI_STATUSBUF_LEN) { + dev_err(&port->dev, "reading line status failed: %d\n", rc); + if (rc >= 0) + rc = -EIO; + } else { status = get_unaligned_le16(status_buf); dev_info(&port->serial->dev->dev, "read status %x %x\n", From 620f1a632ebcc9811c2f8009ba52297c7006f805 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Dec 2016 18:50:13 -0800 Subject: [PATCH 162/297] wusbcore: Fix one more crypto-on-the-stack bug The driver put a constant buffer of all zeros on the stack and pointed a scatterlist entry at it. This doesn't work with virtual stacks. Use ZERO_PAGE instead. Cc: stable@vger.kernel.org # 4.9 only Reported-by: Eric Biggers Signed-off-by: Andy Lutomirski Signed-off-by: Greg Kroah-Hartman --- drivers/usb/wusbcore/crypto.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/wusbcore/crypto.c b/drivers/usb/wusbcore/crypto.c index 79451f7ef1b7..062c205f0046 100644 --- a/drivers/usb/wusbcore/crypto.c +++ b/drivers/usb/wusbcore/crypto.c @@ -216,7 +216,6 @@ static int wusb_ccm_mac(struct crypto_skcipher *tfm_cbc, struct scatterlist sg[4], sg_dst; void *dst_buf; size_t dst_size; - const u8 bzero[16] = { 0 }; u8 iv[crypto_skcipher_ivsize(tfm_cbc)]; size_t zero_padding; @@ -261,7 +260,7 @@ static int wusb_ccm_mac(struct crypto_skcipher *tfm_cbc, sg_set_buf(&sg[1], &scratch->b1, sizeof(scratch->b1)); sg_set_buf(&sg[2], b, blen); /* 0 if well behaved :) */ - sg_set_buf(&sg[3], bzero, zero_padding); + sg_set_page(&sg[3], ZERO_PAGE(0), zero_padding, 0); sg_init_one(&sg_dst, dst_buf, dst_size); skcipher_request_set_tfm(req, tfm_cbc); From dc647ec88e029307e60e6bf9988056605f11051a Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 10 Jan 2017 09:30:51 +0100 Subject: [PATCH 163/297] net: socket: Make unnecessarily global sockfs_setattr() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sockfs_setattr() static as it is not used outside of net/socket.c This fixes the following GCC warning: net/socket.c:534:5: warning: no previous prototype for ‘sockfs_setattr’ [-Wmissing-prototypes] Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Cc: Lorenzo Colitti Signed-off-by: Tobias Klauser Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index a8c2307590b8..0758e13754e2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -533,7 +533,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, return used; } -int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) +static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); From e864212078ded276bdb272b2e0ee6a979357ca8a Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 23 Dec 2016 11:37:53 +0100 Subject: [PATCH 164/297] target: add XCOPY target/segment desc sense codes As defined in http://www.t10.org/lists/asc-num.htm. To be used during validation of XCOPY target and segment descriptor lists. Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_transport.c | 24 ++++++++++++++++++++++++ include/target/target_core_base.h | 4 ++++ 2 files changed, 28 insertions(+) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 7dfefd66df93..1cadc9eefa21 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1693,6 +1693,10 @@ void transport_generic_request_failure(struct se_cmd *cmd, case TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED: case TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED: case TCM_COPY_TARGET_DEVICE_NOT_REACHABLE: + case TCM_TOO_MANY_TARGET_DESCS: + case TCM_UNSUPPORTED_TARGET_DESC_TYPE_CODE: + case TCM_TOO_MANY_SEGMENT_DESCS: + case TCM_UNSUPPORTED_SEGMENT_DESC_TYPE_CODE: break; case TCM_OUT_OF_RESOURCES: sense_reason = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -2808,6 +2812,26 @@ static const struct sense_info sense_info_table[] = { .key = ILLEGAL_REQUEST, .asc = 0x26, /* INVALID FIELD IN PARAMETER LIST */ }, + [TCM_TOO_MANY_TARGET_DESCS] = { + .key = ILLEGAL_REQUEST, + .asc = 0x26, + .ascq = 0x06, /* TOO MANY TARGET DESCRIPTORS */ + }, + [TCM_UNSUPPORTED_TARGET_DESC_TYPE_CODE] = { + .key = ILLEGAL_REQUEST, + .asc = 0x26, + .ascq = 0x07, /* UNSUPPORTED TARGET DESCRIPTOR TYPE CODE */ + }, + [TCM_TOO_MANY_SEGMENT_DESCS] = { + .key = ILLEGAL_REQUEST, + .asc = 0x26, + .ascq = 0x08, /* TOO MANY SEGMENT DESCRIPTORS */ + }, + [TCM_UNSUPPORTED_SEGMENT_DESC_TYPE_CODE] = { + .key = ILLEGAL_REQUEST, + .asc = 0x26, + .ascq = 0x09, /* UNSUPPORTED SEGMENT DESCRIPTOR TYPE CODE */ + }, [TCM_PARAMETER_LIST_LENGTH_ERROR] = { .key = ILLEGAL_REQUEST, .asc = 0x1a, /* PARAMETER LIST LENGTH ERROR */ diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 29e6858bb164..43edf82e54ff 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -174,6 +174,10 @@ enum tcm_sense_reason_table { TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED = R(0x16), TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED = R(0x17), TCM_COPY_TARGET_DEVICE_NOT_REACHABLE = R(0x18), + TCM_TOO_MANY_TARGET_DESCS = R(0x19), + TCM_UNSUPPORTED_TARGET_DESC_TYPE_CODE = R(0x1a), + TCM_TOO_MANY_SEGMENT_DESCS = R(0x1b), + TCM_UNSUPPORTED_SEGMENT_DESC_TYPE_CODE = R(0x1c), #undef R }; From 61c359194c46cbffec9a1f2c59c1c4011222ad84 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 23 Dec 2016 11:37:54 +0100 Subject: [PATCH 165/297] target: use XCOPY TOO MANY TARGET DESCRIPTORS sense spc4r37 6.4.3.4 states: If the number of CSCD descriptors exceeds the allowed number, the copy manager shall terminate the command with CHECK CONDITION status, with the sense key set to ILLEGAL REQUEST, and the additional sense code set to TOO MANY TARGET DESCRIPTORS. LIO currently responds with INVALID FIELD IN PARAMETER LIST, which sees it fail the libiscsi ExtendedCopy.DescrLimits test. Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 37d5caebffa6..db265ad10fa4 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -201,9 +201,11 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, " multiple of %d\n", XCOPY_TARGET_DESC_LEN); return -EINVAL; } - if (tdll > 64) { + if (tdll > RCR_OP_MAX_TARGET_DESC_COUNT * XCOPY_TARGET_DESC_LEN) { pr_err("XCOPY target descriptor supports a maximum" " two src/dest descriptors, tdll: %hu too large..\n", tdll); + /* spc4r37 6.4.3.4 CSCD DESCRIPTOR LIST LENGTH field */ + *sense_ret = TCM_TOO_MANY_TARGET_DESCS; return -EINVAL; } /* From af9f62c1686268c0517b289274d38f3a03bebd2a Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 23 Dec 2016 11:37:55 +0100 Subject: [PATCH 166/297] target: bounds check XCOPY segment descriptor list Check the length of the XCOPY request segment descriptor list against the value advertised via the MAXIMUM SEGMENT DESCRIPTOR COUNT field in the RECEIVE COPY OPERATING PARAMETERS response. spc4r37 6.4.3.5 states: If the number of segment descriptors exceeds the allowed number, the copy manager shall terminate the command with CHECK CONDITION status, with the sense key set to ILLEGAL REQUEST, and the additional sense code set to TOO MANY SEGMENT DESCRIPTORS. This functionality is testable using the libiscsi ExtendedCopy.DescrLimits test. Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index db265ad10fa4..da0f2da732e7 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -308,17 +308,26 @@ static int target_xcopy_parse_segdesc_02(struct se_cmd *se_cmd, struct xcopy_op static int target_xcopy_parse_segment_descriptors(struct se_cmd *se_cmd, struct xcopy_op *xop, unsigned char *p, - unsigned int sdll) + unsigned int sdll, sense_reason_t *sense_ret) { unsigned char *desc = p; unsigned int start = 0; int offset = sdll % XCOPY_SEGMENT_DESC_LEN, rc, ret = 0; + *sense_ret = TCM_INVALID_PARAMETER_LIST; + if (offset != 0) { pr_err("XCOPY segment descriptor list length is not" " multiple of %d\n", XCOPY_SEGMENT_DESC_LEN); return -EINVAL; } + if (sdll > RCR_OP_MAX_SG_DESC_COUNT * XCOPY_SEGMENT_DESC_LEN) { + pr_err("XCOPY supports %u segment descriptor(s), sdll: %u too" + " large..\n", RCR_OP_MAX_SG_DESC_COUNT, sdll); + /* spc4r37 6.4.3.5 SEGMENT DESCRIPTOR LIST LENGTH field */ + *sense_ret = TCM_TOO_MANY_SEGMENT_DESCS; + return -EINVAL; + } while (start < sdll) { /* @@ -916,7 +925,8 @@ sense_reason_t target_do_xcopy(struct se_cmd *se_cmd) seg_desc = &p[16]; seg_desc += (rc * XCOPY_TARGET_DESC_LEN); - rc = target_xcopy_parse_segment_descriptors(se_cmd, xop, seg_desc, sdll); + rc = target_xcopy_parse_segment_descriptors(se_cmd, xop, seg_desc, + sdll, &ret); if (rc <= 0) { xcopy_pt_undepend_remotedev(xop); goto out; From 7d38706669ce00603b187f667a4eb67c94eac098 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 23 Dec 2016 11:37:56 +0100 Subject: [PATCH 167/297] target: bounds check XCOPY total descriptor list length spc4r37 6.4.3.5 states: If the combined length of the CSCD descriptors and segment descriptors exceeds the allowed value, then the copy manager shall terminate the command with CHECK CONDITION status, with the sense key set to ILLEGAL REQUEST, and the additional sense code set to PARAMETER LIST LENGTH ERROR. This functionality can be tested using the libiscsi ExtendedCopy.DescrLimits test. Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index da0f2da732e7..0d10fcf438d1 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -894,6 +894,12 @@ sense_reason_t target_do_xcopy(struct se_cmd *se_cmd) */ tdll = get_unaligned_be16(&p[2]); sdll = get_unaligned_be32(&p[8]); + if (tdll + sdll > RCR_OP_MAX_DESC_LIST_LEN) { + pr_err("XCOPY descriptor list length %u exceeds maximum %u\n", + tdll + sdll, RCR_OP_MAX_DESC_LIST_LEN); + ret = TCM_PARAMETER_LIST_LENGTH_ERROR; + goto out; + } inline_dl = get_unaligned_be32(&p[12]); if (inline_dl != 0) { From c243849720ac237e9e7191fe57f619bb3a871d4c Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 2 Jan 2017 18:04:04 +0100 Subject: [PATCH 168/297] target: return UNSUPPORTED TARGET/SEGMENT DESC TYPE CODE sense Use UNSUPPORTED TARGET DESCRIPTOR TYPE CODE and UNSUPPORTED SEGMENT DESCRIPTOR TYPE CODE additional sense codes if a descriptor type in an XCOPY request is not supported, as specified in spc4r37 6.4.5 and 6.4.6. Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 0d10fcf438d1..52738a108e5f 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -199,6 +199,7 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, if (offset != 0) { pr_err("XCOPY target descriptor list length is not" " multiple of %d\n", XCOPY_TARGET_DESC_LEN); + *sense_ret = TCM_UNSUPPORTED_TARGET_DESC_TYPE_CODE; return -EINVAL; } if (tdll > RCR_OP_MAX_TARGET_DESC_COUNT * XCOPY_TARGET_DESC_LEN) { @@ -240,6 +241,7 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, default: pr_err("XCOPY unsupported descriptor type code:" " 0x%02x\n", desc[0]); + *sense_ret = TCM_UNSUPPORTED_TARGET_DESC_TYPE_CODE; goto out; } } @@ -319,6 +321,7 @@ static int target_xcopy_parse_segment_descriptors(struct se_cmd *se_cmd, if (offset != 0) { pr_err("XCOPY segment descriptor list length is not" " multiple of %d\n", XCOPY_SEGMENT_DESC_LEN); + *sense_ret = TCM_UNSUPPORTED_SEGMENT_DESC_TYPE_CODE; return -EINVAL; } if (sdll > RCR_OP_MAX_SG_DESC_COUNT * XCOPY_SEGMENT_DESC_LEN) { @@ -346,6 +349,7 @@ static int target_xcopy_parse_segment_descriptors(struct se_cmd *se_cmd, default: pr_err("XCOPY unsupported segment descriptor" "type: 0x%02x\n", desc[0]); + *sense_ret = TCM_UNSUPPORTED_SEGMENT_DESC_TYPE_CODE; goto out; } } From 94aae4caacda89a1bdb7198b260f4ca3595b7ed7 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 2 Jan 2017 18:04:05 +0100 Subject: [PATCH 169/297] target: simplify XCOPY wwn->se_dev lookup helper target_xcopy_locate_se_dev_e4() is used to locate an se_dev, based on the WWN provided with the XCOPY request. Remove a couple of unneeded arguments, and rely on the caller for the src/dst test. Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 52738a108e5f..155db18ee4e7 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -53,18 +53,13 @@ static int target_xcopy_gen_naa_ieee(struct se_device *dev, unsigned char *buf) return 0; } -static int target_xcopy_locate_se_dev_e4(struct se_cmd *se_cmd, struct xcopy_op *xop, - bool src) +static int target_xcopy_locate_se_dev_e4(const unsigned char *dev_wwn, + struct se_device **found_dev) { struct se_device *se_dev; - unsigned char tmp_dev_wwn[XCOPY_NAA_IEEE_REGEX_LEN], *dev_wwn; + unsigned char tmp_dev_wwn[XCOPY_NAA_IEEE_REGEX_LEN]; int rc; - if (src) - dev_wwn = &xop->dst_tid_wwn[0]; - else - dev_wwn = &xop->src_tid_wwn[0]; - mutex_lock(&g_device_mutex); list_for_each_entry(se_dev, &g_device_list, g_dev_node) { @@ -78,15 +73,8 @@ static int target_xcopy_locate_se_dev_e4(struct se_cmd *se_cmd, struct xcopy_op if (rc != 0) continue; - if (src) { - xop->dst_dev = se_dev; - pr_debug("XCOPY 0xe4: Setting xop->dst_dev: %p from located" - " se_dev\n", xop->dst_dev); - } else { - xop->src_dev = se_dev; - pr_debug("XCOPY 0xe4: Setting xop->src_dev: %p from located" - " se_dev\n", xop->src_dev); - } + *found_dev = se_dev; + pr_debug("XCOPY 0xe4: located se_dev: %p\n", se_dev); rc = target_depend_item(&se_dev->dev_group.cg_item); if (rc != 0) { @@ -247,9 +235,11 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, } if (xop->op_origin == XCOL_SOURCE_RECV_OP) - rc = target_xcopy_locate_se_dev_e4(se_cmd, xop, true); + rc = target_xcopy_locate_se_dev_e4(xop->dst_tid_wwn, + &xop->dst_dev); else - rc = target_xcopy_locate_se_dev_e4(se_cmd, xop, false); + rc = target_xcopy_locate_se_dev_e4(xop->src_tid_wwn, + &xop->src_dev); /* * If a matching IEEE NAA 0x83 descriptor for the requested device * is not located on this node, return COPY_ABORTED with ASQ/ASQC From f184210bca6c9d0091ff5e5629dea4cbb8a17c0f Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 2 Jan 2017 18:04:06 +0100 Subject: [PATCH 170/297] target: check XCOPY segment descriptor CSCD IDs Ensure that the segment descriptor CSCD descriptor ID values correspond to CSCD descriptor entries located in the XCOPY command parameter list. SPC4r37 6.4.6.1 Table 150 specifies this range as 0000h to 07FFh, where the CSCD descriptor location in the parameter list can be located via: 16 + (id * 32) Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig [ bvanassche: inserted "; " in the format string of an error message and also moved a "||" operator from the start of a line to the end of the previous line ] Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 8 ++++++++ drivers/target/target_core_xcopy.h | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 155db18ee4e7..41a2a8ad1046 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -278,6 +278,14 @@ static int target_xcopy_parse_segdesc_02(struct se_cmd *se_cmd, struct xcopy_op xop->stdi = get_unaligned_be16(&desc[4]); xop->dtdi = get_unaligned_be16(&desc[6]); + + if (xop->stdi > XCOPY_CSCD_DESC_ID_LIST_OFF_MAX || + xop->dtdi > XCOPY_CSCD_DESC_ID_LIST_OFF_MAX) { + pr_err("XCOPY segment desc 0x02: unsupported CSCD ID > 0x%x; stdi: %hu dtdi: %hu\n", + XCOPY_CSCD_DESC_ID_LIST_OFF_MAX, xop->stdi, xop->dtdi); + return -EINVAL; + } + pr_debug("XCOPY seg desc 0x02: desc_len: %hu stdi: %hu dtdi: %hu, DC: %d\n", desc_len, xop->stdi, xop->dtdi, dc); diff --git a/drivers/target/target_core_xcopy.h b/drivers/target/target_core_xcopy.h index 4d3d4dd060f2..e2d141140342 100644 --- a/drivers/target/target_core_xcopy.h +++ b/drivers/target/target_core_xcopy.h @@ -5,6 +5,12 @@ #define XCOPY_NAA_IEEE_REGEX_LEN 16 #define XCOPY_MAX_SECTORS 1024 +/* + * SPC4r37 6.4.6.1 + * Table 150 — CSCD descriptor ID values + */ +#define XCOPY_CSCD_DESC_ID_LIST_OFF_MAX 0x07FF + enum xcopy_origin_list { XCOL_SOURCE_RECV_OP = 0x01, XCOL_DEST_RECV_OP = 0x02, From 66640d35c1e4ef3c96ba5edb3c5e2ff8ab812e7a Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 2 Jan 2017 18:04:07 +0100 Subject: [PATCH 171/297] target: use XCOPY segment descriptor CSCD IDs The XCOPY specification in SPC4r37 states that the XCOPY source and destination device(s) should be derived from the copy source and copy destination (CSCD) descriptor IDs in the XCOPY segment descriptor. The CSCD IDs are generally (for block -> block copies), indexes into the corresponding CSCD descriptor list, e.g. ================================= EXTENDED COPY Header ================================= CSCD Descriptor List - entry 0 + LU ID <--------------<------------------\ - entry 1 | + LU ID <______________<_____________ | ================================= | | Segment Descriptor List | | - segment 0 | | + src CSCD ID = 0 --------->---------+----/ + dest CSCD ID = 1 ___________>______| + len + src lba + dest lba ================================= Currently LIO completely ignores the src and dest CSCD IDs in the Segment Descriptor List, and instead assumes that the first entry in the CSCD list corresponds to the source, and the second to the destination. This commit removes this assumption, by ensuring that the Segment Descriptor List is parsed prior to processing the CSCD Descriptor List. CSCD Descriptor List processing is modified to compare the current list index with the previously obtained src and dest CSCD IDs. Additionally, XCOPY requests where the src and dest CSCD IDs refer to the CSCD Descriptor List entry can now be successfully processed. Fixes: cbf031f ("target: Add support for EXTENDED_COPY copy offload") Link: https://bugzilla.kernel.org/show_bug.cgi?id=191381 Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 79 ++++++++++++++++++------------ 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 41a2a8ad1046..2595c1eb9e91 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -98,7 +98,7 @@ static int target_xcopy_locate_se_dev_e4(const unsigned char *dev_wwn, } static int target_xcopy_parse_tiddesc_e4(struct se_cmd *se_cmd, struct xcopy_op *xop, - unsigned char *p, bool src) + unsigned char *p, unsigned short cscd_index) { unsigned char *desc = p; unsigned short ript; @@ -143,7 +143,13 @@ static int target_xcopy_parse_tiddesc_e4(struct se_cmd *se_cmd, struct xcopy_op return -EINVAL; } - if (src) { + if (cscd_index != xop->stdi && cscd_index != xop->dtdi) { + pr_debug("XCOPY 0xe4: ignoring CSCD entry %d - neither src nor " + "dest\n", cscd_index); + return 0; + } + + if (cscd_index == xop->stdi) { memcpy(&xop->src_tid_wwn[0], &desc[8], XCOPY_NAA_IEEE_REGEX_LEN); /* * Determine if the source designator matches the local device @@ -155,10 +161,15 @@ static int target_xcopy_parse_tiddesc_e4(struct se_cmd *se_cmd, struct xcopy_op pr_debug("XCOPY 0xe4: Set xop->src_dev %p from source" " received xop\n", xop->src_dev); } - } else { + } + + if (cscd_index == xop->dtdi) { memcpy(&xop->dst_tid_wwn[0], &desc[8], XCOPY_NAA_IEEE_REGEX_LEN); /* - * Determine if the destination designator matches the local device + * Determine if the destination designator matches the local + * device. If @cscd_index corresponds to both source (stdi) and + * destination (dtdi), or dtdi comes after stdi, then + * XCOL_DEST_RECV_OP wins. */ if (!memcmp(&xop->local_dev_wwn[0], &xop->dst_tid_wwn[0], XCOPY_NAA_IEEE_REGEX_LEN)) { @@ -178,9 +189,9 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, { struct se_device *local_dev = se_cmd->se_dev; unsigned char *desc = p; - int offset = tdll % XCOPY_TARGET_DESC_LEN, rc, ret = 0; + int offset = tdll % XCOPY_TARGET_DESC_LEN, rc; + unsigned short cscd_index = 0; unsigned short start = 0; - bool src = true; *sense_ret = TCM_INVALID_PARAMETER_LIST; @@ -206,25 +217,19 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, while (start < tdll) { /* - * Check target descriptor identification with 0xE4 type with - * use VPD 0x83 WWPN matching .. + * Check target descriptor identification with 0xE4 type, and + * compare the current index with the CSCD descriptor IDs in + * the segment descriptor. Use VPD 0x83 WWPN matching .. */ switch (desc[0]) { case 0xe4: rc = target_xcopy_parse_tiddesc_e4(se_cmd, xop, - &desc[0], src); + &desc[0], cscd_index); if (rc != 0) goto out; - /* - * Assume target descriptors are in source -> destination order.. - */ - if (src) - src = false; - else - src = true; start += XCOPY_TARGET_DESC_LEN; desc += XCOPY_TARGET_DESC_LEN; - ret++; + cscd_index++; break; default: pr_err("XCOPY unsupported descriptor type code:" @@ -234,12 +239,21 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, } } - if (xop->op_origin == XCOL_SOURCE_RECV_OP) + switch (xop->op_origin) { + case XCOL_SOURCE_RECV_OP: rc = target_xcopy_locate_se_dev_e4(xop->dst_tid_wwn, &xop->dst_dev); - else + break; + case XCOL_DEST_RECV_OP: rc = target_xcopy_locate_se_dev_e4(xop->src_tid_wwn, &xop->src_dev); + break; + default: + pr_err("XCOPY CSCD descriptor IDs not found in CSCD list - " + "stdi: %hu dtdi: %hu\n", xop->stdi, xop->dtdi); + rc = -EINVAL; + break; + } /* * If a matching IEEE NAA 0x83 descriptor for the requested device * is not located on this node, return COPY_ABORTED with ASQ/ASQC @@ -256,7 +270,7 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, pr_debug("XCOPY TGT desc: Dest dev: %p NAA IEEE WWN: 0x%16phN\n", xop->dst_dev, &xop->dst_tid_wwn[0]); - return ret; + return cscd_index; out: return -EINVAL; @@ -913,6 +927,20 @@ sense_reason_t target_do_xcopy(struct se_cmd *se_cmd) " tdll: %hu sdll: %u inline_dl: %u\n", list_id, list_id_usage, tdll, sdll, inline_dl); + /* + * skip over the target descriptors until segment descriptors + * have been passed - CSCD ids are needed to determine src and dest. + */ + seg_desc = &p[16] + tdll; + + rc = target_xcopy_parse_segment_descriptors(se_cmd, xop, seg_desc, + sdll, &ret); + if (rc <= 0) + goto out; + + pr_debug("XCOPY: Processed %d segment descriptors, length: %u\n", rc, + rc * XCOPY_SEGMENT_DESC_LEN); + rc = target_xcopy_parse_target_descriptors(se_cmd, xop, &p[16], tdll, &ret); if (rc <= 0) goto out; @@ -930,19 +958,8 @@ sense_reason_t target_do_xcopy(struct se_cmd *se_cmd) pr_debug("XCOPY: Processed %d target descriptors, length: %u\n", rc, rc * XCOPY_TARGET_DESC_LEN); - seg_desc = &p[16]; - seg_desc += (rc * XCOPY_TARGET_DESC_LEN); - - rc = target_xcopy_parse_segment_descriptors(se_cmd, xop, seg_desc, - sdll, &ret); - if (rc <= 0) { - xcopy_pt_undepend_remotedev(xop); - goto out; - } transport_kunmap_data_sg(se_cmd); - pr_debug("XCOPY: Processed %d segment descriptors, length: %u\n", rc, - rc * XCOPY_SEGMENT_DESC_LEN); INIT_WORK(&xop->xop_work, target_xcopy_do_work); queue_work(xcopy_wq, &xop->xop_work); return TCM_NO_SENSE; From f94fd098f674b78c29f482da1999d8de0c93c74e Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 2 Jan 2017 18:04:08 +0100 Subject: [PATCH 172/297] target: check for XCOPY parameter truncation Check for XCOPY header, CSCD descriptor and segment descriptor list truncation, and respond accordingly. SPC4r37 6.4.1 EXTENDED COPY(LID4) states (also applying to LID1 reqs): If the parameter list length causes truncation of the parameter list, then the copy manager shall transfer no data and shall terminate the EXTENDED COPY command with CHECK CONDITION status, with the sense key set to ILLEGAL REQUEST, and the additional sense code set to PARAMETER LIST LENGTH ERROR. This behaviour can be tested using the libiscsi ExtendedCopy.ParamHdr test. Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 14 ++++++++++++++ drivers/target/target_core_xcopy.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 2595c1eb9e91..a9a6462c66d1 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -888,6 +888,12 @@ sense_reason_t target_do_xcopy(struct se_cmd *se_cmd) return TCM_UNSUPPORTED_SCSI_OPCODE; } + if (se_cmd->data_length < XCOPY_HDR_LEN) { + pr_err("XCOPY parameter truncation: length %u < hdr_len %u\n", + se_cmd->data_length, XCOPY_HDR_LEN); + return TCM_PARAMETER_LIST_LENGTH_ERROR; + } + xop = kzalloc(sizeof(struct xcopy_op), GFP_KERNEL); if (!xop) { pr_err("Unable to allocate xcopy_op\n"); @@ -923,6 +929,14 @@ sense_reason_t target_do_xcopy(struct se_cmd *se_cmd) goto out; } + if (se_cmd->data_length < (XCOPY_HDR_LEN + tdll + sdll + inline_dl)) { + pr_err("XCOPY parameter truncation: data length %u too small " + "for tdll: %hu sdll: %u inline_dl: %u\n", + se_cmd->data_length, tdll, sdll, inline_dl); + ret = TCM_PARAMETER_LIST_LENGTH_ERROR; + goto out; + } + pr_debug("Processing XCOPY with list_id: 0x%02x list_id_usage: 0x%02x" " tdll: %hu sdll: %u inline_dl: %u\n", list_id, list_id_usage, tdll, sdll, inline_dl); diff --git a/drivers/target/target_core_xcopy.h b/drivers/target/target_core_xcopy.h index e2d141140342..7c0b105cbe1b 100644 --- a/drivers/target/target_core_xcopy.h +++ b/drivers/target/target_core_xcopy.h @@ -1,5 +1,6 @@ #include +#define XCOPY_HDR_LEN 16 #define XCOPY_TARGET_DESC_LEN 32 #define XCOPY_SEGMENT_DESC_LEN 28 #define XCOPY_NAA_IEEE_REGEX_LEN 16 From 87156518da94a696f2b27ab8945d531af2f1d339 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 2 Jan 2017 18:04:09 +0100 Subject: [PATCH 173/297] target: support XCOPY requests without parameters SPC4r37 6.4.1 EXTENDED COPY(LID4) states (also applying to LID1 reqs): A parameter list length of zero specifies that the copy manager shall not transfer any data or alter any internal state, and this shall not be considered an error. This behaviour can be tested using the libiscsi ExtendedCopy.ParamHdr test. Signed-off-by: David Disseldorp Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/target/target_core_xcopy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index a9a6462c66d1..d828b3b5000b 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -888,6 +888,10 @@ sense_reason_t target_do_xcopy(struct se_cmd *se_cmd) return TCM_UNSUPPORTED_SCSI_OPCODE; } + if (se_cmd->data_length == 0) { + target_complete_cmd(se_cmd, SAM_STAT_GOOD); + return TCM_NO_SENSE; + } if (se_cmd->data_length < XCOPY_HDR_LEN) { pr_err("XCOPY parameter truncation: length %u < hdr_len %u\n", se_cmd->data_length, XCOPY_HDR_LEN); From 8fb280616878b81c0790a0c33acbeec59c5711f4 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 10 Jan 2017 17:04:06 +0800 Subject: [PATCH 174/297] r8152: split rtl8152_suspend function Split rtl8152_suspend() into rtl8152_system_suspend() and rtl8152_rumtime_suspend(). Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 57 +++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 7dc61228c55b..c5e6d88de4e4 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3576,39 +3576,62 @@ static bool delay_autosuspend(struct r8152 *tp) return false; } -static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message) +static int rtl8152_rumtime_suspend(struct r8152 *tp) { - struct r8152 *tp = usb_get_intfdata(intf); struct net_device *netdev = tp->netdev; int ret = 0; - mutex_lock(&tp->control); - - if (PMSG_IS_AUTO(message)) { - if (netif_running(netdev) && delay_autosuspend(tp)) { + if (netif_running(netdev) && test_bit(WORK_ENABLE, &tp->flags)) { + if (delay_autosuspend(tp)) { ret = -EBUSY; goto out1; } - set_bit(SELECTIVE_SUSPEND, &tp->flags); - } else { - netif_device_detach(netdev); + clear_bit(WORK_ENABLE, &tp->flags); + usb_kill_urb(tp->intr_urb); + napi_disable(&tp->napi); + rtl_stop_rx(tp); + tp->rtl_ops.autosuspend_en(tp, true); + napi_enable(&tp->napi); } + set_bit(SELECTIVE_SUSPEND, &tp->flags); + +out1: + return ret; +} + +static int rtl8152_system_suspend(struct r8152 *tp) +{ + struct net_device *netdev = tp->netdev; + int ret = 0; + + netif_device_detach(netdev); + if (netif_running(netdev) && test_bit(WORK_ENABLE, &tp->flags)) { clear_bit(WORK_ENABLE, &tp->flags); usb_kill_urb(tp->intr_urb); napi_disable(&tp->napi); - if (test_bit(SELECTIVE_SUSPEND, &tp->flags)) { - rtl_stop_rx(tp); - tp->rtl_ops.autosuspend_en(tp, true); - } else { - cancel_delayed_work_sync(&tp->schedule); - tp->rtl_ops.down(tp); - } + cancel_delayed_work_sync(&tp->schedule); + tp->rtl_ops.down(tp); napi_enable(&tp->napi); } -out1: + + return ret; +} + +static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message) +{ + struct r8152 *tp = usb_get_intfdata(intf); + int ret; + + mutex_lock(&tp->control); + + if (PMSG_IS_AUTO(message)) + ret = rtl8152_rumtime_suspend(tp); + else + ret = rtl8152_system_suspend(tp); + mutex_unlock(&tp->control); return ret; From 75dc692eda114cb234a46cb11893a9c3ea520934 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 10 Jan 2017 17:04:07 +0800 Subject: [PATCH 175/297] r8152: fix rx issue for runtime suspend Pause the rx and make sure the rx fifo is empty when the autosuspend occurs. If the rx data comes when the driver is canceling the rx urb, the host controller would stop getting the data from the device and continue it after next rx urb is submitted. That is, one continuing data is split into two different urb buffers. That let the driver take the data as a rx descriptor, and unexpected behavior happens. Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index c5e6d88de4e4..be418563cb18 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3582,17 +3582,42 @@ static int rtl8152_rumtime_suspend(struct r8152 *tp) int ret = 0; if (netif_running(netdev) && test_bit(WORK_ENABLE, &tp->flags)) { + u32 rcr = 0; + if (delay_autosuspend(tp)) { ret = -EBUSY; goto out1; } + if (netif_carrier_ok(netdev)) { + u32 ocp_data; + + rcr = ocp_read_dword(tp, MCU_TYPE_PLA, PLA_RCR); + ocp_data = rcr & ~RCR_ACPT_ALL; + ocp_write_dword(tp, MCU_TYPE_PLA, PLA_RCR, ocp_data); + rxdy_gated_en(tp, true); + ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, + PLA_OOB_CTRL); + if (!(ocp_data & RXFIFO_EMPTY)) { + rxdy_gated_en(tp, false); + ocp_write_dword(tp, MCU_TYPE_PLA, PLA_RCR, rcr); + ret = -EBUSY; + goto out1; + } + } + clear_bit(WORK_ENABLE, &tp->flags); usb_kill_urb(tp->intr_urb); - napi_disable(&tp->napi); - rtl_stop_rx(tp); + tp->rtl_ops.autosuspend_en(tp, true); - napi_enable(&tp->napi); + + if (netif_carrier_ok(netdev)) { + napi_disable(&tp->napi); + rtl_stop_rx(tp); + rxdy_gated_en(tp, false); + ocp_write_dword(tp, MCU_TYPE_PLA, PLA_RCR, rcr); + napi_enable(&tp->napi); + } } set_bit(SELECTIVE_SUSPEND, &tp->flags); From 7b6c1b4c0e1e44544aa18161dba6a741c080a7ef Mon Sep 17 00:00:00 2001 From: Bin Liu Date: Tue, 10 Jan 2017 10:46:00 -0600 Subject: [PATCH 176/297] usb: musb: fix runtime PM in debugfs MUSB driver now has runtime PM support, but the debugfs driver misses the PM _get/_put() calls, which could cause MUSB register access failure. Cc: stable@vger.kernel.org # 4.9+ Acked-by: Tony Lindgren Signed-off-by: Bin Liu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/musb_debugfs.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/usb/musb/musb_debugfs.c b/drivers/usb/musb/musb_debugfs.c index 4fef50e5c8c1..dd70c88419d2 100644 --- a/drivers/usb/musb/musb_debugfs.c +++ b/drivers/usb/musb/musb_debugfs.c @@ -114,6 +114,7 @@ static int musb_regdump_show(struct seq_file *s, void *unused) unsigned i; seq_printf(s, "MUSB (M)HDRC Register Dump\n"); + pm_runtime_get_sync(musb->controller); for (i = 0; i < ARRAY_SIZE(musb_regmap); i++) { switch (musb_regmap[i].size) { @@ -132,6 +133,8 @@ static int musb_regdump_show(struct seq_file *s, void *unused) } } + pm_runtime_mark_last_busy(musb->controller); + pm_runtime_put_autosuspend(musb->controller); return 0; } @@ -145,7 +148,10 @@ static int musb_test_mode_show(struct seq_file *s, void *unused) struct musb *musb = s->private; unsigned test; + pm_runtime_get_sync(musb->controller); test = musb_readb(musb->mregs, MUSB_TESTMODE); + pm_runtime_mark_last_busy(musb->controller); + pm_runtime_put_autosuspend(musb->controller); if (test & MUSB_TEST_FORCE_HOST) seq_printf(s, "force host\n"); @@ -194,11 +200,12 @@ static ssize_t musb_test_mode_write(struct file *file, u8 test; char buf[18]; + pm_runtime_get_sync(musb->controller); test = musb_readb(musb->mregs, MUSB_TESTMODE); if (test) { dev_err(musb->controller, "Error: test mode is already set. " "Please do USB Bus Reset to start a new test.\n"); - return count; + goto ret; } memset(buf, 0x00, sizeof(buf)); @@ -234,6 +241,9 @@ static ssize_t musb_test_mode_write(struct file *file, musb_writeb(musb->mregs, MUSB_TESTMODE, test); +ret: + pm_runtime_mark_last_busy(musb->controller); + pm_runtime_put_autosuspend(musb->controller); return count; } @@ -254,8 +264,13 @@ static int musb_softconnect_show(struct seq_file *s, void *unused) switch (musb->xceiv->otg->state) { case OTG_STATE_A_HOST: case OTG_STATE_A_WAIT_BCON: + pm_runtime_get_sync(musb->controller); + reg = musb_readb(musb->mregs, MUSB_DEVCTL); connect = reg & MUSB_DEVCTL_SESSION ? 1 : 0; + + pm_runtime_mark_last_busy(musb->controller); + pm_runtime_put_autosuspend(musb->controller); break; default: connect = -1; @@ -284,6 +299,7 @@ static ssize_t musb_softconnect_write(struct file *file, if (copy_from_user(&buf, ubuf, min_t(size_t, sizeof(buf) - 1, count))) return -EFAULT; + pm_runtime_get_sync(musb->controller); if (!strncmp(buf, "0", 1)) { switch (musb->xceiv->otg->state) { case OTG_STATE_A_HOST: @@ -314,6 +330,8 @@ static ssize_t musb_softconnect_write(struct file *file, } } + pm_runtime_mark_last_busy(musb->controller); + pm_runtime_put_autosuspend(musb->controller); return count; } From 7c9d8d0c41b3e24473ac7648a7fc2d644ccf08ff Mon Sep 17 00:00:00 2001 From: "Bryant G. Ly" Date: Mon, 9 Jan 2017 10:21:20 -0600 Subject: [PATCH 177/297] ibmvscsis: Fix srp_transfer_data fail return code If srp_transfer_data fails within ibmvscsis_write_pending, then the most likely scenario is that the client timed out the op and removed the TCE mapping. Thus it will loop forever retrying the op that is pretty much guaranteed to fail forever. A better return code would be EIO instead of EAGAIN. Cc: stable@vger.kernel.org Reported-by: Steven Royer Tested-by: Steven Royer Signed-off-by: Bryant G. Ly Signed-off-by: Bart Van Assche --- drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index 3d3768aaab4f..8fb5c54c7dd3 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -3585,7 +3585,7 @@ static int ibmvscsis_write_pending(struct se_cmd *se_cmd) 1, 1); if (rc) { pr_err("srp_transfer_data() failed: %d\n", rc); - return -EAGAIN; + return -EIO; } /* * We now tell TCM to add this WRITE CDB directly into the TCM storage From d9584d8ccc06ba98f4fad8ec720de66b6659fd35 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Jan 2017 11:18:01 -0800 Subject: [PATCH 178/297] net: skb_flow_get_be16() can be static Removes following sparse complain : net/core/flow_dissector.c:70:8: warning: symbol 'skb_flow_get_be16' was not declared. Should it be static? Fixes: 972d3876faa8 ("flow dissector: ICMP support") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index fe4e1531976c..1b7673aac59d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -67,8 +67,8 @@ EXPORT_SYMBOL(skb_flow_dissector_init); * The function will try to retrieve a be32 entity at * offset poff */ -__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data, - int hlen) +static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, + void *data, int hlen) { __be16 *u, _u; From dd545b52a3e1efd9f2c6352dbe95ccd0c53461cc Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Tue, 10 Jan 2017 13:29:54 -0700 Subject: [PATCH 179/297] do_direct_IO: Use inode->i_blkbits to compute block count to be cleaned The code currently uses sdio->blkbits to compute the number of blocks to be cleaned. However sdio->blkbits is derived from the logical block size of the underlying block device (Refer to the definition of do_blockdev_direct_IO()). Due to this, generic/299 test would rarely fail when executed on an ext4 filesystem with 64k as the block size and when using a virtio based disk (having 512 byte as the logical block size) inside a kvm guest. This commit fixes the bug by using inode->i_blkbits to compute the number of blocks to be cleaned. Signed-off-by: Chandan Rajendra Reviewed-by: Christoph Hellwig Fixed up by Jeff Moyer to only use/evaluate inode->i_blkbits once, to avoid issues with block size changes with IO in flight. Signed-off-by: Jens Axboe --- fs/direct-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index aeae8c063451..c87bae4376b8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -906,6 +906,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; + const unsigned i_blkbits = blkbits + sdio->blkfactor; int ret = 0; while (sdio->block_in_file < sdio->final_block_in_request) { @@ -949,7 +950,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, clean_bdev_aliases( map_bh->b_bdev, map_bh->b_blocknr, - map_bh->b_size >> blkbits); + map_bh->b_size >> i_blkbits); } if (!sdio->blkfactor) From a14d749fcebe97ddf6af6db3d1f6ece85c9ddcb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 08:56:23 -0700 Subject: [PATCH 180/297] virtio_blk: avoid DMA to stack for the sense buffer Most users of BLOCK_PC requests allocate the sense buffer on the stack, so to avoid DMA to the stack copy them to a field in the heap allocated virtblk_req structure. Without that any attempt at SCSI passthrough I/O, including the SG_IO ioctl from userspace will crash the kernel. Note that this includes running tools like hdparm even when the host does not have SCSI passthrough enabled. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5545a679abd8..3c3b8f601469 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -56,6 +56,7 @@ struct virtblk_req { struct virtio_blk_outhdr out_hdr; struct virtio_scsi_inhdr in_hdr; u8 status; + u8 sense[SCSI_SENSE_BUFFERSIZE]; struct scatterlist sg[]; }; @@ -102,7 +103,8 @@ static int __virtblk_add_req(struct virtqueue *vq, } if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { - sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); + memcpy(vbr->sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); + sg_init_one(&sense, vbr->sense, SCSI_SENSE_BUFFERSIZE); sgs[num_out + num_in++] = &sense; sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); sgs[num_out + num_in++] = &inhdr; From 25b4acfc7de0fc4da3bfea3a316f7282c6fbde81 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 9 Jan 2017 15:20:31 -0500 Subject: [PATCH 181/297] nbd: blk_mq_init_queue returns an error code on failure, not NULL Additionally, don't assign directly to disk->queue, otherwise blk_put_queue (called via put_disk) will choke (panic) on the errno stored there. Bug found by code inspection after Omar found a similar issue in virtio_blk. Compile-tested only. Signed-off-by: Jeff Moyer Reviewed-by: Omar Sandoval Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 38c576f76d36..50a2020b5b72 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1042,6 +1042,7 @@ static int __init nbd_init(void) return -ENOMEM; for (i = 0; i < nbds_max; i++) { + struct request_queue *q; struct gendisk *disk = alloc_disk(1 << part_shift); if (!disk) goto out; @@ -1067,12 +1068,13 @@ static int __init nbd_init(void) * every gendisk to have its very own request_queue struct. * These structs are big so we dynamically allocate them. */ - disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set); - if (!disk->queue) { + q = blk_mq_init_queue(&nbd_dev[i].tag_set); + if (IS_ERR(q)) { blk_mq_free_tag_set(&nbd_dev[i].tag_set); put_disk(disk); goto out; } + disk->queue = q; /* * Tell the block layer that we are not a rotational device From 6bf6b0aa3da84a3d9126919a94c49c0fb7ee2fb3 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 9 Jan 2017 11:44:12 -0800 Subject: [PATCH 182/297] virtio_blk: fix panic in initialization error path If blk_mq_init_queue() returns an error, it gets assigned to vblk->disk->queue. Then, when we call put_disk(), we end up calling blk_put_queue() with the ERR_PTR, causing a bad dereference. Fix it by only assigning to vblk->disk->queue on success. Signed-off-by: Omar Sandoval Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 3c3b8f601469..10332c24f961 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -630,11 +630,12 @@ static int virtblk_probe(struct virtio_device *vdev) if (err) goto out_put_disk; - q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set); + q = blk_mq_init_queue(&vblk->tag_set); if (IS_ERR(q)) { err = -ENOMEM; goto out_free_tags; } + vblk->disk->queue = q; q->queuedata = vblk; From faf3a932fbeb77860226a8323eacb835edc98648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 9 Jan 2017 11:58:34 -0800 Subject: [PATCH 183/297] net: dsa: Ensure validity of dst->ds[0] It is perfectly possible to have non zero indexed switches being present in a DSA switch tree, in such a case, we will be deferencing a NULL pointer while dsa_cpu_port_ethtool_{setup,restore}. Be more defensive and ensure that dst->ds[0] is valid before doing anything with it. Fixes: 0c73c523cf73 ("net: dsa: Initialize CPU port ethtool ops per tree") Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 5fff951a0a49..da3862124545 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -394,9 +394,11 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - err = dsa_cpu_port_ethtool_setup(dst->ds[0]); - if (err) - return err; + if (dst->ds[0]) { + err = dsa_cpu_port_ethtool_setup(dst->ds[0]); + if (err) + return err; + } /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get @@ -433,7 +435,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - dsa_cpu_port_ethtool_restore(dst->ds[0]); + if (dst->ds[0]) + dsa_cpu_port_ethtool_restore(dst->ds[0]); pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; From 3512a1ad56174308a9fd3e10f4b1e3e152e9ec01 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 9 Jan 2017 14:31:58 -0800 Subject: [PATCH 184/297] net: qrtr: Mark 'buf' as little endian Failure to mark this pointer as __le32 causes checkers like sparse to complain: net/qrtr/qrtr.c:274:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:274:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:274:16: got restricted __le32 [usertype] net/qrtr/qrtr.c:275:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:275:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:275:16: got restricted __le32 [usertype] net/qrtr/qrtr.c:276:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:276:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:276:16: got restricted __le32 [usertype] Silence it. Cc: Bjorn Andersson Signed-off-by: Stephen Boyd Acked-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index c985ecbe9bd6..ae5ac175b2be 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -252,7 +252,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, const int pkt_len = 20; struct qrtr_hdr *hdr; struct sk_buff *skb; - u32 *buf; + __le32 *buf; skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); if (!skb) @@ -269,7 +269,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, hdr->dst_node_id = cpu_to_le32(dst_node); hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); - buf = (u32 *)skb_put(skb, pkt_len); + buf = (__le32 *)skb_put(skb, pkt_len); memset(buf, 0, pkt_len); buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); buf[1] = cpu_to_le32(src_node); From 5d722b3024f6762addb8642ffddc9f275b5107ae Mon Sep 17 00:00:00 2001 From: "Anna, Suman" Date: Mon, 9 Jan 2017 21:48:56 -0600 Subject: [PATCH 185/297] net: add the AF_QIPCRTR entries to family name tables Commit bdabad3e363d ("net: Add Qualcomm IPC router") introduced a new address family. Update the family name tables accordingly so that the lockdep initialization can use the proper names for this family. Cc: Courtney Cavin Cc: Bjorn Andersson Signed-off-by: Suman Anna Signed-off-by: David S. Miller --- net/core/sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index f560e0826009..4eca27dc5c94 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , - "sk_lock-AF_MAX" + "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , - "slock-AF_MAX" + "slock-AF_QIPCRTR", "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , - "clock-AF_MAX" + "clock-AF_QIPCRTR", "clock-AF_MAX" }; /* From dc5367bcc556e97555fc94a32cd1aadbebdff47e Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 10 Jan 2017 17:10:34 +0100 Subject: [PATCH 186/297] net/af_iucv: don't use paged skbs for TX on HiperSockets With commit e53743994e21 ("af_iucv: use paged SKBs for big outbound messages"), we transmit paged skbs for both of AF_IUCV's transport modes (IUCV or HiperSockets). The qeth driver for Layer 3 HiperSockets currently doesn't support NETIF_F_SG, so these skbs would just be linearized again by the stack. Avoid that overhead by using paged skbs only for IUCV transport. cc stable, since this also circumvents a significant skb leak when sending large messages (where the skb then needs to be linearized). Signed-off-by: Julian Wiedmann Signed-off-by: Ursula Braun Cc: # v4.8+ Fixes: e53743994e21 ("af_iucv: use paged SKBs for big outbound messages") Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cfb9e5f4e28f..13190b38f22e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1044,7 +1044,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); - size_t headroom, linear; + size_t headroom = 0; + size_t linear; struct sk_buff *skb; struct iucv_message txmsg = {0}; struct cmsghdr *cmsg; @@ -1122,18 +1123,20 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, * this is fine for SOCK_SEQPACKET (unless we want to support * segmented records using the MSG_EOR flag), but * for SOCK_STREAM we might want to improve it in future */ - headroom = (iucv->transport == AF_IUCV_TRANS_HIPER) - ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0; - if (headroom + len < PAGE_SIZE) { + if (iucv->transport == AF_IUCV_TRANS_HIPER) { + headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN; linear = len; } else { - /* In nonlinear "classic" iucv skb, - * reserve space for iucv_array - */ - if (iucv->transport != AF_IUCV_TRANS_HIPER) - headroom += sizeof(struct iucv_array) * - (MAX_SKB_FRAGS + 1); - linear = PAGE_SIZE - headroom; + if (len < PAGE_SIZE) { + linear = len; + } else { + /* In nonlinear "classic" iucv skb, + * reserve space for iucv_array + */ + headroom = sizeof(struct iucv_array) * + (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } } skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear, noblock, &err, 0); From 9f9b74ef896792399dc7b5121896b9c963db80fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 10 Jan 2017 09:41:49 -0800 Subject: [PATCH 187/297] mlx4: Return EOPNOTSUPP instead of ENOTSUPP In commit b45f0674b997 ("mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs"), it changed EOPNOTSUPP to ENOTSUPP by mistake. This patch fixes it. Fixes: b45f0674b997 ("mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs") Signed-off-by: Martin KaFai Lau Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index edbe200ac2fa..4910d9af1933 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2277,7 +2277,7 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) if (priv->tx_ring_num[TX_XDP] && !mlx4_en_check_xdp_mtu(dev, new_mtu)) - return -ENOTSUPP; + return -EOPNOTSUPP; dev->mtu = new_mtu; From 1272ce87fa017ca4cf32920764d879656b7a005a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2017 12:24:01 -0800 Subject: [PATCH 188/297] gro: Enter slow-path if there is no tailroom The GRO path has a fast-path where we avoid calling pskb_may_pull and pskb_expand by directly accessing frag0. However, this should only be done if we have enough tailroom in the skb as otherwise we'll have to expand it later anyway. This patch adds the check by capping frag0_len with the skb tailroom. Fixes: cb18978cbf45 ("gro: Open-code final pskb_may_pull") Reported-by: Slava Shwartsman Signed-off-by: Herbert Xu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 8db5a0b4b520..88d2907ca2cd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4441,7 +4441,8 @@ static void skb_gro_reset_offset(struct sk_buff *skb) pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + NAPI_GRO_CB(skb)->frag0_len = min(skb_frag_size(frag0), + skb->end - skb->tail); } } From 57ea52a865144aedbcd619ee0081155e658b6f7d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2017 12:24:15 -0800 Subject: [PATCH 189/297] gro: Disable frag0 optimization on IPv6 ext headers The GRO fast path caches the frag0 address. This address becomes invalid if frag0 is modified by pskb_may_pull or its variants. So whenever that happens we must disable the frag0 optimization. This is usually done through the combination of gro_header_hard and gro_header_slow, however, the IPv6 extension header path did the pulling directly and would continue to use the GRO fast path incorrectly. This patch fixes it by disabling the fast path when we enter the IPv6 extension header path. Fixes: 78a478d0efd9 ("gro: Inline skb_gro_header and cache frag0 virtual address") Reported-by: Slava Shwartsman Signed-off-by: Herbert Xu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++-- net/ipv6/ip6_offload.c | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 994f7423a74b..9bde9558b596 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2477,14 +2477,19 @@ static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) return NAPI_GRO_CB(skb)->frag0_len < hlen; } +static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; +} + static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { if (!pskb_may_pull(skb, hlen)) return NULL; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + skb_gro_frag0_invalidate(skb); return skb->data + offset; } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 89c59e656f44..fc7b4017ba24 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -191,6 +191,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { __pskb_pull(skb, skb_gro_offset(skb)); + skb_gro_frag0_invalidate(skb); proto = ipv6_gso_pull_exthdrs(skb, proto); skb_gro_pull(skb, -skb_transport_offset(skb)); skb_reset_transport_header(skb); From 5771f6ea8d5ccc0df4d02ae65833413150a1b829 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 10 Jan 2017 16:57:12 -0800 Subject: [PATCH 190/297] MAINTAINERS: remove duplicate bug filling description I have noticed that two different descriptions for B: entries in MAINTAINERS were merged: commit 686564434e88 ("MAINTAINERS: Add bug tracking system location entry type") and 2de2bd95f456 ("MAINTAINERS: add "B:" for URI where to file bugs"). This patch keeps the description from 2de2bd95f456. There has been a discussion [1] about whether this more detailed description is useful and what it exactly implies. I find it more useful and general, and the author of 686564434e88 agreed in the end that either is fine. [1] https://lkml.org/lkml/2016/12/8/71 Link: http://lkml.kernel.org/r/20161219085158.12114-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 35c9cbfe4f2d..0277df881da4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -81,7 +81,6 @@ Descriptions of section entries: Q: Patchwork web based patch tracking system site T: SCM tree type and location. Type is one of: git, hg, quilt, stgit, topgit - B: Bug tracking system location. S: Status, one of the following: Supported: Someone is actually paid to look after this. Maintained: Someone actually looks after it. From 965d004af54088d138f806d04d803fb60d441986 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:15 -0800 Subject: [PATCH 191/297] dax: fix deadlock with DAX 4k holes Currently in DAX if we have three read faults on the same hole address we can end up with the following: Thread 0 Thread 1 Thread 2 -------- -------- -------- dax_iomap_fault grab_mapping_entry lock_slot dax_iomap_fault grab_mapping_entry get_unlocked_mapping_entry dax_iomap_fault grab_mapping_entry get_unlocked_mapping_entry dax_load_hole find_or_create_page ... page_cache_tree_insert dax_wake_mapping_entry_waiter __radix_tree_replace get_page lock_page ... put_locked_mapping_entry unlock_page put_page The crux of the problem is that once we insert a 4k zero page, all locking from then on is done in terms of that 4k zero page and any additional threads sleeping on the empty DAX entry will never be woken. Fix this by waking all sleepers when we replace the DAX radix tree entry with a 4k zero page. This will allow all sleeping threads to successfully transition from locking based on the DAX empty entry to locking on the 4k zero page. With the test case reported by Xiong this happens very regularly in my test setup, with some runs resulting in 9+ threads in this deadlocked state. With this fix I've been able to run that same test dozens of times in a loop without issue. Fixes: ac401cc78242 ("dax: New fault locking") Link: http://lkml.kernel.org/r/1483479365-13607-1-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reported-by: Xiong Zhou Reviewed-by: Jan Kara Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index d0e4d1002059..b772a33ef640 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -138,7 +138,7 @@ static int page_cache_tree_insert(struct address_space *mapping, dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); /* Wakeup waiters for exceptional entry lock */ dax_wake_mapping_entry_waiter(mapping, page->index, p, - false); + true); } } __radix_tree_replace(&mapping->page_tree, node, slot, page, From d670ffd87509b6b136d8ed6f757851a8ebe442b2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 10 Jan 2017 16:57:18 -0800 Subject: [PATCH 192/297] mm/thp/pagecache/collapse: free the pte page table on collapse for thp page cache. With THP page cache, when trying to build a huge page from regular pte pages, we just clear the pmd entry. We will take another fault and at that point we will find the huge page in the radix tree, thereby using the huge page to complete the page fault The second fault path will allocate the needed pgtable_t page for archs like ppc64. So no need to deposit the same in collapse path. Depositing them in the collapse path resulting in a pgtable_t memory leak also giving errors like BUG: non-zero nr_ptes on freeing mm: 3 Fixes: 953c66c2b22a ("mm: THP page cache support for ppc64") Link: http://lkml.kernel.org/r/20161212163428.6780-2-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e32389a97030..b0924a68cc36 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1242,7 +1242,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; unsigned long addr; pmd_t *pmd, _pmd; - bool deposited = false; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1267,26 +1266,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); - /* - * now deposit the pgtable for arch that need it - * otherwise free it. - */ - if (arch_needs_pgtable_deposit()) { - /* - * The deposit should be visibile only after - * collapse is seen by others. - */ - smp_wmb(); - pgtable_trans_huge_deposit(vma->vm_mm, pmd, - pmd_pgtable(_pmd)); - deposited = true; - } spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - if (!deposited) { - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); - } + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } } i_mmap_unlock_write(mapping); From 097963959594c5eccaba42510f7033f703211bda Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:21 -0800 Subject: [PATCH 193/297] mm: add follow_pte_pmd() Patch series "Write protect DAX PMDs in *sync path". Currently dax_mapping_entry_mkclean() fails to clean and write protect the pmd_t of a DAX PMD entry during an *sync operation. This can result in data loss, as detailed in patch 2. This series is based on Dan's "libnvdimm-pending" branch, which is the current home for Jan's "dax: Page invalidation fixes" series. You can find a working tree here: https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean This patch (of 2): Similar to follow_pte(), follow_pte_pmd() allows either a PTE leaf or a huge page PMD leaf to be found and returned. Link: http://lkml.kernel.org/r/1482272586-21177-2-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Dave Hansen Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fe6b4036664a..02793ac64ac6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1212,6 +1212,8 @@ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); +int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 9f2c15cdb32c..b62f3bc63481 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3772,8 +3772,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -static int __follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; pud_t *pud; @@ -3790,11 +3790,20 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - /* We cannot handle huge page PFN maps. Luckily they don't exist. */ - if (pmd_huge(*pmd)) + if (pmd_huge(*pmd)) { + if (!pmdpp) + goto out; + + *ptlp = pmd_lock(mm, pmd); + if (pmd_huge(*pmd)) { + *pmdpp = pmd; + return 0; + } + spin_unlock(*ptlp); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); @@ -3817,10 +3826,24 @@ int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte(mm, address, ptepp, ptlp))); + !(res = __follow_pte_pmd(mm, address, ptepp, NULL, + ptlp))); return res; } +int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, + ptlp))); + return res; +} +EXPORT_SYMBOL(follow_pte_pmd); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping From f729c8c9b24f0540a6e6b86e68f3888ba90ef7e7 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:24 -0800 Subject: [PATCH 194/297] dax: wrprotect pmd_t in dax_mapping_entry_mkclean Currently dax_mapping_entry_mkclean() fails to clean and write protect the pmd_t of a DAX PMD entry during an *sync operation. This can result in data loss in the following sequence: 1) mmap write to DAX PMD, dirtying PMD radix tree entry and making the pmd_t dirty and writeable 2) fsync, flushing out PMD data and cleaning the radix tree entry. We currently fail to mark the pmd_t as clean and write protected. 3) more mmap writes to the PMD. These don't cause any page faults since the pmd_t is dirty and writeable. The radix tree entry remains clean. 4) fsync, which fails to flush the dirty PMD data because the radix tree entry was clean. 5) crash - dirty data that should have been fsync'd as part of 4) could still have been in the processor cache, and is lost. Fix this by marking the pmd_t clean and write protected in dax_mapping_entry_mkclean(), which is called as part of the fsync operation 2). This will cause the writes in step 3) above to generate page faults where we'll re-dirty the PMD radix tree entry, resulting in flushes in the fsync that happens in step 4). Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush") Link: http://lkml.kernel.org/r/1482272586-21177-3-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 51 ++++++++++++++++++++++++++++++++-------------- include/linux/mm.h | 2 -- mm/memory.c | 4 ++-- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5c74f60d0a50..ddcddfeaa03b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -691,8 +691,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pgoff_t index, unsigned long pfn) { struct vm_area_struct *vma; - pte_t *ptep; - pte_t pte; + pte_t pte, *ptep = NULL; + pmd_t *pmdp = NULL; spinlock_t *ptl; bool changed; @@ -707,21 +707,42 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, address = pgoff_address(index, vma); changed = false; - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) continue; - if (pfn != pte_pfn(*ptep)) - goto unlock; - if (!pte_dirty(*ptep) && !pte_write(*ptep)) - goto unlock; - flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; -unlock: - pte_unmap_unlock(ptep, ptl); + if (pmdp) { +#ifdef CONFIG_FS_DAX_PMD + pmd_t pmd; + + if (pfn != pmd_pfn(*pmdp)) + goto unlock_pmd; + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + goto unlock_pmd; + + flush_cache_page(vma, address, pfn); + pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmd_wrprotect(pmd); + pmd = pmd_mkclean(pmd); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + changed = true; +unlock_pmd: + spin_unlock(ptl); +#endif + } else { + if (pfn != pte_pfn(*ptep)) + goto unlock_pte; + if (!pte_dirty(*ptep) && !pte_write(*ptep)) + goto unlock_pte; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock_pte: + pte_unmap_unlock(ptep, ptl); + } if (changed) mmu_notifier_invalidate_page(vma->vm_mm, address); diff --git a/include/linux/mm.h b/include/linux/mm.h index 02793ac64ac6..b84615b0f64c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,8 +1210,6 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); -int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, - spinlock_t **ptlp); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index b62f3bc63481..6bf2b471e30c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3819,8 +3819,8 @@ out: return -EINVAL; } -int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, - spinlock_t **ptlp) +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) { int res; From bb1107f7c6052c863692a41f78c000db792334bf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:27 -0800 Subject: [PATCH 195/297] mm, slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER Andrey Konovalov has reported the following warning triggered by the syzkaller fuzzer. WARNING: CPU: 1 PID: 9935 at mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 9935 Comm: syz-executor0 Not tainted 4.9.0-rc7+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __alloc_pages_slowpath mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 mm/page_alloc.c:3781 alloc_pages_current+0x1c7/0x6b0 mm/mempolicy.c:2072 alloc_pages include/linux/gfp.h:469 kmalloc_order+0x1f/0x70 mm/slab_common.c:1015 kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026 kmalloc_large include/linux/slab.h:422 __kmalloc+0x210/0x2d0 mm/slub.c:3723 kmalloc include/linux/slab.h:495 ep_write_iter+0x167/0xb50 drivers/usb/gadget/legacy/inode.c:664 new_sync_write fs/read_write.c:499 __vfs_write+0x483/0x760 fs/read_write.c:512 vfs_write+0x170/0x4e0 fs/read_write.c:560 SYSC_write fs/read_write.c:607 SyS_write+0xfb/0x230 fs/read_write.c:599 entry_SYSCALL_64_fastpath+0x1f/0xc2 The issue is caused by a lack of size check for the request size in ep_write_iter which should be fixed. It, however, points to another problem, that SLUB defines KMALLOC_MAX_SIZE too large because the its KMALLOC_SHIFT_MAX is (MAX_ORDER + PAGE_SHIFT) which means that the resulting page allocator request might be MAX_ORDER which is too large (see __alloc_pages_slowpath). The same applies to the SLOB allocator which allows even larger sizes. Make sure that they are capped properly and never request more than MAX_ORDER order. Link: http://lkml.kernel.org/r/20161220130659.16461-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Andrey Konovalov Acked-by: Christoph Lameter Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 084b12bad198..4c5363566815 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -226,7 +226,7 @@ static inline const char *__check_heap_object(const void *ptr, * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif @@ -239,7 +239,7 @@ static inline const char *__check_heap_object(const void *ptr, * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT -#define KMALLOC_SHIFT_MAX 30 +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif From 7984c27c2c5cd3298de8afdba3e1bd46f884e934 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:30 -0800 Subject: [PATCH 196/297] bpf: do not use KMALLOC_SHIFT_MAX Commit 01b3f52157ff ("bpf: fix allocation warnings in bpf maps and integer overflow") has added checks for the maximum allocateable size. It (ab)used KMALLOC_SHIFT_MAX for that purpose. While this is not incorrect it is not very clean because we already have KMALLOC_MAX_SIZE for this very reason so let's change both checks to use KMALLOC_MAX_SIZE instead. The original motivation for using KMALLOC_SHIFT_MAX was to work around an incorrect KMALLOC_MAX_SIZE which could lead to allocation warnings but it is no longer needed since "slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER". Link: http://lkml.kernel.org/r/20161220130659.16461-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Christoph Lameter Cc: Alexei Starovoitov Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/hashtab.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index a2ac051c342f..229a5d5df977 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); - if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34debc1a9641..3f2bb58952d8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -274,7 +274,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - + if (htab->map.value_size >= KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct htab_elem)) /* if value_size is bigger, the user space won't be able to * access the elements via bpf syscall. This check also makes From e7ee2c089e94067d68475990bdeed211c8852917 Mon Sep 17 00:00:00 2001 From: Eric Ren Date: Tue, 10 Jan 2017 16:57:33 -0800 Subject: [PATCH 197/297] ocfs2: fix crash caused by stale lvb with fsdlm plugin The crash happens rather often when we reset some cluster nodes while nodes contend fiercely to do truncate and append. The crash backtrace is below: dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover_grant 1 locks on 971 resources dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover 9 generation 5 done: 4 ms ocfs2: Begin replay journal (node 318952601, slot 2) on device (253,18) ocfs2: End replay journal (node 318952601, slot 2) on device (253,18) ocfs2: Beginning quota recovery on device (253,18) for slot 2 ocfs2: Finishing quota recovery on device (253,18) for slot 2 (truncate,30154,1):ocfs2_truncate_file:470 ERROR: bug expression: le64_to_cpu(fe->i_size) != i_size_read(inode) (truncate,30154,1):ocfs2_truncate_file:470 ERROR: Inode 290321, inode i_size = 732 != di i_size = 937, i_flags = 0x1 ------------[ cut here ]------------ kernel BUG at /usr/src/linux/fs/ocfs2/file.c:470! invalid opcode: 0000 [#1] SMP Modules linked in: ocfs2_stack_user(OEN) ocfs2(OEN) ocfs2_nodemanager ocfs2_stackglue(OEN) quota_tree dlm(OEN) configfs fuse sd_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi af_packet iscsi_ibft iscsi_boot_sysfs softdog xfs libcrc32c ppdev parport_pc pcspkr parport joydev virtio_balloon virtio_net i2c_piix4 acpi_cpufreq button processor ext4 crc16 jbd2 mbcache ata_generic cirrus virtio_blk ata_piix drm_kms_helper ahci syscopyarea libahci sysfillrect sysimgblt fb_sys_fops ttm floppy libata drm virtio_pci virtio_ring uhci_hcd virtio ehci_hcd usbcore serio_raw usb_common sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4 Supported: No, Unsupported modules are loaded CPU: 1 PID: 30154 Comm: truncate Tainted: G OE N 4.4.21-69-default #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20151112_172657-sheep25 04/01/2014 task: ffff88004ff6d240 ti: ffff880074e68000 task.ti: ffff880074e68000 RIP: 0010:[] [] ocfs2_truncate_file+0x640/0x6c0 [ocfs2] RSP: 0018:ffff880074e6bd50 EFLAGS: 00010282 RAX: 0000000000000074 RBX: 000000000000029e RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246 RBP: ffff880074e6bda8 R08: 000000003675dc7a R09: ffffffff82013414 R10: 0000000000034c50 R11: 0000000000000000 R12: ffff88003aab3448 R13: 00000000000002dc R14: 0000000000046e11 R15: 0000000000000020 FS: 00007f839f965700(0000) GS:ffff88007fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f839f97e000 CR3: 0000000036723000 CR4: 00000000000006e0 Call Trace: ocfs2_setattr+0x698/0xa90 [ocfs2] notify_change+0x1ae/0x380 do_truncate+0x5e/0x90 do_sys_ftruncate.constprop.11+0x108/0x160 entry_SYSCALL_64_fastpath+0x12/0x6d Code: 24 28 ba d6 01 00 00 48 c7 c6 30 43 62 a0 8b 41 2c 89 44 24 08 48 8b 41 20 48 c7 c1 78 a3 62 a0 48 89 04 24 31 c0 e8 a0 97 f9 ff <0f> 0b 3d 00 fe ff ff 0f 84 ab fd ff ff 83 f8 fc 0f 84 a2 fd ff RIP [] ocfs2_truncate_file+0x640/0x6c0 [ocfs2] It's because ocfs2_inode_lock() get us stale LVB in which the i_size is not equal to the disk i_size. We mistakenly trust the LVB because the underlaying fsdlm dlm_lock() doesn't set lkb_sbflags with DLM_SBF_VALNOTVALID properly for us. But, why? The current code tries to downconvert lock without DLM_LKF_VALBLK flag to tell o2cb don't update RSB's LVB if it's a PR->NULL conversion, even if the lock resource type needs LVB. This is not the right way for fsdlm. The fsdlm plugin behaves different on DLM_LKF_VALBLK, it depends on DLM_LKF_VALBLK to decide if we care about the LVB in the LKB. If DLM_LKF_VALBLK is not set, fsdlm will skip recovering RSB's LVB from this lkb and set the right DLM_SBF_VALNOTVALID appropriately when node failure happens. The following diagram briefly illustrates how this crash happens: RSB1 is inode metadata lock resource with LOCK_TYPE_USES_LVB; The 1st round: Node1 Node2 RSB1: PR RSB1(master): NULL->EX ocfs2_downconvert_lock(PR->NULL, set_lvb==0) ocfs2_dlm_lock(no DLM_LKF_VALBLK) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - dlm_lock(no DLM_LKF_VALBLK) convert_lock(overwrite lkb->lkb_exflags with no DLM_LKF_VALBLK) RSB1: NULL RSB1: EX reset Node2 dlm_recover_rsbs() recover_lvb() /* The LVB is not trustable if the node with EX fails and * no lock >= PR is left. We should set RSB_VALNOTVALID for RSB1. */ if(!(kb_exflags & DLM_LKF_VALBLK)) /* This means we miss the chance to return; * to invalid the LVB here. */ The 2nd round: Node 1 Node2 RSB1(become master from recovery) ocfs2_setattr() ocfs2_inode_lock(NULL->EX) /* dlm_lock() return the stale lvb without setting DLM_SBF_VALNOTVALID */ ocfs2_meta_lvb_is_trustable() return 1 /* so we don't refresh inode from disk */ ocfs2_truncate_file() mlog_bug_on_msg(disk isize != i_size_read(inode)) /* crash! */ The fix is quite straightforward. We keep to set DLM_LKF_VALBLK flag for dlm_lock() if the lock resource type needs LVB and the fsdlm plugin is uesed. Link: http://lkml.kernel.org/r/1481275846-6604-1-git-send-email-zren@suse.com Signed-off-by: Eric Ren Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 10 ++++++++++ fs/ocfs2/stackglue.c | 6 ++++++ fs/ocfs2/stackglue.h | 3 +++ 3 files changed, 19 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 83d576f6a287..77d1632e905d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3303,6 +3303,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, lockres->l_level, new_level); + /* + * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always + * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that + * we can recover correctly from node failure. Otherwise, we may get + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. + */ + if (!ocfs2_is_o2cb_active() && + lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lvb = 1; + if (lvb) dlm_flags |= DLM_LKF_VALBLK; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 52c07346bea3..820359096c7a 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; +inline int ocfs2_is_o2cb_active(void) +{ + return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); +} +EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); + static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index f2dce10fae54..e3036e1790e8 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ +int ocfs2_is_o2cb_active(void); + extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ From f931ab479dd24cf7a2c6e2df19778406892591fb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 10 Jan 2017 16:57:36 -0800 Subject: [PATCH 198/297] mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done} Both arch_add_memory() and arch_remove_memory() expect a single threaded context. For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does not hold any locks over this check and branch: if (pgd_val(*pgd)) { pud = (pud_t *)pgd_page_vaddr(*pgd); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); continue; } pud = alloc_low_page(); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); The result is that two threads calling devm_memremap_pages() simultaneously can end up colliding on pgd initialization. This leads to crash signatures like the following where the loser of the race initializes the wrong pgd entry: BUG: unable to handle kernel paging request at ffff888ebfff0000 IP: memcpy_erms+0x6/0x10 PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */ Oops: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13 task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000 RIP: memcpy_erms+0x6/0x10 [..] Call Trace: ? pmem_do_bvec+0x205/0x370 [nd_pmem] ? blk_queue_enter+0x3a/0x280 pmem_rw_page+0x38/0x80 [nd_pmem] bdev_read_page+0x84/0xb0 Hold the standard memory hotplug mutex over calls to arch_{add,remove}_memory(). Fixes: 41e94a851304 ("add devm_memremap_pages") Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/memremap.c b/kernel/memremap.c index b501e390bb34..9ecedc28b928 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -246,7 +246,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); + mem_hotplug_begin(); arch_remove_memory(align_start, align_size); + mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, @@ -358,7 +360,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_pfn_remap; + mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + mem_hotplug_done(); if (error) goto err_add_memory; From 2df26639e708a88dcc22171949da638a9998f3bc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:39 -0800 Subject: [PATCH 199/297] mm: fix remote numa hits statistics Jia He has noticed that commit b9f00e147f27 ("mm, page_alloc: reduce branches in zone_statistics") has an unintentional side effect that remote node allocation requests are accounted as NUMA_MISS rathat than NUMA_HIT and NUMA_OTHER if such a request doesn't use __GFP_OTHER_NODE. There are many of these potentially because the flag is used very rarely while we have many users of __alloc_pages_node. Fix this by simply ignoring __GFP_OTHER_NODE (it can be removed in a follow up patch) and treat all allocations that were satisfied from the preferred zone's node as NUMA_HITS because this is the same node we requested the allocation from in most cases. If this is not the local node then we just account it as NUMA_OTHER rather than NUMA_LOCAL. One downsize would be that an allocation request for a node which is outside of the mempolicy nodemask would be reported as a hit which is a bit weird but that was the case before b9f00e147f27 already. Fixes: b9f00e147f27 ("mm, page_alloc: reduce branches in zone_statistics") Link: http://lkml.kernel.org/r/20170102153057.9451-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Jia He Reviewed-by: Vlastimil Babka # with cbmc[1] superpowers Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Taku Izumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c6d5f64feca..cba2a64792e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2583,30 +2583,23 @@ int __isolate_free_page(struct page *page, unsigned int order) * Update NUMA hit/miss statistics * * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. */ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) { #ifdef CONFIG_NUMA - int local_nid = numa_node_id(); enum zone_stat_item local_stat = NUMA_LOCAL; - if (unlikely(flags & __GFP_OTHER_NODE)) { + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; - local_nid = preferred_zone->node; - } - if (z->node == local_nid) { + if (z->node == preferred_zone->node) __inc_zone_state(z, NUMA_HIT); - __inc_zone_state(z, local_stat); - } else { + else { __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } + __inc_zone_state(z, local_stat); #endif } From 41b6167e8f746b475668f1da78599fc4284f18db Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:42 -0800 Subject: [PATCH 200/297] mm: get rid of __GFP_OTHER_NODE The flag was introduced by commit 78afd5612deb ("mm: add __GFP_OTHER_NODE flag") to allow proper accounting of remote node allocations done by kernel daemons on behalf of a process - e.g. khugepaged. After "mm: fix remote numa hits statistics" we do not need and actually use the flag so we can safely remove it because all allocations which are satisfied from their "home" node are accounted properly. [mhocko@suse.com: fix build] Link: http://lkml.kernel.org/r/20170106122225.GK5556@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170102153057.9451-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Taku Izumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 13 +++---------- include/trace/events/mmflags.h | 3 +-- mm/huge_memory.c | 3 +-- mm/khugepaged.c | 5 ++--- mm/page_alloc.c | 5 ++--- tools/perf/builtin-kmem.c | 1 - 6 files changed, 9 insertions(+), 21 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4175dca4ac39..7806a8f80abc 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -38,9 +38,8 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u -#define ___GFP_OTHER_NODE 0x800000u -#define ___GFP_WRITE 0x1000000u -#define ___GFP_KSWAPD_RECLAIM 0x2000000u +#define ___GFP_WRITE 0x800000u +#define ___GFP_KSWAPD_RECLAIM 0x1000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -172,11 +171,6 @@ struct vm_area_struct; * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of * distinguishing in the source between false positives and allocations that * cannot be supported (e.g. page tables). - * - * __GFP_OTHER_NODE is for allocations that are on a remote node but that - * should not be accounted for as a remote allocation in vmstat. A - * typical user would be khugepaged collapsing a huge page on a remote - * node. */ #define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) @@ -184,10 +178,9 @@ struct vm_area_struct; #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 26 +#define __GFP_BITS_SHIFT 25 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 9e687ca9a307..15bf875d0e4a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -47,8 +47,7 @@ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ - {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"}\ #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 10eedbf14421..72339a646fb1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -919,8 +919,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | - __GFP_OTHER_NODE, vma, + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b0924a68cc36..77ae3239c3de 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -943,7 +943,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; /* * Before allocating the hugepage, release the mmap_sem read lock. @@ -1309,8 +1309,7 @@ static void collapse_shmem(struct mm_struct *mm, VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | - __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cba2a64792e6..872caae544ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2584,8 +2584,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, - gfp_t flags) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA enum zone_stat_item local_stat = NUMA_LOCAL; @@ -2667,7 +2666,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, } __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, gfp_flags); + zone_statistics(preferred_zone, zone); local_irq_restore(flags); VM_BUG_ON_PAGE(bad_range(zone, page), page); diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 35a02f8e5a4a..915869e00d86 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -655,7 +655,6 @@ static const struct { { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, { "__GFP_KSWAPD_RECLAIM", "KR" }, - { "__GFP_OTHER_NODE", "ON" }, }; static size_t max_gfp_len; From da0510c47519fe0999cffe316e1d370e29f952be Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 10 Jan 2017 16:57:45 -0800 Subject: [PATCH 201/297] lib/Kconfig.debug: fix frv build failure The build of frv allmodconfig was failing with the errors like: /tmp/cc0JSPc3.s: Assembler messages: /tmp/cc0JSPc3.s:1839: Error: symbol `.LSLT0' is already defined /tmp/cc0JSPc3.s:1842: Error: symbol `.LASLTP0' is already defined /tmp/cc0JSPc3.s:1969: Error: symbol `.LELTP0' is already defined /tmp/cc0JSPc3.s:1970: Error: symbol `.LELT0' is already defined Commit 866ced950bcd ("kbuild: Support split debug info v4") introduced splitting the debug info and keeping that in a separate file. Somehow, the frv-linux gcc did not like that and I am guessing that instead of splitting it started copying. The first report about this is at: https://lists.01.org/pipermail/kbuild-all/2015-July/010527.html. I will try and see if this can work with frv and if still fails I will open a bug report with gcc. But meanwhile this is the easiest option to solve build failure of frv. Fixes: 866ced950bcd ("kbuild: Support split debug info v4") Link: http://lkml.kernel.org/r/1482062348-5352-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Reported-by: Fengguang Wu Cc: Andi Kleen Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b06848a104e6..eb9e9a7870fa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -164,7 +164,7 @@ config DEBUG_INFO_REDUCED config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" - depends on DEBUG_INFO + depends on DEBUG_INFO && !FRV help Generate debug info into separate .dwo files. This significantly reduces the build directory size for builds with DEBUG_INFO, From c626bc46edb0fec289adfc86b02e07d34127ef6c Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 10 Jan 2017 16:57:48 -0800 Subject: [PATCH 202/297] ipc/sem.c: fix incorrect sem_lock pairing Based on the syzcaller test case from dvyukov: https://gist.githubusercontent.com/dvyukov/d0e5efefe4d7d6daed829f5c3ca26a40/raw/08d0a261fe3c987bed04fbf267e08ba04bd533ea/gistfile1.txt The slow (i.e.: failure to acquire) syscall exit from semtimedop() incorrectly assumed that the the same lock is acquired as it was at the initial syscall entry. This is wrong: - thread A: single semop semop(), sleeps - thread B: multi semop semop(), sleeps - thread A: woken up by signal/timeout With this sequence, the initial sem_lock() call locks the per-semaphore spinlock, and it is unlocked with sem_unlock(). The call at the syscall return locks the global spinlock. Because locknum is not updated, the following sem_unlock() call unlocks the per-semaphore spinlock, which is actually not locked. The fix is trivial: Use the return value from sem_lock. Fixes: 370b262c896e ("ipc/sem: avoid idr tree lookup for interrupted semop") Link: http://lkml.kernel.org/r/1482215645-22328-1-git-send-email-manfred@colorfullife.com Signed-off-by: Manfred Spraul Reported-by: Dmitry Vyukov Reported-by: Johanna Abrahamsson Tested-by: Johanna Abrahamsson Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/sem.c b/ipc/sem.c index e08b94851922..3ec5742b5640 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1977,7 +1977,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, } rcu_read_lock(); - sem_lock(sma, sops, nsops); + locknum = sem_lock(sma, sops, nsops); if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock_free; From 20f664aabeb88d582b623a625f83b0454fa34f07 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:57:51 -0800 Subject: [PATCH 203/297] mm: pmd dirty emulation in page fault handler Andreas reported [1] made a test in jemalloc hang in THP mode in arm64: http://lkml.kernel.org/r/mvmmvfy37g1.fsf@hawking.suse.de The problem is currently page fault handler doesn't supports dirty bit emulation of pmd for non-HW dirty-bit architecture so that application stucks until VM marked the pmd dirty. How the emulation work depends on the architecture. In case of arm64, when it set up pte firstly, it sets pte PTE_RDONLY to get a chance to mark the pte dirty via triggering page fault when store access happens. Once the page fault occurs, VM marks the pmd dirty and arch code for setting pmd will clear PTE_RDONLY for application to proceed. IOW, if VM doesn't mark the pmd dirty, application hangs forever by repeated fault(i.e., store op but the pmd is PTE_RDONLY). This patch enables pmd dirty-bit emulation for those architectures. [1] b8d3c4c3009d, mm/huge_memory.c: don't split THP page when MADV_FREE syscall is called Fixes: b8d3c4c3009d ("mm/huge_memory.c: don't split THP page when MADV_FREE syscall is called") Link: http://lkml.kernel.org/r/1482506098-6149-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: Andreas Schwab Tested-by: Andreas Schwab Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Jason Evans Cc: Will Deacon Cc: Catalin Marinas Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 72339a646fb1..9a6bd6c8d55a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -883,15 +883,17 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); + if (write) + entry = pmd_mkdirty(entry); haddr = vmf->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, - vmf->flags & FAULT_FLAG_WRITE)) + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write)) update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: From 2d39b3cd34e6d323720d4c61bd714f5ae202c022 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 10 Jan 2017 16:57:54 -0800 Subject: [PATCH 204/297] signal: protect SIGNAL_UNKILLABLE from unintentional clearing. Since commit 00cd5c37afd5 ("ptrace: permit ptracing of /sbin/init") we can now trace init processes. init is initially protected with SIGNAL_UNKILLABLE which will prevent fatal signals such as SIGSTOP, but there are a number of paths during tracing where SIGNAL_UNKILLABLE can be implicitly cleared. This can result in init becoming stoppable/killable after tracing. For example, running: while true; do kill -STOP 1; done & strace -p 1 and then stopping strace and the kill loop will result in init being left in state TASK_STOPPED. Sending SIGCONT to init will resume it, but init will now respond to future SIGSTOP signals rather than ignoring them. Make sure that when setting SIGNAL_STOP_CONTINUED/SIGNAL_STOP_STOPPED that we don't clear SIGNAL_UNKILLABLE. Link: http://lkml.kernel.org/r/20170104122017.25047-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Cc: Alexander Viro Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 10 ++++++++++ kernel/signal.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d1905245c7a..ad3ec9ec61f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,6 +854,16 @@ struct signal_struct { #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ +#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ + SIGNAL_STOP_CONTINUED) + +static inline void signal_set_stop_flags(struct signal_struct *sig, + unsigned int flags) +{ + WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; +} + /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { diff --git a/kernel/signal.c b/kernel/signal.c index ff046b73ff2d..3603d93a1968 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task) * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { - sig->flags = SIGNAL_STOP_STOPPED; + signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; @@ -843,7 +843,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ - signal->flags = why | SIGNAL_STOP_CONTINUED; + signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } From 9ebf73b275f06b114586af27cda3fd72e149d5ba Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 10 Jan 2017 16:57:57 -0800 Subject: [PATCH 205/297] mailmap: add codeaurora.org names for nameless email commits Some codeaurora.org emails have crept in but the names don't exist for them. Add the names for the emails so git can match everyone up. Link: http://lkml.kernel.org/r/20170104194611.25933-1-sboyd@codeaurora.org Signed-off-by: Stephen Boyd Cc: Sarangdhar Joshi Cc: Subash Abhinov Kasiviswanathan Cc: Subhash Jadavani Cc: Thomas Pedersen Cc: Andy Gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.mailmap b/.mailmap index 02d261407683..67dc22ffc9a8 100644 --- a/.mailmap +++ b/.mailmap @@ -137,6 +137,7 @@ Ricardo Ribalda Delgado Rudolf Marek Rui Saraiva Sachin P Sant +Sarangdhar Joshi Sam Ravnborg Santosh Shilimkar Santosh Shilimkar @@ -150,10 +151,13 @@ Shuah Khan Simon Kelley Stéphane Witzmann Stephen Hemminger +Subash Abhinov Kasiviswanathan +Subhash Jadavani Sudeep Holla Sudeep KarkadaNagesha Sumit Semwal Tejun Heo Thomas Graf +Thomas Pedersen Tony Luck Tsuneo Yoshioka Uwe Kleine-König From f073bdc51771f5a5c7a8d1191bfc3ae371d44de7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 10 Jan 2017 16:58:00 -0800 Subject: [PATCH 206/297] mm: don't dereference struct page fields of invalid pages The VM_BUG_ON() check in move_freepages() checks whether the node id of a page matches the node id of its zone. However, it does this before having checked whether the struct page pointer refers to a valid struct page to begin with. This is guaranteed in most cases, but may not be the case if CONFIG_HOLES_IN_ZONE=y. So reorder the VM_BUG_ON() with the pfn_valid_within() check. Link: http://lkml.kernel.org/r/1481706707-6211-2-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Cc: Catalin Marinas Cc: Hanjun Guo Cc: Yisheng Xie Cc: Robert Richter Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 872caae544ef..74afdb4177cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1864,14 +1864,14 @@ int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); - if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + if (!PageBuddy(page)) { page++; continue; From b4536f0c829c8586544c94735c343f9b5070bd01 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:58:04 -0800 Subject: [PATCH 207/297] mm, memcg: fix the active list aging for lowmem requests when memcg is enabled Nils Holland and Klaus Ethgen have reported unexpected OOM killer invocations with 32b kernel starting with 4.8 kernels kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0 kworker/u4:5 cpuset=/ mems_allowed=0 CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2 [...] Mem-Info: active_anon:58685 inactive_anon:90 isolated_anon:0 active_file:274324 inactive_file:281962 isolated_file:0 unevictable:0 dirty:649 writeback:0 unstable:0 slab_reclaimable:40662 slab_unreclaimable:17754 mapped:7382 shmem:202 pagetables:351 bounce:0 free:206736 free_pcp:332 free_cma:0 Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 813 3474 3474 Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB lowmem_reserve[]: 0 0 21292 21292 HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB the oom killer is clearly pre-mature because there there is still a lot of page cache in the zone Normal which should satisfy this lowmem request. Further debugging has shown that the reclaim cannot make any forward progress because the page cache is hidden in the active list which doesn't get rotated because inactive_list_is_low is not memcg aware. The code simply subtracts per-zone highmem counters from the respective memcg's lru sizes which doesn't make any sense. We can simply end up always seeing the resulting active and inactive counts 0 and return false. This issue is not limited to 32b kernels but in practice the effect on systems without CONFIG_HIGHMEM would be much harder to notice because we do not invoke the OOM killer for allocations requests targeting < ZONE_NORMAL. Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node and subtract per-memcg highmem counts when memcg is enabled. Introduce helper lruvec_zone_lru_size which redirects to either zone counters or mem_cgroup_get_zone_lru_size when appropriate. We are losing empty LRU but non-zero lru size detection introduced by ca707239e8a7 ("mm: update_lru_size warn and reset bad lru_size") because of the inherent zone vs. node discrepancy. Fixes: f8d1a31163fc ("mm: consider whether to decivate based on eligible zones inactive ratio") Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Nils Holland Tested-by: Nils Holland Reported-by: Klaus Ethgen Acked-by: Minchan Kim Acked-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 26 +++++++++++++++++++++++--- include/linux/mm_inline.h | 2 +- mm/memcontrol.c | 18 ++++++++---------- mm/vmscan.c | 27 +++++++++++++++++---------- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 61d20c17f3b7..254698856b8f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter { */ struct mem_cgroup_per_node { struct lruvec lruvec; - unsigned long lru_size[NR_LRU_LISTS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages); + int zid, int nr_pages); unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); @@ -441,9 +441,23 @@ static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_node *mz; + unsigned long nr_pages = 0; + int zid; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return mz->lru_size[lru]; + for (zid = 0; zid < MAX_NR_ZONES; zid++) + nr_pages += mz->lru_zone_size[zid][lru]; + return nr_pages; +} + +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) +{ + struct mem_cgroup_per_node *mz; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->lru_zone_size[zone_idx][lru]; } void mem_cgroup_handle_over_high(void); @@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { return 0; } +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) +{ + return 0; +} static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 71613e8a720f..41d376e7116d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -39,7 +39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4048897e7b01..a63a8f832664 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); unsigned long nr = 0; - struct mem_cgroup_per_node *mz; enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); @@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - mz = mem_cgroup_nodeinfo(memcg, nid); - nr += mz->lru_size[lru]; + nr += mem_cgroup_get_lru_size(lruvec, lru); } return nr; } @@ -1002,6 +1001,7 @@ out: * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on + * @zid: zone id of the accounted pages * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added @@ -1009,27 +1009,25 @@ out: * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages) + int zid, int nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; - bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - lru_size = mz->lru_size + lru; - empty = list_empty(lruvec->lists + lru); + lru_size = &mz->lru_zone_size[zid][lru]; if (nr_pages < 0) *lru_size += nr_pages; size = *lru_size; - if (WARN_ONCE(size < 0 || empty != !size, - "%s(%p, %d, %d): lru_size %ld but %sempty\n", - __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + if (WARN_ONCE(size < 0, + "%s(%p, %d, %d): lru_size %ld\n", + __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6aa5b01d3e75..532a2a750952 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } +unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) +{ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + + return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], + NR_ZONE_LRU_BASE + lru); +} + /* * Add a shrinker callback to be called from the vm. */ @@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * be complete before mem_cgroup_update_lru_size due to a santity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, - enum lru_list lru, unsigned long *nr_zone_taken, - unsigned long nr_taken) + enum lru_list lru, unsigned long *nr_zone_taken) { int zid; @@ -1392,11 +1401,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, continue; __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); +#endif } -#ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); -#endif } /* @@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); - update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); + update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!managed_zone(zone)) continue; - inactive_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE)); - active_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); + inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); + active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); inactive -= min(inactive, inactive_zone); active -= min(active, active_zone); From 8c2dd3e4a4bae78093c4a5cee6494877651be3c9 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:06 -0800 Subject: [PATCH 208/297] mm: rename __alloc_page_frag to page_frag_alloc and __free_page_frag to page_frag_free Patch series "Page fragment updates", v4. This patch series takes care of a few cleanups for the page fragments API. First we do some renames so that things are much more consistent. First we move the page_frag_ portion of the name to the front of the functions names. Secondly we split out the cache specific functions from the other page fragment functions by adding the word "cache" to the name. Finally I added a bit of documentation that will hopefully help to explain some of this. I plan to revisit this later as we get things more ironed out in the near future with the changes planned for the DMA setup to support eXpress Data Path. This patch (of 3): This patch renames the page frag functions to be more consistent with other APIs. Specifically we place the name page_frag first in the name and then have either an alloc or free call name that we append as the suffix. This makes it a bit clearer in terms of naming. In addition we drop the leading double underscores since we are technically no longer a backing interface and instead the front end that is called from the networking APIs. Link: http://lkml.kernel.org/r/20170104023854.13451.67390.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 +++--- include/linux/skbuff.h | 2 +- mm/page_alloc.c | 10 +++++----- net/core/skbuff.c | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7806a8f80abc..ed77a86fbbb0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -501,9 +501,9 @@ extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; extern void __page_frag_drain(struct page *page, unsigned int order, unsigned int count); -extern void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask); -extern void __free_page_frag(void *addr); +extern void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask); +extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b53c0cfd417e..a410715bbef8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2480,7 +2480,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, static inline void skb_free_frag(void *addr) { - __free_page_frag(addr); + page_frag_free(addr); } void *napi_alloc_frag(unsigned int fragsz); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 74afdb4177cb..097893ffe194 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3931,8 +3931,8 @@ void __page_frag_drain(struct page *page, unsigned int order, } EXPORT_SYMBOL(__page_frag_drain); -void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -3983,19 +3983,19 @@ refill: return nc->va + offset; } -EXPORT_SYMBOL(__alloc_page_frag); +EXPORT_SYMBOL(page_frag_alloc); /* * Frees a page fragment allocated out of either a compound or order 0 page. */ -void __free_page_frag(void *addr) +void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) __free_pages_ok(page, compound_order(page)); } -EXPORT_SYMBOL(__free_page_frag); +EXPORT_SYMBOL(page_frag_free); static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a03730fbc1a..734c71468b01 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -369,7 +369,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, fragsz, gfp_mask); + data = page_frag_alloc(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -391,7 +391,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(&nc->page, fragsz, gfp_mask); + return page_frag_alloc(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -441,7 +441,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, len, gfp_mask); + data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; local_irq_restore(flags); @@ -505,7 +505,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(&nc->page, len, gfp_mask); + data = page_frag_alloc(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; From 2976db8018532b624c4123ae662fbc0814877abf Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:09 -0800 Subject: [PATCH 209/297] mm: rename __page_frag functions to __page_frag_cache, drop order from drain This patch does two things. First it goes through and renames the __page_frag prefixed functions to __page_frag_cache so that we can be clear that we are draining or refilling the cache, not the frags themselves. Second we drop the order parameter from __page_frag_cache_drain since we don't actually need to pass it since all fragments are either order 0 or must be a compound page. Link: http://lkml.kernel.org/r/20170104023954.13451.5678.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/intel/igb/igb_main.c | 6 +++--- include/linux/gfp.h | 3 +-- mm/page_alloc.c | 13 +++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a761001308dc..1515abaa5ac9 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3962,8 +3962,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) PAGE_SIZE, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); - __page_frag_drain(buffer_info->page, 0, - buffer_info->pagecnt_bias); + __page_frag_cache_drain(buffer_info->page, + buffer_info->pagecnt_bias); buffer_info->page = NULL; } @@ -6991,7 +6991,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); - __page_frag_drain(page, 0, rx_buffer->pagecnt_bias); + __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); } /* clear contents of rx_buffer */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ed77a86fbbb0..0fe0b6295ab5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -499,8 +499,7 @@ extern void free_hot_cold_page(struct page *page, bool cold); extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; -extern void __page_frag_drain(struct page *page, unsigned int order, - unsigned int count); +extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void page_frag_free(void *addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 097893ffe194..d604d2596b7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3896,8 +3896,8 @@ EXPORT_SYMBOL(free_pages); * drivers to provide a backing region of memory for use as either an * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. */ -static struct page *__page_frag_refill(struct page_frag_cache *nc, - gfp_t gfp_mask) +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) { struct page *page = NULL; gfp_t gfp = gfp_mask; @@ -3917,19 +3917,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, return page; } -void __page_frag_drain(struct page *page, unsigned int order, - unsigned int count) +void __page_frag_cache_drain(struct page *page, unsigned int count) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) { + unsigned int order = compound_order(page); + if (order == 0) free_hot_cold_page(page, false); else __free_pages_ok(page, order); } } -EXPORT_SYMBOL(__page_frag_drain); +EXPORT_SYMBOL(__page_frag_cache_drain); void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) @@ -3940,7 +3941,7 @@ void *page_frag_alloc(struct page_frag_cache *nc, if (unlikely(!nc->va)) { refill: - page = __page_frag_refill(nc, gfp_mask); + page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; From 4d09d0f45dd5d78b3a301c238272211d1ea7d9e6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:12 -0800 Subject: [PATCH 210/297] mm: add documentation for page fragment APIs This is a first pass at trying to add documentation for the page_frag APIs. They may still change over time but for now I thought I would try to get these documented so that as more network drivers and stack calls make use of them we have one central spot to document how they are meant to be used. Link: http://lkml.kernel.org/r/20170104024157.13451.6758.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_frags | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 Documentation/vm/page_frags diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags new file mode 100644 index 000000000000..a6714565dbf9 --- /dev/null +++ b/Documentation/vm/page_frags @@ -0,0 +1,42 @@ +Page fragments +-------------- + +A page fragment is an arbitrary-length arbitrary-offset area of memory +which resides within a 0 or higher order compound page. Multiple +fragments within that page are individually refcounted, in the page's +reference counter. + +The page_frag functions, page_frag_alloc and page_frag_free, provide a +simple allocation framework for page fragments. This is used by the +network stack and network device drivers to provide a backing region of +memory for use as either an sk_buff->head, or to be used in the "frags" +portion of skb_shared_info. + +In order to make use of the page fragment APIs a backing page fragment +cache is needed. This provides a central point for the fragment allocation +and tracks allows multiple calls to make use of a cached page. The +advantage to doing this is that multiple calls to get_page can be avoided +which can be expensive at allocation time. However due to the nature of +this caching it is required that any calls to the cache be protected by +either a per-cpu limitation, or a per-cpu limitation and forcing interrupts +to be disabled when executing the fragment allocation. + +The network stack uses two separate caches per CPU to handle fragment +allocation. The netdev_alloc_cache is used by callers making use of the +__netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is +used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The +main difference between these two calls is the context in which they may be +called. The "netdev" prefixed functions are usable in any context as these +functions will disable interrupts, while the "napi" prefixed functions are +only usable within the softirq context. + +Many network device drivers use a similar methodology for allocating page +fragments, but the page fragments are cached at the ring or descriptor +level. In order to enable these cases it is necessary to provide a generic +way of tearing down a page cache. For this reason __page_frag_cache_drain +was implemented. It allows for freeing multiple references from a single +page via a single call. The advantage to doing this is that it allows for +cleaning up the multiple references that were added to a page in order to +avoid calling get_page per allocation. + +Alexander Duyck, Nov 29, 2016. From f05714293a591038304ddae7cb0dd747bb3786cc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:15 -0800 Subject: [PATCH 211/297] mm: support anonymous stable page During developemnt for zram-swap asynchronous writeback, I found strange corruption of compressed page, resulting in: Modules linked in: zram(E) CPU: 3 PID: 1520 Comm: zramd-1 Tainted: G E 4.8.0-mm1-00320-ge0d4894c9c38-dirty #3274 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff88007620b840 task.stack: ffff880078090000 RIP: set_freeobj.part.43+0x1c/0x1f RSP: 0018:ffff880078093ca8 EFLAGS: 00010246 RAX: 0000000000000018 RBX: ffff880076798d88 RCX: ffffffff81c408c8 RDX: 0000000000000018 RSI: 0000000000000000 RDI: 0000000000000246 RBP: ffff880078093cb0 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88005bc43030 R11: 0000000000001df3 R12: ffff880076798d88 R13: 000000000005bc43 R14: ffff88007819d1b8 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88007e380000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc934048f20 CR3: 0000000077b01000 CR4: 00000000000406e0 Call Trace: obj_malloc+0x22b/0x260 zs_malloc+0x1e4/0x580 zram_bvec_rw+0x4cd/0x830 [zram] page_requests_rw+0x9c/0x130 [zram] zram_thread+0xe6/0x173 [zram] kthread+0xca/0xe0 ret_from_fork+0x25/0x30 With investigation, it reveals currently stable page doesn't support anonymous page. IOW, reuse_swap_page can reuse the page without waiting writeback completion so it can overwrite page zram is compressing. Unfortunately, zram has used per-cpu stream feature from v4.7. It aims for increasing cache hit ratio of scratch buffer for compressing. Downside of that approach is that zram should ask memory space for compressed page in per-cpu context which requires stricted gfp flag which could be failed. If so, it retries to allocate memory space out of per-cpu context so it could get memory this time and compress the data again, copies it to the memory space. In this scenario, zram assumes the data should never be changed but it is not true unless stable page supports. So, If the data is changed under us, zram can make buffer overrun because second compression size could be bigger than one we got in previous trial and blindly, copy bigger size object to smaller buffer which is buffer overrun. The overrun breaks zsmalloc free object chaining so system goes crash like above. I think below is same problem. https://bugzilla.suse.com/show_bug.cgi?id=997574 Unfortunately, reuse_swap_page should be atomic so that we cannot wait on writeback in there so the approach in this patch is simply return false if we found it needs stable page. Although it increases memory footprint temporarily, it happens rarely and it should be reclaimed easily althoug it happened. Also, It would be better than waiting of IO completion, which is critial path for application latency. Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/20161120233015.GA14113@bbox Link: http://lkml.kernel.org/r/1482366980-3782-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Cc: Sergey Senozhatsky Cc: Darrick J. Wong Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- mm/swapfile.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 09f4be179ff3..7f47b7098b1b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -150,8 +150,9 @@ enum { SWP_FILE = (1 << 7), /* set after swap_activate success */ SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ /* add others here before... */ - SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/swapfile.c b/mm/swapfile.c index 1c6e0321205d..4761701d1721 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -943,11 +943,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); - if (count == 1 && !PageWriteback(page)) { + if (count != 1) + goto out; + if (!PageWriteback(page)) { delete_from_swap_cache(page); SetPageDirty(page); + } else { + swp_entry_t entry; + struct swap_info_struct *p; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p->flags & SWP_STABLE_WRITES) { + spin_unlock(&p->lock); + return false; + } + spin_unlock(&p->lock); } } +out: return count <= 1; } @@ -2448,6 +2462,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + + if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) + p->flags |= SWP_STABLE_WRITES; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; From e7ccfc4ccb703e0f033bd4617580039898e912dd Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:18 -0800 Subject: [PATCH 212/297] zram: revalidate disk under init_lock Commit b4c5c60920e3 ("zram: avoid lockdep splat by revalidate_disk") moved revalidate_disk call out of init_lock to avoid lockdep false-positive splat. However, commit 08eee69fcf6b ("zram: remove init_lock in zram_make_request") removed init_lock in IO path so there is no worry about lockdep splat. So, let's restore it. This patch is needed to set BDI_CAP_STABLE_WRITES atomically in next patch. Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/1482366980-3782-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: Hugh Dickins Cc: Darrick J. Wong Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 15f58ab44d0b..195376b4472b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1095,14 +1095,8 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); - - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. - */ revalidate_disk(zram->disk); + up_write(&zram->init_lock); return len; From b09ab054b69b07077bd3292f67e777861ac796e5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:21 -0800 Subject: [PATCH 213/297] zram: support BDI_CAP_STABLE_WRITES zram has used per-cpu stream feature from v4.7. It aims for increasing cache hit ratio of scratch buffer for compressing. Downside of that approach is that zram should ask memory space for compressed page in per-cpu context which requires stricted gfp flag which could be failed. If so, it retries to allocate memory space out of per-cpu context so it could get memory this time and compress the data again, copies it to the memory space. In this scenario, zram assumes the data should never be changed but it is not true without stable page support. So, If the data is changed under us, zram can make buffer overrun so that zsmalloc free object chain is broken so system goes crash like below https://bugzilla.suse.com/show_bug.cgi?id=997574 This patch adds BDI_CAP_STABLE_WRITES to zram for declaring "I am block device needing *stable write*". Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/1482366980-3782-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: Hugh Dickins Cc: Darrick J. Wong Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 195376b4472b..e5ab7d9e8c45 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,14 @@ static inline bool is_partial_io(struct bio_vec *bvec) return bvec->bv_len != PAGE_SIZE; } +static void zram_revalidate_disk(struct zram *zram) +{ + revalidate_disk(zram->disk); + /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ + zram->disk->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; +} + /* * Check if request is within bounds and aligned on zram logical blocks. */ @@ -1095,7 +1104,7 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - revalidate_disk(zram->disk); + zram_revalidate_disk(zram); up_write(&zram->init_lock); return len; @@ -1143,7 +1152,7 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - revalidate_disk(zram->disk); + zram_revalidate_disk(zram); bdput(bdev); mutex_lock(&bdev->bd_mutex); From c4e490cf148e85ead0d1b1c2caaba833f1d5b29f Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Tue, 10 Jan 2017 16:58:24 -0800 Subject: [PATCH 214/297] mm/slab.c: fix SLAB freelist randomization duplicate entries This patch fixes a bug in the freelist randomization code. When a high random number is used, the freelist will contain duplicate entries. It will result in different allocations sharing the same chunk. It will result in odd behaviours and crashes. It should be uncommon but it depends on the machines. We saw it happening more often on some machines (every few hours of running tests). Fixes: c7ce4f60ac19 ("mm: SLAB freelist randomization") Link: http://lkml.kernel.org/r/20170103181908.143178-1-thgarnie@google.com Signed-off-by: John Sperbeck Signed-off-by: Thomas Garnier Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 29bc6c0dedd0..4f2ec6bb46eb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2457,7 +2457,6 @@ union freelist_init_state { unsigned int pos; unsigned int *list; unsigned int count; - unsigned int rand; }; struct rnd_state rnd_state; }; @@ -2483,8 +2482,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, } else { state->list = cachep->random_seq; state->count = count; - state->pos = 0; - state->rand = rand; + state->pos = rand % count; ret = true; } return ret; @@ -2493,7 +2491,9 @@ static bool freelist_state_initialize(union freelist_init_state *state, /* Get the next entry on the list and randomize it using a random shift */ static freelist_idx_t next_random_slot(union freelist_init_state *state) { - return (state->list[state->pos++] + state->rand) % state->count; + if (state->pos >= state->count) + state->pos = 0; + return state->list[state->pos++]; } /* Swap two freelist entries */ From e5bbc8a6c992901058bc09e2ce01d16c111ff047 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 10 Jan 2017 16:58:27 -0800 Subject: [PATCH 215/297] mm/hugetlb.c: fix reservation race when freeing surplus pages return_unused_surplus_pages() decrements the global reservation count, and frees any unused surplus pages that were backing the reservation. Commit 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages()") added a call to cond_resched_lock in the loop freeing the pages. As a result, the hugetlb_lock could be dropped, and someone else could use the pages that will be freed in subsequent iterations of the loop. This could result in inconsistent global hugetlb page state, application api failures (such as mmap) failures or application crashes. When dropping the lock in return_unused_surplus_pages, make sure that the global reservation count (resv_huge_pages) remains sufficiently large to prevent someone else from claiming pages about to be freed. Analyzed by Paul Cassella. Fixes: 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages()") Link: http://lkml.kernel.org/r/1483991767-6879-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Paul Cassella Suggested-by: Michal Hocko Cc: Masayoshi Mizuma Cc: Naoya Horiguchi Cc: Aneesh Kumar Cc: Hillf Danton Cc: [3.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3edb759c5c7d..c7025c132670 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1773,23 +1773,32 @@ free: } /* - * When releasing a hugetlb pool reservation, any surplus pages that were - * allocated to satisfy the reservation must be explicitly freed if they were - * never used. - * Called with hugetlb_lock held. + * This routine has two main purposes: + * 1) Decrement the reservation count (resv_huge_pages) by the value passed + * in unused_resv_pages. This corresponds to the prior adjustments made + * to the associated reservation map. + * 2) Free any unused surplus pages that may have been allocated to satisfy + * the reservation. As many as unused_resv_pages may be freed. + * + * Called with hugetlb_lock held. However, the lock could be dropped (and + * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, + * we must make sure nobody else can claim pages we are in the process of + * freeing. Do this by ensuring resv_huge_page always is greater than the + * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - /* Uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) - return; + goto out; + /* + * Part (or even all) of the reservation could have been backed + * by pre-allocated pages. Only free surplus pages. + */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* @@ -1799,12 +1808,22 @@ static void return_unused_surplus_pages(struct hstate *h, * when the nodes with surplus pages have no free pages. * free_pool_huge_page() will balance the the freed pages across the * on-line nodes with memory and will handle the hstate accounting. + * + * Note that we decrement resv_huge_pages as we free the pages. If + * we drop the lock, resv_huge_pages will still be sufficiently large + * to cover subsequent pages we may free. */ while (nr_pages--) { + h->resv_huge_pages--; + unused_resv_pages--; if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) - break; + goto out; cond_resched_lock(&hugetlb_lock); } + +out: + /* Fully uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; } From 575b1967e10a1f3038266244d2c7a3ca6b99fed8 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 10 Jan 2017 16:58:30 -0800 Subject: [PATCH 216/297] timerfd: export defines to userspace Since userspace is expected to call timerfd syscalls directly with these flags/ioctls, make sure we export them so they don't have to duplicate the values themselves. Link: http://lkml.kernel.org/r/20161219064052.7196-1-vapier@gentoo.org Signed-off-by: Mike Frysinger Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timerfd.h | 20 +------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/timerfd.h | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 19 deletions(-) create mode 100644 include/uapi/linux/timerfd.h diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index bd36ce431e32..bab0b1ad0613 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -8,23 +8,7 @@ #ifndef _LINUX_TIMERFD_H #define _LINUX_TIMERFD_H -/* For O_CLOEXEC and O_NONBLOCK */ -#include - -/* For _IO helpers */ -#include - -/* - * CAREFUL: Check include/asm-generic/fcntl.h when defining - * new flags, since they might collide with O_* ones. We want - * to re-use O_* flags that couldn't possibly have a meaning - * from eventfd, in order to leave a free define-space for - * shared O_* flags. - */ -#define TFD_TIMER_ABSTIME (1 << 0) -#define TFD_TIMER_CANCEL_ON_SET (1 << 1) -#define TFD_CLOEXEC O_CLOEXEC -#define TFD_NONBLOCK O_NONBLOCK +#include #define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) /* Flags for timerfd_create. */ @@ -32,6 +16,4 @@ /* Flags for timerfd_settime. */ #define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) -#define TFD_IOC_SET_TICKS _IOW('T', 0, u64) - #endif /* _LINUX_TIMERFD_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index a8b93e685239..f330ba4547cf 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -414,6 +414,7 @@ header-y += telephony.h header-y += termios.h header-y += thermal.h header-y += time.h +header-y += timerfd.h header-y += times.h header-y += timex.h header-y += tiocl.h diff --git a/include/uapi/linux/timerfd.h b/include/uapi/linux/timerfd.h new file mode 100644 index 000000000000..6fcfaa8da173 --- /dev/null +++ b/include/uapi/linux/timerfd.h @@ -0,0 +1,36 @@ +/* + * include/linux/timerfd.h + * + * Copyright (C) 2007 Davide Libenzi + * + */ + +#ifndef _UAPI_LINUX_TIMERFD_H +#define _UAPI_LINUX_TIMERFD_H + +#include + +/* For O_CLOEXEC and O_NONBLOCK */ +#include + +/* For _IO helpers */ +#include + +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. + * + * Also make sure to update the masks in include/linux/timerfd.h + * when adding new flags. + */ +#define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_CANCEL_ON_SET (1 << 1) +#define TFD_CLOEXEC O_CLOEXEC +#define TFD_NONBLOCK O_NONBLOCK + +#define TFD_IOC_SET_TICKS _IOW('T', 0, __u64) + +#endif /* _UAPI_LINUX_TIMERFD_H */ From cd3776638003b3362d9d7d1f27bcb80c276e2c28 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:30 +0200 Subject: [PATCH 217/297] net/mlx5e: Properly handle offloading of source udp port for IP tunnels We can offload the matching on source udp port of ip tunnels for decapsulation. We can not offload setting source udp port for tunnels as part of encapsulation. Fix both the code that deals with matching offload (decap) and the code that deal with encap offload to align with that. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Fixes: bbd00f7e2349 ('net/mlx5e: Add TC tunnel release action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index f8829b517156..b60feceab63b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -239,10 +239,6 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) return -EOPNOTSUPP; - /* udp src port isn't supported */ - if (memchr_inv(&mask->src, 0, sizeof(mask->src))) - return -EOPNOTSUPP; - if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) parse_vxlan_attr(spec, f); @@ -254,6 +250,10 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, ntohs(key->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_sport, ntohs(mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_sport, ntohs(key->src)); } else { /* udp dst port must be given */ return -EOPNOTSUPP; } @@ -796,6 +796,10 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, if (!memchr_inv(&key->tp_dst, 0, sizeof(key->tp_dst))) return -EOPNOTSUPP; + /* setting udp src port isn't supported */ + if (memchr_inv(&key->tp_src, 0, sizeof(key->tp_src))) + return -EOPNOTSUPP; + if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { info.tp_dst = key->tp_dst; From 2fcd82e9be133e4ec777f66fd67a8fb8e7748b1b Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:31 +0200 Subject: [PATCH 218/297] net/mlx5e: Warn when rejecting offload attempts of IP tunnels We silently reject offloading of IPv6 tunnels, non vxlan tunnels, vxlan tunnels where the dst port to match is not provided, etc. Be a bit more verbose and print a warning so the user better realizes what went wrong here and can fix it. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Fixes: bbd00f7e2349 ('net/mlx5e: Add TC tunnel release action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index b60feceab63b..d2fc055f054a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -237,13 +237,16 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, /* Full udp dst port must be given */ if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) - return -EOPNOTSUPP; + goto vxlan_match_offload_err; if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) parse_vxlan_attr(spec, f); - else + else { + netdev_warn(priv->netdev, + "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->dst)); return -EOPNOTSUPP; + } MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport, ntohs(mask->dst)); @@ -255,7 +258,10 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, ntohs(key->src)); } else { /* udp dst port must be given */ - return -EOPNOTSUPP; +vxlan_match_offload_err: + netdev_warn(priv->netdev, + "IP tunnel decap offload supported only for vxlan, must set UDP dport\n"); + return -EOPNOTSUPP; } if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { @@ -346,6 +352,9 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, if (parse_tunnel_attr(priv, spec, f)) return -EOPNOTSUPP; break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + netdev_warn(priv->netdev, + "IPv6 tunnel decap offload isn't supported\n"); default: return -EOPNOTSUPP; } @@ -792,13 +801,17 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, int tunnel_type; int err; - /* udp dst port must be given */ + /* udp dst port must be set */ if (!memchr_inv(&key->tp_dst, 0, sizeof(key->tp_dst))) - return -EOPNOTSUPP; + goto vxlan_encap_offload_err; /* setting udp src port isn't supported */ - if (memchr_inv(&key->tp_src, 0, sizeof(key->tp_src))) + if (memchr_inv(&key->tp_src, 0, sizeof(key->tp_src))) { +vxlan_encap_offload_err: + netdev_warn(priv->netdev, + "must set udp dst port and not set udp src port\n"); return -EOPNOTSUPP; + } if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { @@ -806,6 +819,8 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, info.tun_id = tunnel_id_to_key32(key->tun_id); tunnel_type = MLX5_HEADER_TYPE_VXLAN; } else { + netdev_warn(priv->netdev, + "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->tp_dst)); return -EOPNOTSUPP; } @@ -813,6 +828,9 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, case AF_INET: info.daddr = key->u.ipv4.dst; break; + case AF_INET6: + netdev_warn(priv->netdev, + "IPv6 tunnel encap offload isn't supported\n"); default: return -EOPNOTSUPP; } From a42485eb0ee458da3a0df82b0e42ab58ce76be05 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:32 +0200 Subject: [PATCH 219/297] net/mlx5e: TC ipv4 tunnel encap offload error flow fixes When the route lookup fails we should return the actual error. When the neigh isn't valid, we should return -EOPNOTSUPP as done in similar cases along the code. When the offload can't take place as of invalid neigh etc, we must release the neigh. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index d2fc055f054a..b62f06f3f7e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -656,17 +656,14 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, #if IS_ENABLED(CONFIG_INET) rt = ip_route_output_key(dev_net(mirred_dev), fl4); - if (IS_ERR(rt)) { - pr_warn("%s: no route to %pI4\n", __func__, &fl4->daddr); - return -EOPNOTSUPP; - } + if (IS_ERR(rt)) + return PTR_ERR(rt); #else return -EOPNOTSUPP; #endif if (!switchdev_port_same_parent_id(priv->netdev, rt->dst.dev)) { - pr_warn("%s: Can't offload the flow, netdevices aren't on the same HW e-switch\n", - __func__); + pr_warn("%s: can't offload, devices not on same HW e-switch\n", __func__); ip_rt_put(rt); return -EOPNOTSUPP; } @@ -727,8 +724,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, struct net_device **out_dev) { int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + struct neighbour *n = NULL; struct flowi4 fl4 = {}; - struct neighbour *n; char *encap_header; int encap_size; __be32 saddr; @@ -759,7 +756,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, e->out_dev = *out_dev; if (!(n->nud_state & NUD_VALID)) { - err = -ENOTSUPP; + pr_warn("%s: can't offload, neighbour to %pI4 invalid\n", __func__, &fl4.daddr); + err = -EOPNOTSUPP; goto out; } @@ -781,6 +779,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, err = mlx5_encap_alloc(priv->mdev, e->tunnel_type, encap_size, encap_header, &e->encap_id); out: + if (err && n) + neigh_release(n); kfree(encap_header); return err; } From 2e72eb438ce5ea9fa118edfd9a5f628c2a69111a Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:33 +0200 Subject: [PATCH 220/297] net/mlx5e: Properly get address type of encapsulation IP headers As done elsewhere in our TC/flower offload code, the address type of the encapsulation IP headers should be realized accroding to the addr_type field of the encapsulation control dissector key, do that. Fixes: bbd00f7e2349 ('net/mlx5e: Add TC tunnel release action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index b62f06f3f7e0..9cfddd9fc097 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -225,6 +225,11 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + struct flow_dissector_key_control *enc_control = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + f->key); + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { struct flow_dissector_key_ports *key = skb_flow_dissector_target(f->dissector, @@ -264,7 +269,7 @@ vxlan_match_offload_err: return -EOPNOTSUPP; } - if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + if (enc_control->addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { struct flow_dissector_key_ipv4_addrs *key = skb_flow_dissector_target(f->dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, @@ -286,10 +291,10 @@ vxlan_match_offload_err: MLX5_SET(fte_match_set_lyr_2_4, headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4, ntohl(key->dst)); - } - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP); + } /* Enforce DMAC when offloading incoming tunneled flows. * Flow counters require a match on the DMAC. From 0827444d052ba5347900376dbdbf5d9065d091d4 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:34 +0200 Subject: [PATCH 221/297] net/mlx5e: Set inline mode requirements for matching on IP fragments For e-switch level matching on packets being an IP fragment, we need to make sure the source vport inline mode is L3, fix that. Fixes: 3f7d0eb42d59 ('net/mlx5e: Offload TC matching on packets being IP fragments') Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9cfddd9fc097..a35fa1eb0694 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -389,6 +389,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1); MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, key->flags & FLOW_DIS_IS_FRAGMENT); + + /* the HW doesn't need L3 inline to match on frag=no */ + if (key->flags & FLOW_DIS_IS_FRAGMENT) + *min_inline = MLX5_INLINE_MODE_IP; } } From a757d108dc1a053722215ee89116f8af9bba1525 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Tue, 10 Jan 2017 22:33:35 +0200 Subject: [PATCH 222/297] net/mlx5e: Fix kbuild warnings for uninitialized parameters kbuild warn about parameters that may be used uninitialized, fix it. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a35fa1eb0694..5dbc81de34ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -737,8 +737,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, struct flowi4 fl4 = {}; char *encap_header; int encap_size; - __be32 saddr; - int ttl; + __be32 saddr = 0; + int ttl = 0; int err; encap_header = kzalloc(max_encap_size, GFP_KERNEL); From 5e86397abe10aa4c884478a45e9a35b6a37d8d5d Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:36 +0200 Subject: [PATCH 223/297] net/mlx5e: Properly handle FW errors while adding TC rules When the firmware returns an error (common example is an attempt to add twice the same rule which is refused by the some FWs), we are not properly derefing/cleaning few resources allocated on the way. Examples are vport vlan deref under eswitch vlan offloads, and encap entry/neighbour deref under eswitch encapsulation offloads, fix that. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Fixes: 8b32580df1cb ('net/mlx5e: Add TC vlan action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 5dbc81de34ee..118cea5e5489 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -161,15 +161,21 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv, } } +/* we get here also when setting rule to the FW failed, etc. It means that the + * flow rule itself might not exist, but some offloading related to the actions + * should be cleaned. + */ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_fc *counter = NULL; - counter = mlx5_flow_rule_counter(flow->rule); - - mlx5_del_flow_rules(flow->rule); + if (!IS_ERR(flow->rule)) { + counter = mlx5_flow_rule_counter(flow->rule); + mlx5_del_flow_rules(flow->rule); + mlx5_fc_destroy(priv->mdev, counter); + } if (esw && esw->mode == SRIOV_OFFLOADS) { mlx5_eswitch_del_vlan_action(esw, flow->attr); @@ -177,8 +183,6 @@ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, mlx5e_detach_encap(priv, flow); } - mlx5_fc_destroy(priv->mdev, counter); - if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { mlx5_destroy_flow_table(priv->fs.tc.t); priv->fs.tc.t = NULL; @@ -1017,7 +1021,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol, if (IS_ERR(flow->rule)) { err = PTR_ERR(flow->rule); - goto err_free; + goto err_del_rule; } err = rhashtable_insert_fast(&tc->ht, &flow->node, @@ -1028,7 +1032,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol, goto out; err_del_rule: - mlx5_del_flow_rules(flow->rule); + mlx5e_tc_del_flow(priv, flow); err_free: kfree(flow); From 3deef8cea3efcaeeae240bb00541de66abb9bfa0 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 10 Jan 2017 22:33:37 +0200 Subject: [PATCH 224/297] net/mlx5e: Un-register uplink representor on nic_disable The code before this patch registered uplink e-Switch representor on nic_enable and unregistered on nic_cleanup, the right place for this unregister is in nic_disable. Fixes: 127ea380acc9 ("net/mlx5: Add Representors registration API") Signed-off-by: Saeed Mahameed Reviewed-by: Mohamad Haj Yahia Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1236b27b1493..2b7dd315020c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3675,14 +3675,8 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev, static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) { - struct mlx5_core_dev *mdev = priv->mdev; - struct mlx5_eswitch *esw = mdev->priv.eswitch; - mlx5e_vxlan_cleanup(priv); - if (MLX5_CAP_GEN(mdev, vport_group_manager)) - mlx5_eswitch_unregister_vport_rep(esw, 0); - if (priv->xdp_prog) bpf_prog_put(priv->xdp_prog); } @@ -3807,9 +3801,14 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) static void mlx5e_nic_disable(struct mlx5e_priv *priv) { + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + queue_work(priv->wq, &priv->set_rx_mode_work); + if (MLX5_CAP_GEN(mdev, vport_group_manager)) + mlx5_eswitch_unregister_vport_rep(esw, 0); mlx5e_disable_async_events(priv); - mlx5_lag_remove(priv->mdev); + mlx5_lag_remove(mdev); } static const struct mlx5e_profile mlx5e_nic_profile = { From 0bbcc0a8fc394d01988fe0263ccf7fddb77a12c3 Mon Sep 17 00:00:00 2001 From: Gil Rockah Date: Tue, 10 Jan 2017 22:33:38 +0200 Subject: [PATCH 225/297] net/mlx5e: Remove WARN_ONCE from adaptive moderation code When trying to do interface down or changing interface configuration under heavy traffic, some of the adaptive moderation corner cases can occur and leave a WARN_ONCE call trace in the kernel log. Those WARN_ONCE are meant for debug only, and should have been inserted only under debug. We avoid such call traces by removing those WARN_ONCE. Fixes: cb3c7fd4f839 ("net/mlx5e: Support adaptive RX coalescing") Signed-off-by: Gil Rockah Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c index 1fffe48a93cc..cbfac06b7ffd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c @@ -109,7 +109,6 @@ static bool mlx5e_am_on_top(struct mlx5e_rx_am *am) switch (am->tune_state) { case MLX5E_AM_PARKING_ON_TOP: case MLX5E_AM_PARKING_TIRED: - WARN_ONCE(true, "mlx5e_am_on_top: PARKING\n"); return true; case MLX5E_AM_GOING_RIGHT: return (am->steps_left > 1) && (am->steps_right == 1); @@ -123,7 +122,6 @@ static void mlx5e_am_turn(struct mlx5e_rx_am *am) switch (am->tune_state) { case MLX5E_AM_PARKING_ON_TOP: case MLX5E_AM_PARKING_TIRED: - WARN_ONCE(true, "mlx5e_am_turn: PARKING\n"); break; case MLX5E_AM_GOING_RIGHT: am->tune_state = MLX5E_AM_GOING_LEFT; @@ -144,7 +142,6 @@ static int mlx5e_am_step(struct mlx5e_rx_am *am) switch (am->tune_state) { case MLX5E_AM_PARKING_ON_TOP: case MLX5E_AM_PARKING_TIRED: - WARN_ONCE(true, "mlx5e_am_step: PARKING\n"); break; case MLX5E_AM_GOING_RIGHT: if (am->profile_ix == (MLX5E_PARAMS_AM_NUM_PROFILES - 1)) @@ -282,10 +279,8 @@ static void mlx5e_am_calc_stats(struct mlx5e_rx_am_sample *start, u32 delta_us = ktime_us_delta(end->time, start->time); unsigned int npkts = end->pkt_ctr - start->pkt_ctr; - if (!delta_us) { - WARN_ONCE(true, "mlx5e_am_calc_stats: delta_us=0\n"); + if (!delta_us) return; - } curr_stats->ppms = (npkts * USEC_PER_MSEC) / delta_us; curr_stats->epms = (MLX5E_AM_NEVENTS * USEC_PER_MSEC) / delta_us; From 5e44fca5047054f1762813751626b5245e0da022 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 10 Jan 2017 22:33:39 +0200 Subject: [PATCH 226/297] net/mlx5: Only cancel recovery work when cleaning up device Do not attempt to drain the health workqueue when unloading the device in the recovery flow, this can cause a deadlock when the recovery work tries to cancel itself with sync. Because the work is no longer unconditionally canceled when unloading, it must be explicitly canceled in the AER flow. fixes: 689a248df83b ("net/mlx5: Cancel recovery work in remove flow") Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 6547f22e6b9b..d01e9f21d469 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1195,7 +1195,8 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, { int err = 0; - mlx5_drain_health_wq(dev); + if (cleanup) + mlx5_drain_health_wq(dev); mutex_lock(&dev->intf_state_mutex); if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) { @@ -1359,9 +1360,10 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv, false); - /* In case of kernel call save the pci state */ + /* In case of kernel call save the pci state and drain the health wq */ if (state) { pci_save_state(pdev); + mlx5_drain_health_wq(dev); mlx5_pci_disable_device(dev); } From 7ee7f45a763bd68c3a606595a8c1bb08c3e6146b Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Wed, 11 Jan 2017 01:27:21 +0200 Subject: [PATCH 227/297] mei: bus: enable OS version only for SPT and newer Sending OS version for support of TPM2_ChangeEPS() is required only for SPT FW (HMB version 2.0) and newer. On older platforms the command should be just ignored by the firmware but some older platforms misbehave so it's safer to send the command only if required. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=192051 Fixes: 7279b238bade (mei: send OS type to the FW) Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Tested-by: Jan Niehusmann Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus-fixup.c | 3 +++ drivers/misc/mei/debugfs.c | 2 ++ drivers/misc/mei/hbm.c | 4 ++++ drivers/misc/mei/hw.h | 6 ++++++ drivers/misc/mei/mei_dev.h | 2 ++ 5 files changed, 17 insertions(+) diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c index 18e05ca7584f..3600c9993a98 100644 --- a/drivers/misc/mei/bus-fixup.c +++ b/drivers/misc/mei/bus-fixup.c @@ -152,6 +152,9 @@ static void mei_mkhi_fix(struct mei_cl_device *cldev) { int ret; + if (!cldev->bus->hbm_f_os_supported) + return; + ret = mei_cldev_enable(cldev); if (ret) return; diff --git a/drivers/misc/mei/debugfs.c b/drivers/misc/mei/debugfs.c index c6c051b52f55..c6217a4993ad 100644 --- a/drivers/misc/mei/debugfs.c +++ b/drivers/misc/mei/debugfs.c @@ -180,6 +180,8 @@ static ssize_t mei_dbgfs_read_devstate(struct file *fp, char __user *ubuf, dev->hbm_f_ev_supported); pos += scnprintf(buf + pos, bufsz - pos, "\tFA: %01d\n", dev->hbm_f_fa_supported); + pos += scnprintf(buf + pos, bufsz - pos, "\tOS: %01d\n", + dev->hbm_f_os_supported); } pos += scnprintf(buf + pos, bufsz - pos, "pg: %s, %s\n", diff --git a/drivers/misc/mei/hbm.c b/drivers/misc/mei/hbm.c index dd7f15a65eed..25b4a1ba522d 100644 --- a/drivers/misc/mei/hbm.c +++ b/drivers/misc/mei/hbm.c @@ -989,6 +989,10 @@ static void mei_hbm_config_features(struct mei_device *dev) /* Fixed Address Client Support */ if (dev->version.major_version >= HBM_MAJOR_VERSION_FA) dev->hbm_f_fa_supported = 1; + + /* OS ver message Support */ + if (dev->version.major_version >= HBM_MAJOR_VERSION_OS) + dev->hbm_f_os_supported = 1; } /** diff --git a/drivers/misc/mei/hw.h b/drivers/misc/mei/hw.h index 9daf3f9aed25..e1e4d47d4d7d 100644 --- a/drivers/misc/mei/hw.h +++ b/drivers/misc/mei/hw.h @@ -76,6 +76,12 @@ #define HBM_MINOR_VERSION_FA 0 #define HBM_MAJOR_VERSION_FA 2 +/* + * MEI version with OS ver message support + */ +#define HBM_MINOR_VERSION_OS 0 +#define HBM_MAJOR_VERSION_OS 2 + /* Host bus message command opcode */ #define MEI_HBM_CMD_OP_MSK 0x7f /* Host bus message command RESPONSE */ diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h index 699693cd8c59..8dadb98662a9 100644 --- a/drivers/misc/mei/mei_dev.h +++ b/drivers/misc/mei/mei_dev.h @@ -406,6 +406,7 @@ const char *mei_pg_state_str(enum mei_pg_state state); * @hbm_f_ev_supported : hbm feature event notification * @hbm_f_fa_supported : hbm feature fixed address client * @hbm_f_ie_supported : hbm feature immediate reply to enum request + * @hbm_f_os_supported : hbm feature support OS ver message * * @me_clients_rwsem: rw lock over me_clients list * @me_clients : list of FW clients @@ -487,6 +488,7 @@ struct mei_device { unsigned int hbm_f_ev_supported:1; unsigned int hbm_f_fa_supported:1; unsigned int hbm_f_ie_supported:1; + unsigned int hbm_f_os_supported:1; struct rw_semaphore me_clients_rwsem; struct list_head me_clients; From 488debb9971bc7d0edd6d8080ba78ca02a04f6c4 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 5 Jan 2017 17:15:01 +0000 Subject: [PATCH 228/297] drivers: char: mem: Fix thinkos in kmem address checks When borrowing the pfn_valid() check from mmap_kmem(), somebody managed to get physical and virtual addresses spectacularly muddled up, such that we've ended up with checks for one being the other. Whilst this does indeed prevent out-of-bounds accesses crashing, on most systems it also prevents the more desirable use-case of working at all ever. Check the *virtual* offset correctly for what it is. Furthermore, do so in the right place - a read or write may span multiple pages, so a single up-front check is insufficient. High memory accesses already have a similar validity check just before the copy_to_user() call, so just make the low memory path fully consistent with that. Reported-by: Jason A. Donenfeld CC: stable@vger.kernel.org Fixes: 148a1bc84398 ("drivers: char: mem: Check {read,write}_kmem() addresses") Signed-off-by: Robin Murphy Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 5bb1985ec484..6d9cc2d39d22 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -381,9 +381,6 @@ static ssize_t read_kmem(struct file *file, char __user *buf, char *kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ int err = 0; - if (!pfn_valid(PFN_DOWN(p))) - return -EIO; - read = 0; if (p < (unsigned long) high_memory) { low_count = count; @@ -412,6 +409,8 @@ static ssize_t read_kmem(struct file *file, char __user *buf, * by the kernel or data corruption may occur */ kbuf = xlate_dev_kmem_ptr((void *)p); + if (!virt_addr_valid(kbuf)) + return -ENXIO; if (copy_to_user(buf, kbuf, sz)) return -EFAULT; @@ -482,6 +481,8 @@ static ssize_t do_write_kmem(unsigned long p, const char __user *buf, * corruption may occur. */ ptr = xlate_dev_kmem_ptr((void *)p); + if (!virt_addr_valid(ptr)) + return -ENXIO; copied = copy_from_user(ptr, buf, sz); if (copied) { @@ -512,9 +513,6 @@ static ssize_t write_kmem(struct file *file, const char __user *buf, char *kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ int err = 0; - if (!pfn_valid(PFN_DOWN(p))) - return -EIO; - if (p < (unsigned long) high_memory) { unsigned long to_write = min_t(unsigned long, count, (unsigned long)high_memory - p); From 89d8232411a85b9a6b12fd5da4d07d8a138a8e0c Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Tue, 13 Dec 2016 17:27:56 +0100 Subject: [PATCH 229/297] tty/serial: atmel_serial: BUG: stop DMA from transmitting in stop_tx If we don't disable the transmitter in atmel_stop_tx, the DMA buffer continues to send data until it is emptied. This cause problems with the flow control (CTS is asserted and data are still sent). So, disabling the transmitter in atmel_stop_tx is a sane thing to do. Tested on at91sam9g35-cm(DMA) Tested for regressions on sama5d2-xplained(Fifo) and at91sam9g20ek(PDC) Cc: (beware, this won't apply before 4.3) Signed-off-by: Richard Genoud Acked-by: Nicolas Ferre Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/atmel_serial.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 168b10cad47b..f9d42de5ab2d 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -481,6 +481,14 @@ static void atmel_stop_tx(struct uart_port *port) /* disable PDC transmit */ atmel_uart_writel(port, ATMEL_PDC_PTCR, ATMEL_PDC_TXTDIS); } + + /* + * Disable the transmitter. + * This is mandatory when DMA is used, otherwise the DMA buffer + * is fully transmitted. + */ + atmel_uart_writel(port, ATMEL_US_CR, ATMEL_US_TXDIS); + /* Disable interrupts */ atmel_uart_writel(port, ATMEL_US_IDR, atmel_port->tx_done_mask); @@ -513,6 +521,9 @@ static void atmel_start_tx(struct uart_port *port) /* Enable interrupts */ atmel_uart_writel(port, ATMEL_US_IER, atmel_port->tx_done_mask); + + /* re-enable the transmitter */ + atmel_uart_writel(port, ATMEL_US_CR, ATMEL_US_TXEN); } /* From b389f173aaa1204d6dc1f299082a162eb0491545 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Tue, 6 Dec 2016 13:05:33 +0100 Subject: [PATCH 230/297] tty/serial: atmel: RS485 half duplex w/DMA: enable RX after TX is done When using RS485 in half duplex, RX should be enabled when TX is finished, and stopped when TX starts. Before commit 0058f0871efe7b01c6 ("tty/serial: atmel: fix RS485 half duplex with DMA"), RX was not disabled in atmel_start_tx() if the DMA was used. So, collisions could happened. But disabling RX in atmel_start_tx() uncovered another bug: RX was enabled again in the wrong place (in atmel_tx_dma) instead of being enabled when TX is finished (in atmel_complete_tx_dma), so the transmission simply stopped. This bug was not triggered before commit 0058f0871efe7b01c6 ("tty/serial: atmel: fix RS485 half duplex with DMA") because RX was never disabled before. Moving atmel_start_rx() in atmel_complete_tx_dma() corrects the problem. Cc: stable@vger.kernel.org Reported-by: Gil Weber Fixes: 0058f0871efe7b01c6 Tested-by: Gil Weber Signed-off-by: Richard Genoud Acked-by: Alexandre Belloni Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/atmel_serial.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index f9d42de5ab2d..fabbe76203bb 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -809,6 +809,11 @@ static void atmel_complete_tx_dma(void *arg) */ if (!uart_circ_empty(xmit)) atmel_tasklet_schedule(atmel_port, &atmel_port->tasklet_tx); + else if ((port->rs485.flags & SER_RS485_ENABLED) && + !(port->rs485.flags & SER_RS485_RX_DURING_TX)) { + /* DMA done, stop TX, start RX for RS485 */ + atmel_start_rx(port); + } spin_unlock_irqrestore(&port->lock, flags); } @@ -911,12 +916,6 @@ static void atmel_tx_dma(struct uart_port *port) desc->callback = atmel_complete_tx_dma; desc->callback_param = atmel_port; atmel_port->cookie_tx = dmaengine_submit(desc); - - } else { - if (port->rs485.flags & SER_RS485_ENABLED) { - /* DMA done, stop TX, start RX for RS485 */ - atmel_start_rx(port); - } } if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) From c130b666a9a711f985a0a44b58699ebe14bb7245 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 28 Dec 2016 16:42:00 -0200 Subject: [PATCH 231/297] 8250_pci: Fix potential use-after-free in error path Commit f209fa03fc9d ("serial: 8250_pci: Detach low-level driver during PCI error recovery") introduces a potential use-after-free in case the pciserial_init_ports call in serial8250_io_resume fails, which may happen if a memory allocation fails or if the .init quirk failed for whatever reason). If this happen, further pci_get_drvdata will return a pointer to freed memory. This patch reworks the PCI recovery resume hook to restore the old priv structure in this case, which should be ok, since the ports were already detached. Such error during recovery causes us to give up on the recovery. Fixes: f209fa03fc9d ("serial: 8250_pci: Detach low-level driver during PCI error recovery") Reported-by: Michal Suchanek Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Guilherme G. Piccoli Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pci.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index aa0166b6d450..116436b7fa52 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -5642,17 +5642,15 @@ static pci_ers_result_t serial8250_io_slot_reset(struct pci_dev *dev) static void serial8250_io_resume(struct pci_dev *dev) { struct serial_private *priv = pci_get_drvdata(dev); - const struct pciserial_board *board; + struct serial_private *new; if (!priv) return; - board = priv->board; - kfree(priv); - priv = pciserial_init_ports(dev, board); - - if (!IS_ERR(priv)) { - pci_set_drvdata(dev, priv); + new = pciserial_init_ports(dev, priv->board); + if (!IS_ERR(new)) { + pci_set_drvdata(dev, new); + kfree(priv); } } From 2bed8a8e70729f996af92042d3ad0f11870acc1f Mon Sep 17 00:00:00 2001 From: Daniel Jedrychowski Date: Mon, 12 Dec 2016 09:18:28 +1100 Subject: [PATCH 232/297] Clearing FIFOs in RS485 emulation mode causes subsequent transmits to break When in RS485 emulation mode, __do_stop_tx_rs485() calls serial8250_clear_fifos(). This not only clears the FIFOs, but also sets all bits in their control register (UART_FCR) to 0. One of the effects of this is the disabling of the FIFOs, which turns them into single-byte holding registers. The rest of the driver doesn't know this, which results in the lions share of characters passed into a write call to be dropped. (I can supply logic analyzer screenshots if necessary) This fix replaces the serial8250_clear_fifos() call to serial8250_clear_and_reinit_fifos() - this prevents the "dropped characters" issue from manifesting again while retaining the requirement of clearing the RX FIFO after transmission if the SER_RS485_RX_DURING_TX flag is disabled. Signed-off-by: Daniel Jedrychowski Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index fe4399b41df6..c13fec451d03 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1413,7 +1413,7 @@ static void __do_stop_tx_rs485(struct uart_8250_port *p) * Enable previously disabled RX interrupts. */ if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { - serial8250_clear_fifos(p); + serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; serial_port_out(&p->port, UART_IER, p->ier); From 6741f551a0b26479de2532ffa43a366747e6dbf3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 11 Dec 2016 10:05:49 +0800 Subject: [PATCH 233/297] Revert "tty: serial: 8250: add CON_CONSDEV to flags" This commit needs to be reverted because it prevents people from using the serial console as a secondary console with input being directed to tty0. IOW, if you boot with console=ttyS0 console=tty0 then all kernels prior to this commit will produce output on both ttyS0 and tty0 but input will only be taken from tty0. With this patch the serial console will always be the primary console instead of tty0, potentially preventing people from getting into their machines in emergency situations. Fixes: d03516df8375 ("tty: serial: 8250: add CON_CONSDEV to flags") Signed-off-by: Herbert Xu Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 61569a765d9e..76e03a7de9cc 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -675,7 +675,7 @@ static struct console univ8250_console = { .device = uart_console_device, .setup = univ8250_console_setup, .match = univ8250_console_match, - .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_CONSDEV, + .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &serial8250_reg, }; From 5b11ebedd6a8bb4271b796e498cd15c0fe1133b6 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sat, 3 Dec 2016 16:56:49 +0800 Subject: [PATCH 234/297] extcon: return error code on failure Function get_zeroed_page() returns a NULL pointer if there is no enough memory. In function extcon_sync(), it returns 0 if the call to get_zeroed_page() fails. The return value 0 indicates success in the context, which is incosistent with the execution status. This patch fixes the bug by returning -ENOMEM. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188611 Signed-off-by: Pan Bian Fixes: a580982f0836e Cc: stable Acked-by: Chanwoo Choi Signed-off-by: Greg Kroah-Hartman --- drivers/extcon/extcon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c index 78298460d168..7c1e3a7b14e0 100644 --- a/drivers/extcon/extcon.c +++ b/drivers/extcon/extcon.c @@ -453,7 +453,7 @@ int extcon_sync(struct extcon_dev *edev, unsigned int id) dev_err(&edev->dev, "out of memory in extcon_set_state\n"); kobject_uevent(&edev->dev.kobj, KOBJ_CHANGE); - return 0; + return -ENOMEM; } length = name_show(&edev->dev, NULL, prop_buf); From 0fa2c8eb270413160557babda519aa3c21e2bfaf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 2 Dec 2016 16:23:55 +0000 Subject: [PATCH 235/297] ppdev: don't print a free'd string A previous fix of a memory leak now prints the string 'name' that was previously free'd. Fix this by free'ing the string at the end of the function and adding an error exit path for the error conditions. CoverityScan CID#1384523 ("Use after free") Fixes: 2bd362d5f45c1 ("ppdev: fix memory leak") Signed-off-by: Colin Ian King Acked-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- drivers/char/ppdev.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c index 02819e0703c8..87885d146dbb 100644 --- a/drivers/char/ppdev.c +++ b/drivers/char/ppdev.c @@ -290,6 +290,7 @@ static int register_device(int minor, struct pp_struct *pp) struct pardevice *pdev = NULL; char *name; struct pardev_cb ppdev_cb; + int rc = 0; name = kasprintf(GFP_KERNEL, CHRDEV "%x", minor); if (name == NULL) @@ -298,8 +299,8 @@ static int register_device(int minor, struct pp_struct *pp) port = parport_find_number(minor); if (!port) { pr_warn("%s: no associated port!\n", name); - kfree(name); - return -ENXIO; + rc = -ENXIO; + goto err; } memset(&ppdev_cb, 0, sizeof(ppdev_cb)); @@ -308,16 +309,18 @@ static int register_device(int minor, struct pp_struct *pp) ppdev_cb.private = pp; pdev = parport_register_dev_model(port, name, &ppdev_cb, minor); parport_put_port(port); - kfree(name); if (!pdev) { pr_warn("%s: failed to register device!\n", name); - return -ENXIO; + rc = -ENXIO; + goto err; } pp->pdev = pdev; dev_dbg(&pdev->dev, "registered pardevice\n"); - return 0; +err: + kfree(name); + return rc; } static enum ieee1284_phase init_phase(int mode) From 802c03881f29844af0252b6e22be5d2f65f93fd0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 6 Jan 2017 02:14:16 +0900 Subject: [PATCH 236/297] sysrq: attach sysrq handler correctly for 32-bit kernel The sysrq input handler should be attached to the input device which has a left alt key. On 32-bit kernels, some input devices which has a left alt key cannot attach sysrq handler. Because the keybit bitmap in struct input_device_id for sysrq is not correctly initialized. KEY_LEFTALT is 56 which is greater than BITS_PER_LONG on 32-bit kernels. I found this problem when using a matrix keypad device which defines a KEY_LEFTALT (56) but doesn't have a KEY_O (24 == 56%32). Cc: Jiri Slaby Signed-off-by: Akinobu Mita Acked-by: Dmitry Torokhov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/sysrq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 52bbd27e93ae..701c085bb19b 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -946,8 +946,8 @@ static const struct input_device_id sysrq_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { BIT_MASK(KEY_LEFTALT) }, + .evbit = { [BIT_WORD(EV_KEY)] = BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_LEFTALT)] = BIT_MASK(KEY_LEFTALT) }, }, { }, }; From 546cf3ef9c92b76ff0037c871b939e63caea98b3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 26 Dec 2016 09:58:34 -0800 Subject: [PATCH 237/297] auxdisplay: fix new ht16k33 build errors Fix build errors caused by selecting incorrect kconfig symbols. drivers/built-in.o:(.data+0x19cec): undefined reference to `sys_fillrect' drivers/built-in.o:(.data+0x19cf0): undefined reference to `sys_copyarea' drivers/built-in.o:(.data+0x19cf4): undefined reference to `sys_imageblit' Fixes: 31114fa95bdb (auxdisplay: ht16k33: select framebuffer helper modules) Signed-off-by: Randy Dunlap Cc: Miguel Ojeda Sandonis Reported-by: kbuild test robot Acked-by: Robin van der Gracht Signed-off-by: Greg Kroah-Hartman --- drivers/auxdisplay/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 4ef4c5caed4f..8a8e403644d6 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -132,9 +132,9 @@ config HT16K33 tristate "Holtek Ht16K33 LED controller with keyscan" depends on FB && OF && I2C && INPUT select FB_SYS_FOPS - select FB_CFB_FILLRECT - select FB_CFB_COPYAREA - select FB_CFB_IMAGEBLIT + select FB_SYS_FILLRECT + select FB_SYS_COPYAREA + select FB_SYS_IMAGEBLIT select INPUT_MATRIXKMAP select FB_BACKLIGHT help From 24b91e360ef521a2808771633d76ebc68bd5604b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Jan 2017 15:12:04 +0100 Subject: [PATCH 238/297] nohz: Fix collision between tick and other hrtimers When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-by: James Hartsock Signed-off-by: Frederic Weisbecker Reviewed-by: Wanpeng Li Acked-by: Peter Zijlstra Acked-by: Rik van Riel Link: http://lkml.kernel.org/r/1483539124-5693-1-git-send-email-fweisbec@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 9 +++++++-- kernel/time/tick-sched.h | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2c115fdab397..74e0388cc88d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -767,7 +767,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == dev->next_event)) + if (ts->tick_stopped && (expires == ts->next_tick)) goto out; /* @@ -787,6 +787,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, trace_tick_stop(1, TICK_DEP_MASK_NONE); } + ts->next_tick = tick; + /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. @@ -802,7 +804,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, else tick_program_event(tick, 1); out: - /* Update the estimated sleep length */ + /* + * Update the estimated sleep length until the next timer + * (not only the tick). + */ ts->sleep_length = ktime_sub(dev->next_event, now); return tick; } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index bf38226e5c17..075444e3d48e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -27,6 +27,7 @@ enum tick_nohz_mode { * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -44,6 +45,7 @@ struct tick_sched { unsigned long check_clocks; enum tick_nohz_mode nohz_mode; ktime_t last_tick; + ktime_t next_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; From c8a6a09c1c617402cc9254b2bc8da359a0347d75 Mon Sep 17 00:00:00 2001 From: Augusto Mecking Caringi Date: Tue, 10 Jan 2017 10:45:00 +0000 Subject: [PATCH 239/297] vme: Fix wrong pointer utilization in ca91cx42_slave_get MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ca91cx42_slave_get function, the value pointed by vme_base pointer is set through: *vme_base = ioread32(bridge->base + CA91CX42_VSI_BS[i]); So it must be dereferenced to be used in calculation of pci_base: *pci_base = (dma_addr_t)*vme_base + pci_offset; This bug was caught thanks to the following gcc warning: drivers/vme/bridges/vme_ca91cx42.c: In function ‘ca91cx42_slave_get’: drivers/vme/bridges/vme_ca91cx42.c:467:14: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] *pci_base = (dma_addr_t)vme_base + pci_offset; Signed-off-by: Augusto Mecking Caringi Acked-By: Martyn Welch Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/vme/bridges/vme_ca91cx42.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vme/bridges/vme_ca91cx42.c b/drivers/vme/bridges/vme_ca91cx42.c index 6b5ee896af63..7cc51223db1c 100644 --- a/drivers/vme/bridges/vme_ca91cx42.c +++ b/drivers/vme/bridges/vme_ca91cx42.c @@ -464,7 +464,7 @@ static int ca91cx42_slave_get(struct vme_slave_resource *image, int *enabled, vme_bound = ioread32(bridge->base + CA91CX42_VSI_BD[i]); pci_offset = ioread32(bridge->base + CA91CX42_VSI_TO[i]); - *pci_base = (dma_addr_t)vme_base + pci_offset; + *pci_base = (dma_addr_t)*vme_base + pci_offset; *size = (unsigned long long)((vme_bound - *vme_base) + granularity); *enabled = 0; From 69d012345a1a32d3f03957f14d972efccc106a98 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 11 Jan 2017 14:02:00 +0800 Subject: [PATCH 240/297] arm64: hugetlb: fix the wrong return value for huge_ptep_set_access_flags In current code, the @changed always returns the last one's status for the huge page with the contiguous bit set. This is really not what we want. Even one of the PTEs is changed, we should tell it to the caller. This patch fixes this issue. Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Cc: # 4.5.x- Signed-off-by: Huang Shijie Signed-off-by: Catalin Marinas --- arch/arm64/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 964b7549af5c..e25584d72396 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -239,7 +239,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, ncontig = find_num_contig(vma->vm_mm, addr, cpte, *cpte, &pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { - changed = ptep_set_access_flags(vma, addr, cpte, + changed |= ptep_set_access_flags(vma, addr, cpte, pfn_pte(pfn, hugeprot), dirty); From 2d5a9c72d0c4ac73cf97f4b7814ed6c44b1e49ae Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Jan 2017 19:15:18 +0100 Subject: [PATCH 241/297] USB: serial: ch341: fix control-message error handling A short control transfer would currently fail to be detected, something which could lead to stale buffer data being used as valid input. Check for short transfers, and make sure to log any transfer errors. Note that this also avoids leaking heap data to user space (TIOCMGET) and the remote device (break control). Fixes: 6ce76104781a ("USB: Driver for CH341 USB-serial adaptor") Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 8d7b0847109b..95aa5233726c 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -113,6 +113,8 @@ static int ch341_control_out(struct usb_device *dev, u8 request, r = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), request, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, value, index, NULL, 0, DEFAULT_TIMEOUT); + if (r < 0) + dev_err(&dev->dev, "failed to send control message: %d\n", r); return r; } @@ -130,7 +132,20 @@ static int ch341_control_in(struct usb_device *dev, r = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), request, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, value, index, buf, bufsize, DEFAULT_TIMEOUT); - return r; + if (r < bufsize) { + if (r >= 0) { + dev_err(&dev->dev, + "short control message received (%d < %u)\n", + r, bufsize); + r = -EIO; + } + + dev_err(&dev->dev, "failed to receive control message: %d\n", + r); + return r; + } + + return 0; } static int ch341_set_baudrate_lcr(struct usb_device *dev, @@ -181,9 +196,9 @@ static int ch341_set_handshake(struct usb_device *dev, u8 control) static int ch341_get_status(struct usb_device *dev, struct ch341_private *priv) { + const unsigned int size = 2; char *buffer; int r; - const unsigned size = 8; unsigned long flags; buffer = kmalloc(size, GFP_KERNEL); @@ -194,14 +209,9 @@ static int ch341_get_status(struct usb_device *dev, struct ch341_private *priv) if (r < 0) goto out; - /* setup the private status if available */ - if (r == 2) { - r = 0; - spin_lock_irqsave(&priv->lock, flags); - priv->line_status = (~(*buffer)) & CH341_BITS_MODEM_STAT; - spin_unlock_irqrestore(&priv->lock, flags); - } else - r = -EPROTO; + spin_lock_irqsave(&priv->lock, flags); + priv->line_status = (~(*buffer)) & CH341_BITS_MODEM_STAT; + spin_unlock_irqrestore(&priv->lock, flags); out: kfree(buffer); return r; @@ -211,9 +221,9 @@ out: kfree(buffer); static int ch341_configure(struct usb_device *dev, struct ch341_private *priv) { + const unsigned int size = 2; char *buffer; int r; - const unsigned size = 8; buffer = kmalloc(size, GFP_KERNEL); if (!buffer) From 6d6daa20945f3f598e56e18d1f926c08754f5801 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 5 Jan 2017 10:09:25 -0500 Subject: [PATCH 242/297] perf/x86/intel/uncore: Fix hardcoded socket 0 assumption in the Haswell init code hswep_uncore_cpu_init() uses a hardcoded physical package id 0 for the boot cpu. This works as long as the boot CPU is actually on the physical package 0, which is normaly the case after power on / reboot. But it fails with a NULL pointer dereference when a kdump kernel is started on a secondary socket which has a different physical package id because the locigal package translation for physical package 0 does not exist. Use the logical package id of the boot cpu instead of hard coded 0. [ tglx: Rewrote changelog once more ] Fixes: cf6d445f6897 ("perf/x86/uncore: Track packages, not per CPU data") Signed-off-by: Prarit Bhargava Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Harish Chegondi Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1483628965-2890-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index e6832be714bc..dae2fedc1601 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = boot_cpu_data.logical_proc_id; if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; From 7cfd5fd5a9813f1430290d20c0fead9b4582a307 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Jan 2017 19:52:43 -0800 Subject: [PATCH 243/297] gro: use min_t() in skb_gro_reset_offset() On 32bit arches, (skb->end - skb->data) is not 'unsigned int', so we shall use min_t() instead of min() to avoid a compiler error. Fixes: 1272ce87fa01 ("gro: Enter slow-path if there is no tailroom") Reported-by: kernel test robot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 88d2907ca2cd..07b307b0b414 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4441,8 +4441,9 @@ static void skb_gro_reset_offset(struct sk_buff *skb) pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = min(skb_frag_size(frag0), - skb->end - skb->tail); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); } } From 73b351473547e543e9c8166dd67fd99c64c15b0b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jan 2017 13:08:06 +0100 Subject: [PATCH 244/297] cgroup: move CONFIG_SOCK_CGROUP_DATA to init/Kconfig We now 'select SOCK_CGROUP_DATA' but Kconfig complains that this is not right when CONFIG_NET is disabled and there is no socket interface: warning: (CGROUP_BPF) selects SOCK_CGROUP_DATA which has unmet direct dependencies (NET) I don't know what the correct solution for this is, but simply removing the dependency on NET from SOCK_CGROUP_DATA by moving it out of the 'if NET' section avoids the warning and does not produce other build errors. Fixes: 483c4933ea09 ("cgroup: Fix CGROUP_BPF config") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- init/Kconfig | 4 ++++ net/Kconfig | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 223b734abccd..e1a937348a3e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1176,6 +1176,10 @@ config CGROUP_DEBUG Say N. +config SOCK_CGROUP_DATA + bool + default n + endif # CGROUPS config CHECKPOINT_RESTORE diff --git a/net/Kconfig b/net/Kconfig index a1005007224c..a29bb4b41c50 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -258,10 +258,6 @@ config XPS config HWBM bool -config SOCK_CGROUP_DATA - bool - default n - config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS From 7a18c5b9fb31a999afc62b0e60978aa896fc89e9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 10 Jan 2017 14:37:35 -0800 Subject: [PATCH 245/297] net: ipv4: Fix multipath selection with vrf fib_select_path does not call fib_select_multipath if oif is set in the flow struct. For VRF use cases oif is always set, so multipath route selection is bypassed. Use the FLOWI_FLAG_SKIP_NH_OIF to skip the oif check similar to what is done in fib_table_lookup. Add saddr and proto to the flow struct for the fib lookup done by the VRF driver to better match hash computation for a flow. Fixes: 613d09b30f8b ("net: Use VRF device index for lookups on TX") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 ++ net/ipv4/fib_semantics.c | 9 +++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 23dfb0eac098..0a067708aa39 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -263,7 +263,9 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_tos = RT_TOS(ip4h->tos), .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF, + .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, + .saddr = ip4h->saddr, }; struct net *net = dev_net(vrf_dev); struct rtable *rt; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7a5b4c7d9a87..eba1546b5031 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1618,8 +1618,13 @@ void fib_select_multipath(struct fib_result *res, int hash) void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, int mp_hash) { + bool oif_check; + + oif_check = (fl4->flowi4_oif == 0 || + fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF); + #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (res->fi->fib_nhs > 1 && oif_check) { if (mp_hash < 0) mp_hash = get_hash_from_flowi4(fl4) >> 1; @@ -1629,7 +1634,7 @@ void fib_select_path(struct net *net, struct fib_result *res, #endif if (!res->prefixlen && res->table->tb_num_default > 1 && - res->type == RTN_UNICAST && !fl4->flowi4_oif) + res->type == RTN_UNICAST && oif_check) fib_select_default(fl4, res); if (!fl4->saddr) From eb004603c857f3e3bfcda437b6c68fd258c54960 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Jan 2017 22:53:06 +0000 Subject: [PATCH 246/297] sctp: Fix spelling mistake: "Atempt" -> "Attempt" Trivial fix to spelling mistake in WARN_ONCE message Signed-off-by: Colin Ian King Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e54082699520..34efaa4ef2f6 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1048,7 +1048,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) (new_transport->state == SCTP_PF))) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { - WARN_ONCE(1, "Atempt to send packet on unconfirmed path."); + WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; From a13c06525ab9ff442924e67df9393a5efa914c56 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 10 Jan 2017 23:13:45 +0000 Subject: [PATCH 247/297] net: phy: marvell: fix Marvell 88E1512 used in SGMII mode When an Marvell 88E1512 PHY is connected to a nic in SGMII mode, the fiber page is used for the SGMII host-side connection. The PHY driver notices that SUPPORTED_FIBRE is set, so it tries reading the fiber page for the link status, and ends up reading the MAC-side status instead of the outgoing (copper) link. This leads to incorrect results reported via ethtool. If the PHY is connected via SGMII to the host, ignore the fiber page. However, continue to allow the existing power management code to suspend and resume the fiber page. Fixes: 6cfb3bcc0641 ("Marvell phy: check link status in case of fiber link.") Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e269262471a4..0b78210c0fa7 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1192,7 +1192,8 @@ static int marvell_read_status(struct phy_device *phydev) int err; /* Check the fiber mode first */ - if (phydev->supported & SUPPORTED_FIBRE) { + if (phydev->supported & SUPPORTED_FIBRE && + phydev->interface != PHY_INTERFACE_MODE_SGMII) { err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M1111_FIBER); if (err < 0) goto error; From 24c63bbc18e25d5d8439422aa5fd2d66390b88eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 10 Jan 2017 15:22:25 -0800 Subject: [PATCH 248/297] net: vrf: do not allow table id 0 Frank reported that vrf devices can be created with a table id of 0. This breaks many of the run time table id checks and should not be allowed. Detect this condition at create time and fail with EINVAL. Fixes: 193125dbd8eb ("net: Introduce VRF device driver") Reported-by: Frank Kellermann Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 0a067708aa39..454f907d419a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1252,6 +1252,8 @@ static int vrf_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); + if (vrf->tb_id == RT_TABLE_UNSPEC) + return -EINVAL; dev->priv_flags |= IFF_L3MDEV_MASTER; From ad5013d5699d30ded0cdbbc68b93b2aa28222c6e Mon Sep 17 00:00:00 2001 From: Colin King Date: Wed, 11 Jan 2017 11:43:10 +0000 Subject: [PATCH 249/297] perf/x86/intel: Use ULL constant to prevent undefined shift behaviour When x86_pmu.num_counters is 32 the shift of the integer constant 1 is exceeding 32bit and therefor undefined behaviour. Fix this by shifting 1ULL instead of 1. Reported-by: CoverityScan CID#1192105 ("Bad bit shift operation") Signed-off-by: Colin Ian King Cc: Andi Kleen Cc: Peter Zijlstra Cc: Kan Liang Cc: Stephane Eranian Cc: Alexander Shishkin Link: http://lkml.kernel.org/r/20170111114310.17928-1-colin.king@canonical.com Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 86138267b68a..d611cab214a6 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3987,7 +3987,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", From d6169d04097fd9ddf811e63eae4e5cd71e6666e2 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Wed, 11 Jan 2017 17:10:34 +0200 Subject: [PATCH 250/297] xhci: fix deadlock at host remove by running watchdog correctly If a URB is killed while the host is removed we can end up in a situation where the hub thread takes the roothub device lock, and waits for the URB to be given back by xhci-hcd, blocking the host remove code. xhci-hcd tries to stop the endpoint and give back the urb, but can't as the host is removed from PCI bus at the same time, preventing the normal way of giving back urb. Instead we need to rely on the stop command timeout function to give back the urb. This xhci_stop_endpoint_command_watchdog() timeout function used a XHCI_STATE_DYING flag to indicate if the timeout function is already running, but later this flag has been taking into use in other places to mark that xhci is dying. Remove checks for XHCI_STATE_DYING in xhci_urb_dequeue. We are still checking that reading from pci state does not return 0xffffffff or that host is not halted before trying to stop the endpoint. This whole area of stopping endpoints, giving back URBs, and the wathdog timeout need rework, this fix focuses on solving a specific deadlock issue that we can then send to stable before any major rework. Cc: Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 11 ----------- drivers/usb/host/xhci.c | 13 ------------- 2 files changed, 24 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 25f522b09dd9..e32029a31ca4 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -913,17 +913,6 @@ void xhci_stop_endpoint_command_watchdog(unsigned long arg) spin_lock_irqsave(&xhci->lock, flags); ep->stop_cmds_pending--; - if (xhci->xhc_state & XHCI_STATE_REMOVING) { - spin_unlock_irqrestore(&xhci->lock, flags); - return; - } - if (xhci->xhc_state & XHCI_STATE_DYING) { - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb, - "Stop EP timer ran, but another timer marked " - "xHCI as DYING, exiting."); - spin_unlock_irqrestore(&xhci->lock, flags); - return; - } if (!(ep->stop_cmds_pending == 0 && (ep->ep_state & EP_HALT_PENDING))) { xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb, "Stop EP timer ran, but no command pending, " diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 0c8deb9ed42d..9a0ec116654a 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1534,19 +1534,6 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) xhci_urb_free_priv(urb_priv); return ret; } - if ((xhci->xhc_state & XHCI_STATE_DYING) || - (xhci->xhc_state & XHCI_STATE_HALTED)) { - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb, - "Ep 0x%x: URB %p to be canceled on " - "non-responsive xHCI host.", - urb->ep->desc.bEndpointAddress, urb); - /* Let the stop endpoint command watchdog timer (which set this - * state) finish cleaning up the endpoint TD lists. We must - * have caught it in the middle of dropping a lock and giving - * back an URB. - */ - goto done; - } ep_index = xhci_get_endpoint_index(&urb->ep->desc); ep = &xhci->devs[urb->dev->slot_id]->eps[ep_index]; From 1392370ee7de8aa3f69936f55bea6bfcc9879c59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Jan 2017 14:29:02 +0300 Subject: [PATCH 251/297] nvme-rdma: fix nvme_rdma_queue_is_ready Now that we don't abuse the cmd field in struct request for nvme command passthrough this function needs to be converted to the proper accessor as well. Fixes: d49187e97e ("nvme: introduce struct nvme_request") Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f587af345889..34e564857716 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1422,7 +1422,7 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, struct request *rq) { if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { - struct nvme_command *cmd = (struct nvme_command *)rq->cmd; + struct nvme_command *cmd = nvme_req(rq)->cmd; if (rq->cmd_type != REQ_TYPE_DRV_PRIV || cmd->common.opcode != nvme_fabrics_command || From b5a10c5f7532b7473776da87e67f8301bbc32693 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 28 Dec 2016 22:13:15 -0200 Subject: [PATCH 252/297] nvme: apply DELAY_BEFORE_CHK_RDY quirk at probe time too Commit 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter readiness") introduced a quirk to adapters that cannot read the bit NVME_CSTS_RDY right after register NVME_REG_CC is set; these adapters need a delay or else the action of reading the bit NVME_CSTS_RDY could somehow corrupt adapter's registers state and it never recovers. When this quirk was added, we checked ctrl->tagset in order to avoid quirking in probe time, supposing we would never require such delay during probe. Well, it was too optimistic; we in fact need this quirk at probe time in some cases, like after a kexec. In some experiments, after abnormal shutdown of machine (aka power cord unplug), we booted into our bootloader in Power, which is a Linux kernel, and kexec'ed into another distro. If this kexec is too quick, we end up reaching the probe of NVMe adapter in that distro when adapter is in bad state (not fully initialized on our bootloader). What happens next is that nvme_wait_ready() is unable to complete, except if the quirk is enabled. So, this patch removes the original ctrl->tagset verification in order to enable the quirk even on probe time. Fixes: 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter readiness") Reported-by: Andrew Byrne Reported-by: Jaime A. H. Gomez Reported-by: Zachary D. Myers Signed-off-by: Guilherme G. Piccoli Acked-by: Jeffrey Lien Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2fc86dc7a8df..8a3c3e32a704 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1106,12 +1106,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) if (ret) return ret; - /* Checking for ctrl->tagset is a trick to avoid sleeping on module - * load, since we only need the quirk on reset_controller. Notice - * that the HGST device needs this delay only in firmware activation - * procedure; unfortunately we have no (easy) way to verify this. - */ - if ((ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) && ctrl->tagset) + if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); return nvme_wait_ready(ctrl, cap, false); From 0a417b8dc1f10b03e8f558b8a831f07ec4c23795 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Jan 2017 10:20:04 -0800 Subject: [PATCH 253/297] xfs: Timely free truncated dirty pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 99579ccec4e2 "xfs: skip dirty pages in ->releasepage()" started to skip dirty pages in xfs_vm_releasepage() which also has the effect that if a dirty page is truncated, it does not get freed by block_invalidatepage() and is lingering in LRU list waiting for reclaim. So a simple loop like: while true; do dd if=/dev/zero of=file bs=1M count=100 rm file done will keep using more and more memory until we hit low watermarks and start pagecache reclaim which will eventually reclaim also the truncate pages. Keeping these truncated (and thus never usable) pages in memory is just a waste of memory, is unnecessarily stressing page cache reclaim, and reportedly also leads to anonymous mmap(2) returning ENOMEM prematurely. So instead of just skipping dirty pages in xfs_vm_releasepage(), return to old behavior of skipping them only if they have delalloc or unwritten buffers and fix the spurious warnings by warning only if the page is clean. CC: stable@vger.kernel.org CC: Brian Foster CC: Vlastimil Babka Reported-by: Petr Tůma Fixes: 99579ccec4e271c3d4d4e7c946058766812afdab Signed-off-by: Jan Kara Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0f56fcd3a5d5..631e7c0e0a29 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1152,19 +1152,22 @@ xfs_vm_releasepage( * block_invalidatepage() can send pages that are still marked dirty * but otherwise have invalidated buffers. * - * We've historically freed buffers on the latter. Instead, quietly - * filter out all dirty pages to avoid spurious buffer state warnings. - * This can likely be removed once shrink_active_list() is fixed. + * We want to release the latter to avoid unnecessary buildup of the + * LRU, skip the former and warn if we've left any lingering + * delalloc/unwritten buffers on clean pages. Skip pages with delalloc + * or unwritten buffers and warn if the page is not dirty. Otherwise + * try to release the buffers. */ - if (PageDirty(page)) - return 0; - xfs_count_page_state(page, &delalloc, &unwritten); - if (WARN_ON_ONCE(delalloc)) + if (delalloc) { + WARN_ON_ONCE(!PageDirty(page)); return 0; - if (WARN_ON_ONCE(unwritten)) + } + if (unwritten) { + WARN_ON_ONCE(!PageDirty(page)); return 0; + } return try_to_free_buffers(page); } From 6ed0993a0b859ce62edf2930ded683e452286d39 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 7 Jan 2017 09:27:49 +0300 Subject: [PATCH 254/297] vfio-mdev: return -EFAULT if copy_to_user() fails The copy_to_user() function returns the number of bytes which it wasn't able to copy but we want to return a negative error code. Fixes: 9d1a546c53b4 ("docs: Sample driver to demonstrate how to use Mediated device framework.") Signed-off-by: Dan Carpenter Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 1fc57a5093a7..975af5bbf28d 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1180,7 +1180,10 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, memcpy(&mdev_state->dev_info, &info, sizeof(info)); - return copy_to_user((void __user *)arg, &info, minsz); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; } case VFIO_DEVICE_GET_REGION_INFO: { @@ -1201,7 +1204,10 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (ret) return ret; - return copy_to_user((void __user *)arg, &info, minsz); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; } case VFIO_DEVICE_GET_IRQ_INFO: @@ -1224,7 +1230,10 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.count == -1) return -EINVAL; - return copy_to_user((void __user *)arg, &info, minsz); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; } case VFIO_DEVICE_SET_IRQS: { From 5c677869e0abbffbade2cfd82d46d0eebe823f34 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 7 Jan 2017 09:28:40 +0300 Subject: [PATCH 255/297] vfio-mdev: buffer overflow in ioctl() This is a sample driver for documentation so the impact is probably pretty low. But we should check that bar_index is valid so we don't write beyond the end of the mdev_state->region_info[] array. Fixes: 9d1a546c53b4 ("docs: Sample driver to demonstrate how to use Mediated device framework.") Signed-off-by: Dan Carpenter Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 975af5bbf28d..382f4797428f 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1073,7 +1073,7 @@ int mtty_get_region_info(struct mdev_device *mdev, { unsigned int size = 0; struct mdev_state *mdev_state; - int bar_index; + u32 bar_index; if (!mdev) return -EINVAL; @@ -1082,8 +1082,11 @@ int mtty_get_region_info(struct mdev_device *mdev, if (!mdev_state) return -EINVAL; - mutex_lock(&mdev_state->ops_lock); bar_index = region_info->index; + if (bar_index >= VFIO_PCI_NUM_REGIONS) + return -EINVAL; + + mutex_lock(&mdev_state->ops_lock); switch (bar_index) { case VFIO_PCI_CONFIG_REGION_INDEX: From 73da4268fdbae972f617946d1c690f2136964802 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 7 Jan 2017 09:30:08 +0300 Subject: [PATCH 256/297] vfio-mdev: remove some dead code We set info.count to 1 in mtty_get_irq_info() so static checkers complain that, "Why do we have impossible conditions?" The answer is that it seems to be left over dead code that can be safely removed. Signed-off-by: Dan Carpenter Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 382f4797428f..ca495686b9c3 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1230,9 +1230,6 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (ret) return ret; - if (info.count == -1) - return -EINVAL; - if (copy_to_user((void __user *)arg, &info, minsz)) return -EFAULT; From a89af4abdf9b353cdd6f61afc0eaaac403304873 Mon Sep 17 00:00:00 2001 From: Brendan McGrath Date: Sat, 7 Jan 2017 08:01:38 +1100 Subject: [PATCH 257/297] HID: i2c-hid: Add sleep between POWER ON and RESET Support for the Asus Touchpad was recently added. It turns out this device can fail initialisation (and become unusable) when the RESET command is sent too soon after the POWER ON command. Unfortunately the i2c-hid specification does not specify the need for a delay between these two commands. But it was discovered the Windows driver has a 1ms delay. As a result, this patch modifies the i2c-hid module to add a sleep inbetween the POWER ON and RESET commands which lasts between 1ms and 5ms. See https://github.com/vlasenko/hid-asus-dkms/issues/24 for further details. Signed-off-by: Brendan McGrath Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/i2c-hid/i2c-hid.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index 78fb32a7b103..ea3c3546cef7 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -426,6 +426,15 @@ static int i2c_hid_hwreset(struct i2c_client *client) if (ret) goto out_unlock; + /* + * The HID over I2C specification states that if a DEVICE needs time + * after the PWR_ON request, it should utilise CLOCK stretching. + * However, it has been observered that the Windows driver provides a + * 1ms sleep between the PWR_ON and RESET requests and that some devices + * rely on this. + */ + usleep_range(1000, 5000); + i2c_hid_dbg(ihid, "resetting...\n"); ret = i2c_hid_command(client, &hid_reset_cmd, NULL, 0); From 900742d89c1b4e04bd373aec8470b88e183f08ca Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Jan 2017 12:00:22 -0600 Subject: [PATCH 258/297] x86/unwind: Silence warnings for non-current tasks There are a handful of callers to save_stack_trace_tsk() and show_stack() which try to unwind the stack of a task other than current. In such cases, it's remotely possible that the task is running on one CPU while the unwinder is reading its stack from another CPU, causing the unwinder to see stack corruption. These cases seem to be mostly harmless. The unwinder has checks which prevent it from following bad pointers beyond the bounds of the stack. So it's not really a bug as long as the caller understands that unwinding another task will not always succeed. Since stack "corruption" on another task's stack isn't necessarily a bug, silence the warnings when unwinding tasks other than current. Reported-by: Dave Jones Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Miroslav Benes Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/00d8c50eea3446c1524a2a755397a3966629354c.1483978430.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 4443e499f279..195eebf6da20 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -207,6 +207,16 @@ bool unwind_next_frame(struct unwind_state *state) return true; bad_address: + /* + * When unwinding a non-current task, the task might actually be + * running on another CPU, in which case it could be modifying its + * stack while we're reading it. This is generally not a problem and + * can be ignored as long as the caller understands that unwinding + * another task will not always succeed. + */ + if (state->task != current) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", From 84936118bdf37bda513d4a361c38181a216427e0 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Jan 2017 12:00:23 -0600 Subject: [PATCH 259/297] x86/unwind: Disable KASAN checks for non-current tasks There are a handful of callers to save_stack_trace_tsk() and show_stack() which try to unwind the stack of a task other than current. In such cases, it's remotely possible that the task is running on one CPU while the unwinder is reading its stack from another CPU, causing the unwinder to see stack corruption. These cases seem to be mostly harmless. The unwinder has checks which prevent it from following bad pointers beyond the bounds of the stack. So it's not really a bug as long as the caller understands that unwinding another task will not always succeed. In such cases, it's possible that the unwinder may read a KASAN-poisoned region of the stack. Account for that by using READ_ONCE_NOCHECK() when reading the stack of another task. Use READ_ONCE() when reading the stack of the current task, since KASAN warnings can still be useful for finding bugs in that case. Reported-by: Dmitry Vyukov Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Miroslav Benes Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4c575eb288ba9f73d498dfe0acde2f58674598f1.1483978430.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 5 ++++- arch/x86/kernel/unwind_frame.c | 20 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a3269c897ec5..20ce3db20f24 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -52,13 +52,16 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { + struct inactive_task_frame *frame; + if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); - return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; + frame = (struct inactive_task_frame *)task->thread.sp; + return (unsigned long *)READ_ONCE_NOCHECK(frame->bp); } #else static inline unsigned long * diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 195eebf6da20..23d15565d02a 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -6,6 +6,21 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + static void unwind_dump(struct unwind_state *state, unsigned long *sp) { static bool dumped_before = false; @@ -48,7 +63,8 @@ unsigned long unwind_get_return_address(struct unwind_state *state) if (state->regs && user_mode(state->regs)) return 0; - addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return __kernel_text_address(addr) ? addr : 0; @@ -162,7 +178,7 @@ bool unwind_next_frame(struct unwind_state *state) if (state->regs) next_bp = (unsigned long *)state->regs->bp; else - next_bp = (unsigned long *)*state->bp; + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp); /* is the next frame pointer an encoded pointer to pt_regs? */ regs = decode_frame_pointer(next_bp); From 2c96b2fe9c57b4267c3f0a680d82d7cc52e1c447 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Jan 2017 12:00:24 -0600 Subject: [PATCH 260/297] x86/unwind: Include __schedule() in stack traces In the following commit: 0100301bfdf5 ("sched/x86: Rewrite the switch_to() code") ... the layout of the 'inactive_task_frame' struct was designed to have a frame pointer header embedded in it, so that the unwinder could use the 'bp' and 'ret_addr' fields to report __schedule() on the stack (or ret_from_fork() for newly forked tasks which haven't actually run yet). Finish the job by changing get_frame_pointer() to return a pointer to inactive_task_frame's 'bp' field rather than 'bp' itself. This allows the unwinder to start one frame higher on the stack, so that it properly reports __schedule(). Reported-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/598e9f7505ed0aba86e8b9590aa528c6c7ae8dcd.1483978430.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 5 +---- arch/x86/include/asm/switch_to.h | 10 +++++++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 20ce3db20f24..2e41c50ddf47 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -52,16 +52,13 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { - struct inactive_task_frame *frame; - if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); - frame = (struct inactive_task_frame *)task->thread.sp; - return (unsigned long *)READ_ONCE_NOCHECK(frame->bp); + return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 5cb436acd463..fcc5cd387fd1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -36,7 +36,10 @@ static inline void prepare_switch_to(struct task_struct *prev, asmlinkage void ret_from_fork(void); -/* data that is pointed to by thread.sp */ +/* + * This is the structure pointed to by thread.sp for an inactive task. The + * order of the fields must match the code in __switch_to_asm(). + */ struct inactive_task_frame { #ifdef CONFIG_X86_64 unsigned long r15; @@ -48,6 +51,11 @@ struct inactive_task_frame { unsigned long di; #endif unsigned long bx; + + /* + * These two fields must be together. They form a stack frame header, + * needed by get_frame_pointer(). + */ unsigned long bp; unsigned long ret_addr; }; From ff3f7e2475bbf9201e95824e72698fcdc5c3d47a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Jan 2017 12:00:25 -0600 Subject: [PATCH 261/297] x86/entry: Fix the end of the stack for newly forked tasks When unwinding a task, the end of the stack is always at the same offset right below the saved pt_regs, regardless of which syscall was used to enter the kernel. That convention allows the unwinder to verify that a stack is sane. However, newly forked tasks don't always follow that convention, as reported by the following unwinder warning seen by Dave Jones: WARNING: kernel stack frame pointer at ffffc90001443f30 in kworker/u8:8:30468 has bad value (null) The warning was due to the following call chain: (ftrace handler) call_usermodehelper_exec_async+0x5/0x140 ret_from_fork+0x22/0x30 The problem is that ret_from_fork() doesn't create a stack frame before calling other functions. Fix that by carefully using the frame pointer macros. In addition to conforming to the end of stack convention, this also makes related stack traces more sensible by making it clear to the user that ret_from_fork() was involved. Reported-by: Dave Jones Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Miroslav Benes Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8854cdaab980e9700a81e9ebf0d4238e4bbb68ef.1483978430.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 30 +++++++++++------------------- arch/x86/entry/entry_64.S | 11 +++++++---- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 701d29f8e4d3..57f7ec35216e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -254,23 +254,6 @@ ENTRY(__switch_to_asm) jmp __switch_to END(__switch_to_asm) -/* - * The unwinder expects the last frame on the stack to always be at the same - * offset from the end of the page, which allows it to validate the stack. - * Calling schedule_tail() directly would break that convention because its an - * asmlinkage function so its argument has to be pushed on the stack. This - * wrapper creates a proper "end of stack" frame header before the call. - */ -ENTRY(schedule_tail_wrapper) - FRAME_BEGIN - - pushl %eax - call schedule_tail - popl %eax - - FRAME_END - ret -ENDPROC(schedule_tail_wrapper) /* * A newly forked process directly context switches into this address. * @@ -279,15 +262,24 @@ ENDPROC(schedule_tail_wrapper) * edi: kernel thread arg */ ENTRY(ret_from_fork) - call schedule_tail_wrapper + FRAME_BEGIN /* help unwinder find end of stack */ + + /* + * schedule_tail() is asmlinkage so we have to put its 'prev' argument + * on the stack. + */ + pushl %eax + call schedule_tail + popl %eax testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ 2: /* When we fork, we trace the syscall return in the child, too. */ - movl %esp, %eax + leal FRAME_OFFSET(%esp), %eax call syscall_return_slowpath + FRAME_END jmp restore_all /* kernel thread */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5b219707c2f2..044d18ebc43c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include #include #include +#include #include .code64 @@ -408,17 +409,19 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + FRAME_BEGIN /* help unwinder find end of stack */ movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ + call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ 2: - movq %rsp, %rdi + leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */ call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS + FRAME_END jmp restore_regs_and_iret 1: From b6416e61012429e0277bd15a229222fd17afc1c1 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 16 Dec 2016 14:30:35 -0800 Subject: [PATCH 262/297] jump_labels: API for flushing deferred jump label updates Modules that use static_key_deferred need a way to synchronize with any delayed work that is still pending when the module is unloaded. Introduce static_key_deferred_flush() which flushes any pending jump label updates. Signed-off-by: David Matlack Cc: stable@vger.kernel.org Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- include/linux/jump_label_ratelimit.h | 5 +++++ kernel/jump_label.c | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h index 089f70f83e97..23da3af459fe 100644 --- a/include/linux/jump_label_ratelimit.h +++ b/include/linux/jump_label_ratelimit.h @@ -14,6 +14,7 @@ struct static_key_deferred { #ifdef HAVE_JUMP_LABEL extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern void static_key_deferred_flush(struct static_key_deferred *key); extern void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); @@ -26,6 +27,10 @@ static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) STATIC_KEY_CHECK_USE(); static_key_slow_dec(&key->key); } +static inline void static_key_deferred_flush(struct static_key_deferred *key) +{ + STATIC_KEY_CHECK_USE(); +} static inline void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 93ad6c1fb9b6..a9b8cf500591 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -182,6 +182,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +void static_key_deferred_flush(struct static_key_deferred *key) +{ + STATIC_KEY_CHECK_USE(); + flush_delayed_work(&key->work); +} +EXPORT_SYMBOL_GPL(static_key_deferred_flush); + void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { From cef84c302fe051744b983a92764d3fcca933415d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 16 Dec 2016 14:30:36 -0800 Subject: [PATCH 263/297] KVM: x86: flush pending lapic jump label updates on module unload KVM's lapic emulation uses static_key_deferred (apic_{hw,sw}_disabled). These are implemented with delayed_work structs which can still be pending when the KVM module is unloaded. We've seen this cause kernel panics when the kvm_intel module is quickly reloaded. Use the new static_key_deferred_flush() API to flush pending updates on module unload. Signed-off-by: David Matlack Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 6 ++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 8 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5fe290c1b7d8..2f6ef5121a4c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2426,3 +2426,9 @@ void kvm_lapic_init(void) jump_label_rate_limit(&apic_hw_disabled, HZ); jump_label_rate_limit(&apic_sw_disabled, HZ); } + +void kvm_lapic_exit(void) +{ + static_key_deferred_flush(&apic_hw_disabled); + static_key_deferred_flush(&apic_sw_disabled); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e0c80233b3e1..ff8039d61672 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -110,6 +110,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); +void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f22810a7e0c..a0ac6e0060fb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6045,6 +6045,7 @@ out: void kvm_arch_exit(void) { + kvm_lapic_exit(); perf_unregister_guest_info_callbacks(&kvm_guest_cbs); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) From 129a72a0d3c8e139a04512325384fe5ac119e74d Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 11 Jan 2017 18:28:29 -0800 Subject: [PATCH 264/297] KVM: x86: Introduce segmented_write_std Introduces segemented_write_std. Switches from emulated reads/writes to standard read/writes in fxsave, fxrstor, sgdt, and sidt. This fixes CVE-2017-2584, a longstanding kernel memory leak. Since commit 283c95d0e389 ("KVM: x86: emulate FXSAVE and FXRSTOR", 2016-11-09), which is luckily not yet in any final release, this would also be an exploitable kernel memory *write*! Reported-by: Dmitry Vyukov Cc: stable@vger.kernel.org Fixes: 96051572c819194c37a8367624b285be10297eca Fixes: 283c95d0e3891b64087706b344a4b545d04a6e62 Suggested-by: Paolo Bonzini Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 56628a44668b..f36d0fa6b885 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -818,6 +818,20 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } +static int segmented_write_std(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned int size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception); +} + /* * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. @@ -3685,8 +3699,8 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, } /* Disable writeback. */ ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, ctxt->dst.addr.mem, - &desc_ptr, 2 + ctxt->op_bytes); + return segmented_write_std(ctxt, ctxt->dst.addr.mem, + &desc_ptr, 2 + ctxt->op_bytes); } static int em_sgdt(struct x86_emulate_ctxt *ctxt) @@ -3932,7 +3946,7 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) else size = offsetof(struct fxregs_state, xmm_space[0]); - return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); } static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, @@ -3974,7 +3988,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); if (rc != X86EMUL_CONTINUE) return rc; From 4f3dbdf47e150016aacd734e663347fcaa768303 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 5 Jan 2017 17:39:42 -0800 Subject: [PATCH 265/297] KVM: eventfd: fix NULL deref irqbypass consumer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported syzkaller: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] PGD 0 Oops: 0002 [#1] SMP CPU: 1 PID: 125 Comm: kworker/1:1 Not tainted 4.9.0+ #1 Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm] task: ffff9bbe0dfbb900 task.stack: ffffb61802014000 RIP: 0010:irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] Call Trace: irqfd_shutdown+0x66/0xa0 [kvm] process_one_work+0x16b/0x480 worker_thread+0x4b/0x500 kthread+0x101/0x140 ? process_one_work+0x480/0x480 ? kthread_create_on_node+0x60/0x60 ret_from_fork+0x25/0x30 RIP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] RSP: ffffb61802017e20 CR2: 0000000000000008 The syzkaller folks reported a NULL pointer dereference that due to unregister an consumer which fails registration before. The syzkaller creates two VMs w/ an equal eventfd occasionally. So the second VM fails to register an irqbypass consumer. It will make irqfd as inactive and queue an workqueue work to shutdown irqfd and unregister the irqbypass consumer when eventfd is closed. However, the second consumer has been initialized though it fails registration. So the token(same as the first VM's) is taken to unregister the consumer through the workqueue, the consumer of the first VM is found and unregistered, then NULL deref incurred in the path of deleting consumer from the consumers list. This patch fixes it by making irq_bypass_register/unregister_consumer() looks for the consumer entry based on consumer pointer itself instead of token matching. Reported-by: Dmitry Vyukov Suggested-by: Alex Williamson Cc: stable@vger.kernel.org Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Dmitry Vyukov Cc: Alex Williamson Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- virt/lib/irqbypass.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 52abac4bb6a2..6d2fcd6fcb25 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -195,7 +195,7 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { - if (tmp->token == consumer->token) { + if (tmp->token == consumer->token || tmp == consumer) { mutex_unlock(&lock); module_put(THIS_MODULE); return -EBUSY; @@ -245,7 +245,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { - if (tmp->token != consumer->token) + if (tmp != consumer) continue; list_for_each_entry(producer, &producers, node) { From 546d87e5c903a7f3ee7b9f998949a94729fbc65b Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 3 Jan 2017 18:56:19 -0800 Subject: [PATCH 266/297] KVM: x86: fix NULL deref in vcpu_scan_ioapic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: BUG: unable to handle kernel NULL pointer dereference at 00000000000001b0 IP: _raw_spin_lock+0xc/0x30 PGD 3e28eb067 PUD 3f0ac6067 PMD 0 Oops: 0002 [#1] SMP CPU: 0 PID: 2431 Comm: test Tainted: G OE 4.10.0-rc1+ #3 Call Trace: ? kvm_ioapic_scan_entry+0x3e/0x110 [kvm] kvm_arch_vcpu_ioctl_run+0x10a8/0x15f0 [kvm] ? pick_next_task_fair+0xe1/0x4e0 ? kvm_arch_vcpu_load+0xea/0x260 [kvm] kvm_vcpu_ioctl+0x33a/0x600 [kvm] ? hrtimer_try_to_cancel+0x29/0x130 ? do_nanosleep+0x97/0xf0 do_vfs_ioctl+0xa1/0x5d0 ? __hrtimer_init+0x90/0x90 ? do_nanosleep+0x5b/0xf0 SyS_ioctl+0x79/0x90 do_syscall_64+0x6e/0x180 entry_SYSCALL64_slow_path+0x25/0x25 RIP: _raw_spin_lock+0xc/0x30 RSP: ffffa43688973cc0 The syzkaller folks reported a NULL pointer dereference due to ENABLE_CAP succeeding even without an irqchip. The Hyper-V synthetic interrupt controller is activated, resulting in a wrong request to rescan the ioapic and a NULL pointer dereference. #include #include #include #include #include #include #include #include #include #include #ifndef KVM_CAP_HYPERV_SYNIC #define KVM_CAP_HYPERV_SYNIC 123 #endif void* thr(void* arg) { struct kvm_enable_cap cap; cap.flags = 0; cap.cap = KVM_CAP_HYPERV_SYNIC; ioctl((long)arg, KVM_ENABLE_CAP, &cap); return 0; } int main() { void *host_mem = mmap(0, 0x1000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); int kvmfd = open("/dev/kvm", 0); int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); struct kvm_userspace_memory_region memreg; memreg.slot = 0; memreg.flags = 0; memreg.guest_phys_addr = 0; memreg.memory_size = 0x1000; memreg.userspace_addr = (unsigned long)host_mem; host_mem[0] = 0xf4; ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg); int cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); struct kvm_sregs sregs; ioctl(cpufd, KVM_GET_SREGS, &sregs); sregs.cr0 = 0; sregs.cr4 = 0; sregs.efer = 0; sregs.cs.selector = 0; sregs.cs.base = 0; ioctl(cpufd, KVM_SET_SREGS, &sregs); struct kvm_regs regs = { .rflags = 2 }; ioctl(cpufd, KVM_SET_REGS, ®s); ioctl(vmfd, KVM_CREATE_IRQCHIP, 0); pthread_t th; pthread_create(&th, 0, thr, (void*)(long)cpufd); usleep(rand() % 10000); ioctl(cpufd, KVM_RUN, 0); pthread_join(th, 0); return 0; } This patch fixes it by failing ENABLE_CAP if without an irqchip. Reported-by: Dmitry Vyukov Fixes: 5c919412fe61 (kvm/x86: Hyper-V synthetic interrupt controller) Cc: stable@vger.kernel.org # 4.5+ Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Dmitry Vyukov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a0ac6e0060fb..57d8a856cdc5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3342,6 +3342,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, switch (cap->cap) { case KVM_CAP_HYPERV_SYNIC: + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; return kvm_hv_activate_synic(vcpu); default: return -EINVAL; From 19c816e8e455f58da9997e4c6626f06203d8fbf0 Mon Sep 17 00:00:00 2001 From: Jike Song Date: Thu, 12 Jan 2017 16:52:02 +0800 Subject: [PATCH 267/297] capability: export has_capability has_capability() is sometimes needed by modules to test capability for specified task other than current, so export it. Cc: Kirti Wankhede Signed-off-by: Jike Song Acked-by: Serge Hallyn Acked-by: James Morris Signed-off-by: Alex Williamson --- kernel/capability.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/capability.c b/kernel/capability.c index a98e814f216f..f97fe77ceb88 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -318,6 +318,7 @@ bool has_capability(struct task_struct *t, int cap) { return has_ns_capability(t, &init_user_ns, cap); } +EXPORT_SYMBOL(has_capability); /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) From 33ab91103b3415e12457e3104f0e4517ce12d0f3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Jan 2017 15:02:32 +0100 Subject: [PATCH 268/297] KVM: x86: fix emulation of "MOV SS, null selector" This is CVE-2017-2583. On Intel this causes a failed vmentry because SS's type is neither 3 nor 7 (even though the manual says this check is only done for usable SS, and the dmesg splat says that SS is unusable!). On AMD it's worse: svm.c is confused and sets CPL to 0 in the vmcb. The fix fabricates a data segment descriptor when SS is set to a null selector, so that CPL and SS.DPL are set correctly in the VMCS/vmcb. Furthermore, only allow setting SS to a NULL selector if SS.RPL < 3; this in turn ensures CPL < 3 because RPL must be equal to CPL. Thanks to Andy Lutomirski and Willy Tarreau for help in analyzing the bug and deciphering the manuals. Reported-by: Xiaohan Zhang Fixes: 79d5b4c3cd809c770d4bf9812635647016c56011 Cc: stable@nongnu.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 48 +++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f36d0fa6b885..cedbba0f3402 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1585,7 +1585,6 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, &ctxt->exception); } -/* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, @@ -1622,20 +1621,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; - /* NULL selector is not valid for TR, CS and SS (except for long mode) */ - if ((seg == VCPU_SREG_CS - || (seg == VCPU_SREG_SS - && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) - || seg == VCPU_SREG_TR) - && null_selector) - goto exception; - /* TR should be in GDT only */ if (seg == VCPU_SREG_TR && (selector & (1 << 2))) goto exception; - if (null_selector) /* for NULL selector skip all following checks */ + /* NULL selector is not valid for TR, CS and (except for long mode) SS */ + if (null_selector) { + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) + goto exception; + + if (seg == VCPU_SREG_SS) { + if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) + goto exception; + + /* + * ctxt->ops->set_segment expects the CPL to be in + * SS.DPL, so fake an expand-up 32-bit data segment. + */ + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + seg_desc.dpl = cpl; + seg_desc.d = 1; + seg_desc.g = 1; + } + + /* Skip all following checks */ goto load; + } ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) @@ -1751,6 +1764,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); + + /* + * None of MOV, POP and LSS can load a NULL selector in CPL=3, but + * they can load it at CPL<3 (Intel's manual says only LSS can, + * but it's wrong). + * + * However, the Intel manual says that putting IST=1/DPL=3 in + * an interrupt gate will result in SS=3 (the AMD manual instead + * says it doesn't), so allow SS=3 in __load_segment_descriptor + * and only forbid it here. + */ + if (seg == VCPU_SREG_SS && selector == 3 && + ctxt->mode == X86EMUL_MODE_PROT64) + return emulate_exception(ctxt, GP_VECTOR, 0, true); + return __load_segment_descriptor(ctxt, selector, seg, cpl, X86_TRANSFER_NONE, NULL); } From f99e86485cc32cd16e5cc97f9bb0474f28608d84 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 12 Jan 2017 07:58:32 -0700 Subject: [PATCH 269/297] block: Rename blk_queue_zone_size and bdev_zone_size All block device data fields and functions returning a number of 512B sectors are by convention named xxx_sectors while names in the form xxx_size are generally used for a number of bytes. The blk_queue_zone_size and bdev_zone_size functions were not following this convention so rename them. No functional change is introduced by this patch. Signed-off-by: Damien Le Moal Collapsed the two patches, they were nonsensically split and broke bisection. Signed-off-by: Jens Axboe --- block/blk-zoned.c | 4 ++-- block/partition-generic.c | 14 +++++++------- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 6 +++--- include/linux/blkdev.h | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 472211fa183a..3bd15d8095b1 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -16,7 +16,7 @@ static inline sector_t blk_zone_start(struct request_queue *q, sector_t sector) { - sector_t zone_mask = blk_queue_zone_size(q) - 1; + sector_t zone_mask = blk_queue_zone_sectors(q) - 1; return sector & ~zone_mask; } @@ -222,7 +222,7 @@ int blkdev_reset_zones(struct block_device *bdev, return -EINVAL; /* Check alignment (handle eventual smaller last zone) */ - zone_sectors = blk_queue_zone_size(q); + zone_sectors = blk_queue_zone_sectors(q); if (sector & (zone_sectors - 1)) return -EINVAL; diff --git a/block/partition-generic.c b/block/partition-generic.c index d7beb6bbbf66..7afb9907821f 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -434,7 +434,7 @@ static bool part_zone_aligned(struct gendisk *disk, struct block_device *bdev, sector_t from, sector_t size) { - unsigned int zone_size = bdev_zone_size(bdev); + unsigned int zone_sectors = bdev_zone_sectors(bdev); /* * If this function is called, then the disk is a zoned block device @@ -446,7 +446,7 @@ static bool part_zone_aligned(struct gendisk *disk, * regular block devices (no zone operation) and their zone size will * be reported as 0. Allow this case. */ - if (!zone_size) + if (!zone_sectors) return true; /* @@ -455,24 +455,24 @@ static bool part_zone_aligned(struct gendisk *disk, * use it. Check the zone size too: it should be a power of 2 number * of sectors. */ - if (WARN_ON_ONCE(!is_power_of_2(zone_size))) { + if (WARN_ON_ONCE(!is_power_of_2(zone_sectors))) { u32 rem; - div_u64_rem(from, zone_size, &rem); + div_u64_rem(from, zone_sectors, &rem); if (rem) return false; if ((from + size) < get_capacity(disk)) { - div_u64_rem(size, zone_size, &rem); + div_u64_rem(size, zone_sectors, &rem); if (rem) return false; } } else { - if (from & (zone_size - 1)) + if (from & (zone_sectors - 1)) return false; if ((from + size) < get_capacity(disk) && - (size & (zone_size - 1))) + (size & (zone_sectors - 1))) return false; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0738f48293cc..0d8802453758 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -713,8 +713,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } sector = SECTOR_FROM_BLOCK(blkstart); - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path: "", diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 702638e21c76..46fd30d8af77 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1553,16 +1553,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_size(bdev) - 1)) + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83695641bd5e..ff3d774f2751 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -739,7 +739,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } } -static inline unsigned int blk_queue_zone_size(struct request_queue *q) +static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) { return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } @@ -1536,12 +1536,12 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } -static inline unsigned int bdev_zone_size(struct block_device *bdev) +static inline unsigned int bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) - return blk_queue_zone_size(q); + return blk_queue_zone_sectors(q); return 0; } From 41c066f2c4d436c535616fe182331766c57838f0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 14:54:53 +0000 Subject: [PATCH 270/297] arm64: assembler: make adr_l work in modules under KASLR When CONFIG_RANDOMIZE_MODULE_REGION_FULL=y, the offset between loaded modules and the core kernel may exceed 4 GB, putting symbols exported by the core kernel out of the reach of the ordinary adrp/add instruction pairs used to generate relative symbol references. So make the adr_l macro emit a movz/movk sequence instead when executing in module context. While at it, remove the pointless special case for the stack pointer. Acked-by: Mark Rutland Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 36 ++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 446f6c46d4b1..3a4301163e04 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -164,22 +164,25 @@ lr .req x30 // link register /* * Pseudo-ops for PC-relative adr/ldr/str , where - * is within the range +/- 4 GB of the PC. + * is within the range +/- 4 GB of the PC when running + * in core kernel context. In module context, a movz/movk sequence + * is used, since modules may be loaded far away from the kernel + * when KASLR is in effect. */ /* * @dst: destination register (64 bit wide) * @sym: name of the symbol - * @tmp: optional scratch register to be used if == sp, which - * is not allowed in an adrp instruction */ - .macro adr_l, dst, sym, tmp= - .ifb \tmp + .macro adr_l, dst, sym +#ifndef MODULE adrp \dst, \sym add \dst, \dst, :lo12:\sym - .else - adrp \tmp, \sym - add \dst, \tmp, :lo12:\sym - .endif +#else + movz \dst, #:abs_g3:\sym + movk \dst, #:abs_g2_nc:\sym + movk \dst, #:abs_g1_nc:\sym + movk \dst, #:abs_g0_nc:\sym +#endif .endm /* @@ -190,6 +193,7 @@ lr .req x30 // link register * the address */ .macro ldr_l, dst, sym, tmp= +#ifndef MODULE .ifb \tmp adrp \dst, \sym ldr \dst, [\dst, :lo12:\sym] @@ -197,6 +201,15 @@ lr .req x30 // link register adrp \tmp, \sym ldr \dst, [\tmp, :lo12:\sym] .endif +#else + .ifb \tmp + adr_l \dst, \sym + ldr \dst, [\dst] + .else + adr_l \tmp, \sym + ldr \dst, [\tmp] + .endif +#endif .endm /* @@ -206,8 +219,13 @@ lr .req x30 // link register * while needs to be preserved. */ .macro str_l, src, sym, tmp +#ifndef MODULE adrp \tmp, \sym str \src, [\tmp, :lo12:\sym] +#else + adr_l \tmp, \sym + str \src, [\tmp] +#endif .endm /* From cc8e8342930129aa2c9b629e1653e4681f0896ea Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 Jan 2017 16:21:58 +0800 Subject: [PATCH 271/297] ceph: fix mds cluster availability check We should apply the check after getting the initial mdsmap. Fixes: e9e427f0a14f ("ceph: check availability of mds cluster on mount") Link: http://tracker.ceph.com/issues/18161 Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4f49253387a0..ec6b35e9f966 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2106,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc, dout("do_request mdsmap err %d\n", err); goto finish; } + if (mdsc->mdsmap->m_epoch == 0) { + dout("do_request no mdsmap, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto finish; + } if (!(mdsc->fsc->mount_options->flags & CEPH_MOUNT_OPT_MOUNTWAIT) && !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { From 84fcc2d2bd6cbf621e49e1d0f7eaef2e3c666b40 Mon Sep 17 00:00:00 2001 From: "Geng, Jichao" Date: Thu, 5 Jan 2017 16:50:39 +0800 Subject: [PATCH 272/297] ceph: fix get_oldest_context() For no snapshot case, we should use ci->truncate_{seq,size}. Fixes: 5f743e456606 ("ceph: record truncate size/seq for snap data writeback") Signed-off-by: Geng, Jichao Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9cd0c0ea7cdb..e4b066cd912a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -502,9 +502,9 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); if (truncate_size) - *truncate_size = capsnap->truncate_size; + *truncate_size = ci->i_truncate_size; if (truncate_seq) - *truncate_seq = capsnap->truncate_seq; + *truncate_seq = ci->i_truncate_seq; } spin_unlock(&ci->i_ceph_lock); return snapc; From 30f939feaeee23e21391cfc7b484f012eb189c3c Mon Sep 17 00:00:00 2001 From: Vlad Tsyrklevich Date: Mon, 9 Jan 2017 22:53:36 +0700 Subject: [PATCH 273/297] i2c: fix kernel memory disclosure in dev interface i2c_smbus_xfer() does not always fill an entire block, allowing kernel stack memory disclosure through the temp variable. Clear it before it's read to. Signed-off-by: Vlad Tsyrklevich Signed-off-by: Wolfram Sang Cc: stable@kernel.org --- drivers/i2c/i2c-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c index 66f323fd3982..6f638bbc922d 100644 --- a/drivers/i2c/i2c-dev.c +++ b/drivers/i2c/i2c-dev.c @@ -331,7 +331,7 @@ static noinline int i2cdev_ioctl_smbus(struct i2c_client *client, unsigned long arg) { struct i2c_smbus_ioctl_data data_arg; - union i2c_smbus_data temp; + union i2c_smbus_data temp = {}; int datasize, res; if (copy_from_user(&data_arg, From 331c34255293cd02d395b7097008b509ba89e60e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 4 Jan 2017 20:57:22 -0800 Subject: [PATCH 274/297] i2c: do not enable fall back to Host Notify by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Falling back unconditionally to HostNotify as primary client's interrupt breaks some drivers which alter their functionality depending on whether interrupt is present or not, so let's introduce a board flag telling I2C core explicitly if we want wired interrupt or HostNotify-based one: I2C_CLIENT_HOST_NOTIFY. For DT-based systems we introduce "host-notify" property that we convert to I2C_CLIENT_HOST_NOTIFY board flag. Tested-by: Benjamin Tissoires Signed-off-by: Dmitry Torokhov Acked-by: Pali Rohár Acked-by: Rob Herring Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/i2c.txt | 8 ++++++++ drivers/i2c/i2c-core.c | 17 ++++++++--------- include/linux/i2c.h | 1 + 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/Documentation/devicetree/bindings/i2c/i2c.txt b/Documentation/devicetree/bindings/i2c/i2c.txt index 5fa691e6f638..cee9d5055fa2 100644 --- a/Documentation/devicetree/bindings/i2c/i2c.txt +++ b/Documentation/devicetree/bindings/i2c/i2c.txt @@ -62,6 +62,9 @@ wants to support one of the below features, it should adapt the bindings below. "irq" and "wakeup" names are recognized by I2C core, other names are left to individual drivers. +- host-notify + device uses SMBus host notify protocol instead of interrupt line. + - multi-master states that there is another master active on this bus. The OS can use this information to adapt power management to keep the arbitration awake @@ -81,6 +84,11 @@ Binding may contain optional "interrupts" property, describing interrupts used by the device. I2C core will assign "irq" interrupt (or the very first interrupt if not using interrupt names) as primary interrupt for the slave. +Alternatively, devices supporting SMbus Host Notify, and connected to +adapters that support this feature, may use "host-notify" property. I2C +core will create a virtual interrupt for Host Notify and assign it as +primary interrupt for the slave. + Also, if device is marked as a wakeup source, I2C core will set up "wakeup" interrupt for the device. If "wakeup" interrupt name is not present in the binding, then primary interrupt will be used as wakeup interrupt. diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index cf9e396d7702..7b117240f1ea 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -931,7 +931,10 @@ static int i2c_device_probe(struct device *dev) if (!client->irq) { int irq = -ENOENT; - if (dev->of_node) { + if (client->flags & I2C_CLIENT_HOST_NOTIFY) { + dev_dbg(dev, "Using Host Notify IRQ\n"); + irq = i2c_smbus_host_notify_to_irq(client); + } else if (dev->of_node) { irq = of_irq_get_byname(dev->of_node, "irq"); if (irq == -EINVAL || irq == -ENODATA) irq = of_irq_get(dev->of_node, 0); @@ -940,14 +943,7 @@ static int i2c_device_probe(struct device *dev) } if (irq == -EPROBE_DEFER) return irq; - /* - * ACPI and OF did not find any useful IRQ, try to see - * if Host Notify can be used. - */ - if (irq < 0) { - dev_dbg(dev, "Using Host Notify IRQ\n"); - irq = i2c_smbus_host_notify_to_irq(client); - } + if (irq < 0) irq = 0; @@ -1716,6 +1712,9 @@ static struct i2c_client *of_i2c_register_device(struct i2c_adapter *adap, info.of_node = of_node_get(node); info.archdata = &dev_ad; + if (of_property_read_bool(node, "host-notify")) + info.flags |= I2C_CLIENT_HOST_NOTIFY; + if (of_get_property(node, "wakeup-source", NULL)) info.flags |= I2C_CLIENT_WAKE; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index b2109c522dec..4b45ec46161f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -665,6 +665,7 @@ i2c_unlock_adapter(struct i2c_adapter *adapter) #define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ /* Must equal I2C_M_TEN below */ #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ +#define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ From 6f724fb3039522486fce2e32e4c0fbe238a6ab02 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 6 Jan 2017 19:02:57 +0800 Subject: [PATCH 275/297] i2c: print correct device invalid address In of_i2c_register_device(), when the check for device address validity fails we print the info.addr, which has not been assigned properly. Fix this by printing the actual invalid address. Signed-off-by: John Garry Reviewed-by: Vladimir Zapolskiy Signed-off-by: Wolfram Sang Fixes: b4e2f6ac1281 ("i2c: apply DT flags when probing") Cc: stable@kernel.org --- drivers/i2c/i2c-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 7b117240f1ea..c26296c2eac5 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -1704,7 +1704,7 @@ static struct i2c_client *of_i2c_register_device(struct i2c_adapter *adap, if (i2c_check_addr_validity(addr, info.flags)) { dev_err(&adap->dev, "of_i2c: invalid addr=%x on %s\n", - info.addr, node->full_name); + addr, node->full_name); return ERR_PTR(-EINVAL); } From 2659161dd40dbb599a19f320164373093df44a89 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Dec 2016 22:27:33 +0000 Subject: [PATCH 276/297] i2c: fix spelling mistake: "insufficent" -> "insufficient" Trivial fix to spelling mistake in WARN message, insufficient has an insufficient number of i's in the spelling. Signed-off-by: Colin Ian King Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index c26296c2eac5..583e95042a21 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -3632,7 +3632,7 @@ int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb) int ret; if (!client || !slave_cb) { - WARN(1, "insufficent data\n"); + WARN(1, "insufficient data\n"); return -EINVAL; } From 701dc207bf551d9fe6defa36e84a911e880398c3 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Wed, 11 Jan 2017 10:11:44 +0100 Subject: [PATCH 277/297] i2c: piix4: Avoid race conditions with IMC On AMD's SB800 and upwards, the SMBus is shared with the Integrated Micro Controller (IMC). The platform provides a hardware semaphore to avoid race conditions among them. (Check page 288 of the SB800-Series Southbridges Register Reference Guide http://support.amd.com/TechDocs/45482.pdf) Without this patch, many access to the SMBus end with an invalid transaction or even with the bus stalled. Reported-by: Alexandre Desnoyers Signed-off-by: Ricardo Ribalda Delgado Reviewed-by: Andy Shevchenko : Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-piix4.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index c2268cdf38e8..e34d82e79b98 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -585,10 +585,29 @@ static s32 piix4_access_sb800(struct i2c_adapter *adap, u16 addr, u8 command, int size, union i2c_smbus_data *data) { struct i2c_piix4_adapdata *adapdata = i2c_get_adapdata(adap); + unsigned short piix4_smba = adapdata->smba; + int retries = MAX_TIMEOUT; + int smbslvcnt; u8 smba_en_lo; u8 port; int retval; + /* Request the SMBUS semaphore, avoid conflicts with the IMC */ + smbslvcnt = inb_p(SMBSLVCNT); + do { + outb_p(smbslvcnt | 0x10, SMBSLVCNT); + + /* Check the semaphore status */ + smbslvcnt = inb_p(SMBSLVCNT); + if (smbslvcnt & 0x10) + break; + + usleep_range(1000, 2000); + } while (--retries); + /* SMBus is still owned by the IMC, we give up */ + if (!retries) + return -EBUSY; + mutex_lock(&piix4_mutex_sb800); outb_p(piix4_port_sel_sb800, SB800_PIIX4_SMB_IDX); @@ -606,6 +625,9 @@ static s32 piix4_access_sb800(struct i2c_adapter *adap, u16 addr, mutex_unlock(&piix4_mutex_sb800); + /* Release the semaphore */ + outb_p(smbslvcnt | 0x20, SMBSLVCNT); + return retval; } From d1b333d12cde9cabe898160b6be9769d3382d81c Mon Sep 17 00:00:00 2001 From: Jike Song Date: Thu, 12 Jan 2017 16:52:03 +0800 Subject: [PATCH 278/297] vfio iommu type1: fix the testing of capability for remote task Before the mdev enhancement type1 iommu used capable() to test the capability of current task; in the course of mdev development a new requirement, testing for another task other than current, was raised. ns_capable() was used for this purpose, however it still tests current, the only difference is, in a specified namespace. Fix it by using has_capability() instead, which tests the cap for specified task in init_user_ns, the same namespace as capable(). Cc: Gerd Hoffmann Signed-off-by: Jike Song Reviewed-by: James Morris Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9266271a787a..77373e51b283 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -495,8 +495,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, unsigned long *pfn_base, bool do_accounting) { unsigned long limit; - bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, - CAP_IPC_LOCK); + bool lock_cap = has_capability(dma->task, CAP_IPC_LOCK); struct mm_struct *mm; int ret; bool rsvd; From 3139dc8ded6f27552a248d23fe9f086e3027fa12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Wed, 11 Jan 2017 15:39:31 +0100 Subject: [PATCH 279/297] dmaengine: rcar-dmac: unmap slave resource when channel is freed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The slave mapping should be removed together with other channel resources when the channel is freed. If it's not unmapped it will hang around forever after the channel is freed. Fixes: 9f878603dbdb7db3 ("dmaengine: rcar-dmac: add iommu support for slave transfers") Reported-by: Laurent Pinchart Signed-off-by: Niklas Söderlund Reviewed-by: Laurent Pinchart Signed-off-by: Vinod Koul --- drivers/dma/sh/rcar-dmac.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index 2e441d0ccd79..4c357d475465 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -986,6 +986,7 @@ static void rcar_dmac_free_chan_resources(struct dma_chan *chan) { struct rcar_dmac_chan *rchan = to_rcar_dmac_chan(chan); struct rcar_dmac *dmac = to_rcar_dmac(chan->device); + struct rcar_dmac_chan_map *map = &rchan->map; struct rcar_dmac_desc_page *page, *_page; struct rcar_dmac_desc *desc; LIST_HEAD(list); @@ -1019,6 +1020,13 @@ static void rcar_dmac_free_chan_resources(struct dma_chan *chan) free_page((unsigned long)page); } + /* Remove slave mapping if present. */ + if (map->slave.xfer_size) { + dma_unmap_resource(chan->device->dev, map->addr, + map->slave.xfer_size, map->dir, 0); + map->slave.xfer_size = 0; + } + pm_runtime_put(chan->device->dev); } From 94a6fa899d2cb5ee76933406df32996576a562e4 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 12 Jan 2017 08:24:16 -0700 Subject: [PATCH 280/297] vfio/type1: Remove pid_namespace.h include Using has_capability() rather than ns_capable(), we're no longer using this header. Cc: Jike Song Cc: Kirti Wankhede Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 77373e51b283..b3cc33fa6d26 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include From 2e3258ecfaebace1ceffaa14e0ea94775d54f46f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Jan 2017 12:29:10 +0100 Subject: [PATCH 281/297] block: add blk_rq_payload_bytes Add a helper to calculate the actual data transfer size for special payload requests. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ff3d774f2751..1ca8e8fd1078 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1000,6 +1000,19 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +/* + * Some commands like WRITE SAME have a payload or data transfer size which + * is different from the size of the request. Any driver that supports such + * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to + * calculate the data transfer size. + */ +static inline unsigned int blk_rq_payload_bytes(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec.bv_len; + return blk_rq_bytes(rq); +} + static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { From fd102b125e174edbea34e6e7a2d371bc7901c53d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Jan 2017 12:29:11 +0100 Subject: [PATCH 282/297] scsi: use blk_rq_payload_bytes Without that we'll pass a wrong payload size in cmd->sdb, which can lead to hangs with drivers that need the total transfer size. Signed-off-by: Christoph Hellwig Reported-by: Chris Valean Reported-by: Dexuan Cui Fixes: f9d03f96 ("block: improve handling of the magic discard payload") Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index c35b6de4ca64..ad4ff8fcd4dd 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1018,7 +1018,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) count = blk_rq_map_sg(req->q, req, sdb->table.sgl); BUG_ON(count > sdb->table.nents); sdb->table.nents = count; - sdb->length = blk_rq_bytes(req); + sdb->length = blk_rq_payload_bytes(req); return BLKPREP_OK; } From b131c61d62266eb21b0f125f63f3d07e5670d726 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Jan 2017 12:29:12 +0100 Subject: [PATCH 283/297] nvme: use blk_rq_payload_bytes The new blk_rq_payload_bytes generalizes the payload length hacks that nvme_map_len did before. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 5 ++--- drivers/nvme/host/nvme.h | 8 -------- drivers/nvme/host/pci.c | 19 ++++++++----------- drivers/nvme/host/rdma.c | 13 +++++-------- 4 files changed, 15 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index aa0bc60810a7..fcc9dcfdf675 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1654,13 +1654,12 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, struct nvme_fc_fcp_op *op) { struct nvmefc_fcp_req *freq = &op->fcp_req; - u32 map_len = nvme_map_len(rq); enum dma_data_direction dir; int ret; freq->sg_cnt = 0; - if (!map_len) + if (!blk_rq_payload_bytes(rq)) return 0; freq->sg_table.sgl = freq->first_sgl; @@ -1854,7 +1853,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; - data_len = nvme_map_len(rq); + data_len = blk_rq_payload_bytes(rq); if (data_len) io_dir = ((rq_data_dir(rq) == WRITE) ? NVMEFC_FCP_WRITE : NVMEFC_FCP_READ); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6377e14586dc..aead6d08ed2c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -225,14 +225,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } -static inline unsigned nvme_map_len(struct request *rq) -{ - if (req_op(rq) == REQ_OP_DISCARD) - return sizeof(struct nvme_dsm_range); - else - return blk_rq_bytes(rq); -} - static inline void nvme_cleanup_cmd(struct request *req) { if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 19beeb7b2ac2..3faefabf339c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -306,11 +306,11 @@ static __le64 **iod_list(struct request *req) return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); } -static int nvme_init_iod(struct request *rq, unsigned size, - struct nvme_dev *dev) +static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = blk_rq_nr_phys_segments(rq); + unsigned int size = blk_rq_payload_bytes(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); @@ -420,12 +420,11 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) } #endif -static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, - int total_len) +static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; - int length = total_len; + int length = blk_rq_payload_bytes(req); struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); @@ -501,7 +500,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, } static int nvme_map_data(struct nvme_dev *dev, struct request *req, - unsigned size, struct nvme_command *cmnd) + struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; @@ -519,7 +518,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_ATTR_NO_WARN)) goto out; - if (!nvme_setup_prps(dev, req, size)) + if (!nvme_setup_prps(dev, req)) goto out_unmap; ret = BLK_MQ_RQ_QUEUE_ERROR; @@ -580,7 +579,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; - unsigned map_len; int ret = BLK_MQ_RQ_QUEUE_OK; /* @@ -600,13 +598,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret != BLK_MQ_RQ_QUEUE_OK) return ret; - map_len = nvme_map_len(req); - ret = nvme_init_iod(req, map_len, dev); + ret = nvme_init_iod(req, dev); if (ret != BLK_MQ_RQ_QUEUE_OK) goto out_free_cmd; if (blk_rq_nr_phys_segments(req)) - ret = nvme_map_data(dev, req, map_len, &cmnd); + ret = nvme_map_data(dev, req, &cmnd); if (ret != BLK_MQ_RQ_QUEUE_OK) goto out_cleanup_iod; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 34e564857716..557f29b1f1bb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -981,8 +981,7 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, } static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, - struct request *rq, unsigned int map_len, - struct nvme_command *c) + struct request *rq, struct nvme_command *c) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_device *dev = queue->device; @@ -1014,9 +1013,9 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, } if (count == 1) { - if (rq_data_dir(rq) == WRITE && - map_len <= nvme_rdma_inline_data_size(queue) && - nvme_rdma_queue_idx(queue)) + if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + blk_rq_payload_bytes(rq) <= + nvme_rdma_inline_data_size(queue)) return nvme_rdma_map_sg_inline(queue, req, c); if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) @@ -1444,7 +1443,6 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command *c = sqe->data; bool flush = false; struct ib_device *dev; - unsigned int map_len; int ret; WARN_ON_ONCE(rq->tag < 0); @@ -1462,8 +1460,7 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - map_len = nvme_map_len(rq); - ret = nvme_rdma_map_data(queue, rq, map_len, c); + ret = nvme_rdma_map_data(queue, rq, c); if (ret < 0) { dev_err(queue->ctrl->ctrl.device, "Failed to map data (%d)\n", ret); From f80de881d8df967488b7343381619efa15019493 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Jan 2017 12:29:13 +0100 Subject: [PATCH 284/297] sd: remove __data_len hack for WRITE SAME Now that we have the blk_rq_payload_bytes helper available to determine the actual I/O size we don't need to mess around with __data_len for WRITE SAME. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index b1933041da39..1fbb1ecf49f2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -836,7 +836,6 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) struct bio *bio = rq->bio; sector_t sector = blk_rq_pos(rq); unsigned int nr_sectors = blk_rq_sectors(rq); - unsigned int nr_bytes = blk_rq_bytes(rq); int ret; if (sdkp->device->no_write_same) @@ -869,21 +868,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) cmd->transfersize = sdp->sector_size; cmd->allowed = SD_MAX_RETRIES; - - /* - * For WRITE_SAME the data transferred in the DATA IN buffer is - * different from the amount of data actually written to the target. - * - * We set up __data_len to the amount of data transferred from the - * DATA IN buffer so that blk_rq_map_sg set up the proper S/G list - * to transfer a single sector of data first, but then reset it to - * the amount of data to be written right after so that the I/O path - * knows how much to actually write. - */ - rq->__data_len = sdp->sector_size; - ret = scsi_init_io(cmd); - rq->__data_len = nr_bytes; - return ret; + return scsi_init_io(cmd); } static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd) From bef13315e990fd3d3fb4c39013aefd53f06c3657 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Jan 2017 15:18:16 -0700 Subject: [PATCH 285/297] block: don't try to discard from __blkdev_issue_zeroout Discard can return -EIO asynchronously if the alignment for the request isn't suitable for the driver, which makes a proper fallback to other methods in __blkdev_issue_zeroout impossible. Thus only issue a sync discard from blkdev_issue_zeroout an don't try discard at all from __blkdev_issue_zeroout as a non-invasive workaround. One more reason why abusing discard for zeroing must die.. Signed-off-by: Christoph Hellwig Reported-by: Eryu Guan Fixes: e73c23ff ("block: add async variant of blkdev_issue_zeroout") Signed-off-by: Jens Axboe --- block/blk-lib.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index ed89c8f4b2a0..f8c82a9b4012 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -301,13 +301,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if ((sector | nr_sects) & bs_mask) return -EINVAL; - if (discard) { - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, - BLKDEV_DISCARD_ZERO, biop); - if (ret == 0 || (ret && ret != -EOPNOTSUPP)) - goto out; - } - ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop); if (ret == 0 || (ret && ret != -EOPNOTSUPP)) @@ -370,6 +363,12 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, struct bio *bio = NULL; struct blk_plug plug; + if (discard) { + if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, + BLKDEV_DISCARD_ZERO)) + return 0; + } + blk_start_plug(&plug); ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, &bio, discard); From 695085b4bc7603551db0b3da897b8bf9893ca218 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 13 Jan 2017 01:11:18 -0500 Subject: [PATCH 286/297] x86/tsc: Add the Intel Denverton Processor to native_calibrate_tsc() The Intel Denverton microserver uses a 25 MHz TSC crystal, so we can derive its exact [*] TSC frequency using CPUID and some arithmetic, eg.: TSC: 1800 MHz (25000000 Hz * 216 / 3 / 1000000) [*] 'exact' is only as good as the crystal, which should be +/- 20ppm Signed-off-by: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/306899f94804aece6d8fa8b4223ede3b48dbb59c.1484287748.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index be3a49ee0356..e41af597aed8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -694,6 +694,7 @@ unsigned long native_calibrate_tsc(void) crystal_khz = 24000; /* 24.0 MHz */ break; case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; case INTEL_FAM6_ATOM_GOLDMONT: From 453828625731d0ba7218242ef6ec88f59408f368 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 12 Jan 2017 16:53:11 +0100 Subject: [PATCH 287/297] x86/mpx: Use compatible types in comparison to fix sparse error info->si_addr is of type void __user *, so it should be compared against something from the same address space. This fixes the following sparse error: arch/x86/mm/mpx.c:296:27: error: incompatible types in comparison expression (different address spaces) Signed-off-by: Tobias Klauser Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 324e5713d386..af59f808742f 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -293,7 +293,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) * We were not able to extract an address from the instruction, * probably because there was something invalid in it. */ - if (info->si_addr == (void *)-1) { + if (info->si_addr == (void __user *)-1) { err = -EINVAL; goto err_out; } From 63cae12bce9861cec309798d34701cf3da20bc71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Dec 2016 14:59:00 +0100 Subject: [PATCH 288/297] perf/core: Fix sys_perf_event_open() vs. hotplug There is problem with installing an event in a task that is 'stuck' on an offline CPU. Blocked tasks are not dis-assosciated from offlined CPUs, after all, a blocked task doesn't run and doesn't require a CPU etc.. Only on wakeup do we ammend the situation and place the task on a available CPU. If we hit such a task with perf_install_in_context() we'll loop until either that task wakes up or the CPU comes back online, if the task waking depends on the event being installed, we're stuck. While looking into this issue, I also spotted another problem, if we hit a task with perf_install_in_context() that is in the middle of being migrated, that is we observe the old CPU before sending the IPI, but run the IPI (on the old CPU) while the task is already running on the new CPU, things also go sideways. Rework things to rely on task_curr() -- outside of rq->lock -- which is rather tricky. Imagine the following scenario where we're trying to install the first event into our task 't': CPU0 CPU1 CPU2 (current == t) t->perf_event_ctxp[] = ctx; smp_mb(); cpu = task_cpu(t); switch(t, n); migrate(t, 2); switch(p, t); ctx = t->perf_event_ctxp[]; // must not be NULL smp_function_call(cpu, ..); generic_exec_single() func(); spin_lock(ctx->lock); if (task_curr(t)) // false add_event_to_ctx(); spin_unlock(ctx->lock); perf_event_context_sched_in(); spin_lock(ctx->lock); // sees event So its CPU0's store of t->perf_event_ctxp[] that must not go 'missing'. Because if CPU2's load of that variable were to observe NULL, it would not try to schedule the ctx and we'd have a task running without its counter, which would be 'bad'. As long as we observe !NULL, we'll acquire ctx->lock. If we acquire it first and not see the event yet, then CPU0 must observe task_curr() and retry. If the install happens first, then we must see the event on sched-in and all is well. I think we can translate the first part (until the 'must not be NULL') of the scenario to a litmus test like: C C-peterz { } P0(int *x, int *y) { int r1; WRITE_ONCE(*x, 1); smp_mb(); r1 = READ_ONCE(*y); } P1(int *y, int *z) { WRITE_ONCE(*y, 1); smp_store_release(z, 1); } P2(int *x, int *z) { int r1; int r2; r1 = smp_load_acquire(z); smp_mb(); r2 = READ_ONCE(*x); } exists (0:r1=0 /\ 2:r1=1 /\ 2:r2=0) Where: x is perf_event_ctxp[], y is our tasks's CPU, and z is our task being placed on the rq of CPU2. The P0 smp_mb() is the one added by this patch, ordering the store to perf_event_ctxp[] from find_get_context() and the load of task_cpu() in task_function_call(). The smp_store_release/smp_load_acquire model the RCpc locking of the rq->lock and the smp_mb() of P2 is the context switch switching from whatever CPU2 was running to our task 't'. This litmus test evaluates into: Test C-peterz Allowed States 7 0:r1=0; 2:r1=0; 2:r2=0; 0:r1=0; 2:r1=0; 2:r2=1; 0:r1=0; 2:r1=1; 2:r2=1; 0:r1=1; 2:r1=0; 2:r2=0; 0:r1=1; 2:r1=0; 2:r2=1; 0:r1=1; 2:r1=1; 2:r2=0; 0:r1=1; 2:r1=1; 2:r2=1; No Witnesses Positive: 0 Negative: 7 Condition exists (0:r1=0 /\ 2:r1=1 /\ 2:r2=0) Observation C-peterz Never 0 7 Hash=e427f41d9146b2a5445101d3e2fcaa34 And the strong and weak model agree. Reported-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Will Deacon Cc: jeremy.linton@arm.com Link: http://lkml.kernel.org/r/20161209135900.GU3174@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 70 ++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index ab15509fab8c..72ce7d63e561 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2249,7 +2249,7 @@ static int __perf_install_in_context(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - bool activate = true; + bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); @@ -2257,27 +2257,26 @@ static int __perf_install_in_context(void *info) raw_spin_lock(&ctx->lock); task_ctx = ctx; - /* If we're on the wrong CPU, try again */ - if (task_cpu(ctx->task) != smp_processor_id()) { + reprogram = (ctx->task == current); + + /* + * If the task is running, it must be running on this CPU, + * otherwise we cannot reprogram things. + * + * If its not running, we don't care, ctx->lock will + * serialize against it becoming runnable. + */ + if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } - /* - * If we're on the right CPU, see if the task we target is - * current, if not we don't have to activate the ctx, a future - * context switch will do that for us. - */ - if (ctx->task != current) - activate = false; - else - WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); - + WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } - if (activate) { + if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx); @@ -2328,13 +2327,36 @@ perf_install_in_context(struct perf_event_context *ctx, /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. + * + * Instead we use task_curr(), which tells us if the task is running. + * However, since we use task_curr() outside of rq::lock, we can race + * against the actual state. This means the result can be wrong. + * + * If we get a false positive, we retry, this is harmless. + * + * If we get a false negative, things are complicated. If we are after + * perf_event_context_sched_in() ctx::lock will serialize us, and the + * value must be correct. If we're before, it doesn't matter since + * perf_event_context_sched_in() will program the counter. + * + * However, this hinges on the remote context switch having observed + * our task->perf_event_ctxp[] store, such that it will in fact take + * ctx::lock in perf_event_context_sched_in(). + * + * We do this by task_function_call(), if the IPI fails to hit the task + * we know any future context switch of task must see the + * perf_event_ctpx[] store. */ -again: + /* - * Cannot use task_function_call() because we need to run on the task's - * CPU regardless of whether its current or not. + * This smp_mb() orders the task->perf_event_ctxp[] store with the + * task_cpu() load, such that if the IPI then does not find the task + * running, a future context switch of that task must observe the + * store. */ - if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + smp_mb(); +again: + if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); @@ -2348,12 +2370,16 @@ again: raw_spin_unlock_irq(&ctx->lock); return; } - raw_spin_unlock_irq(&ctx->lock); /* - * Since !ctx->is_active doesn't mean anything, we must IPI - * unconditionally. + * If the task is not running, ctx->lock will avoid it becoming so, + * thus we can safely install the event. */ - goto again; + if (task_curr(task)) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); } /* From 321027c1fe77f892f4ea07846aeae08cefbbb290 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Jan 2017 21:09:50 +0100 Subject: [PATCH 289/297] perf/core: Fix concurrent sys_perf_event_open() vs. 'move_group' race Di Shen reported a race between two concurrent sys_perf_event_open() calls where both try and move the same pre-existing software group into a hardware context. The problem is exactly that described in commit: f63a8daa5812 ("perf: Fix event->ctx locking") ... where, while we wait for a ctx->mutex acquisition, the event->ctx relation can have changed under us. That very same commit failed to recognise sys_perf_event_context() as an external access vector to the events and thereby didn't apply the established locking rules correctly. So while one sys_perf_event_open() call is stuck waiting on mutex_lock_double(), the other (which owns said locks) moves the group about. So by the time the former sys_perf_event_open() acquires the locks, the context we've acquired is stale (and possibly dead). Apply the established locking rules as per perf_event_ctx_lock_nested() to the mutex_lock_double() for the 'move_group' case. This obviously means we need to validate state after we acquire the locks. Reported-by: Di Shen (Keen Lab) Tested-by: John Dias Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kees Cook Cc: Linus Torvalds Cc: Min Chong Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: f63a8daa5812 ("perf: Fix event->ctx locking") Link: http://lkml.kernel.org/r/20170106131444.GZ3174@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 58 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 72ce7d63e561..cbc5937265da 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9529,6 +9529,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } +/* + * Variation on perf_event_ctx_lock_nested(), except we take two context + * mutexes. + */ +static struct perf_event_context * +__perf_event_ctx_lock_double(struct perf_event *group_leader, + struct perf_event_context *ctx) +{ + struct perf_event_context *gctx; + +again: + rcu_read_lock(); + gctx = READ_ONCE(group_leader->ctx); + if (!atomic_inc_not_zero(&gctx->refcount)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + mutex_lock_double(&gctx->mutex, &ctx->mutex); + + if (group_leader->ctx != gctx) { + mutex_unlock(&ctx->mutex); + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + goto again; + } + + return gctx; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -9772,12 +9803,31 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - gctx = group_leader->ctx; - mutex_lock_double(&gctx->mutex, &ctx->mutex); + gctx = __perf_event_ctx_lock_double(group_leader, ctx); + if (gctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } + + /* + * Check if we raced against another sys_perf_event_open() call + * moving the software group underneath us. + */ + if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * If someone moved the group out from under us, check + * if this new event wound up on the same ctx, if so + * its the regular !move_group case, otherwise fail. + */ + if (gctx != ctx) { + err = -EINVAL; + goto err_locked; + } else { + perf_event_ctx_unlock(group_leader, gctx); + move_group = 0; + } + } } else { mutex_lock(&ctx->mutex); } @@ -9879,7 +9929,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_unpin_context(ctx); if (move_group) - mutex_unlock(&gctx->mutex); + perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { @@ -9905,7 +9955,7 @@ SYSCALL_DEFINE5(perf_event_open, err_locked: if (move_group) - mutex_unlock(&gctx->mutex); + perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); /* err_file: */ fput(event_file); From 475113d937adfd150eb82b5e2c5507125a68e7af Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 28 Dec 2016 14:31:03 +0100 Subject: [PATCH 290/297] perf/x86/intel: Account interrupts for PEBS errors It's possible to set up PEBS events to get only errors and not any data, like on SNB-X (model 45) and IVB-EP (model 62) via 2 perf commands running simultaneously: taskset -c 1 ./perf record -c 4 -e branches:pp -j any -C 10 This leads to a soft lock up, because the error path of the intel_pmu_drain_pebs_nhm() does not account event->hw.interrupt for error PEBS interrupts, so in case you're getting ONLY errors you don't have a way to stop the event when it's over the max_samples_per_tick limit: NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [perf_fuzzer:5816] ... RIP: 0010:[] [] smp_call_function_single+0xe2/0x140 ... Call Trace: ? trace_hardirqs_on_caller+0xf5/0x1b0 ? perf_cgroup_attach+0x70/0x70 perf_install_in_context+0x199/0x1b0 ? ctx_resched+0x90/0x90 SYSC_perf_event_open+0x641/0xf90 SyS_perf_event_open+0x9/0x10 do_syscall_64+0x6c/0x1f0 entry_SYSCALL64_slow_path+0x25/0x25 Add perf_event_account_interrupt() which does the interrupt and frequency checks and call it from intel_pmu_drain_pebs_nhm()'s error path. We keep the pending_kill and pending_wakeup logic only in the __perf_event_overflow() path, because they make sense only if there's any data to deliver. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vince Weaver Link: http://lkml.kernel.org/r/1482931866-6018-2-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 6 ++++- include/linux/perf_event.h | 1 + kernel/events/core.c | 47 +++++++++++++++++++++++++------------- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index be202390bbd3..9dfeeeca0ea8 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; /* log dropped samples number */ - if (error[bit]) + if (error[bit]) { perf_log_lost_samples(event, error[bit]); + if (perf_event_account_interrupt(event)) + x86_pmu_stop(event, 0); + } + if (counts[bit]) { __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4741ecdb9817..78ed8105e64d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); +extern int perf_event_account_interrupt(struct perf_event *event); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, diff --git a/kernel/events/core.c b/kernel/events/core.c index cbc5937265da..110b38a58493 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7060,25 +7060,12 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } -/* - * Generic event overflow handling, sampling. - */ - -static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) +static int +__perf_event_account_interrupt(struct perf_event *event, int throttle) { - int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; - u64 seq; int ret = 0; - - /* - * Non-sampling counters might still use the PMI to fold short - * hardware counters, ignore those. - */ - if (unlikely(!is_sampling_event(event))) - return 0; + u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { @@ -7106,6 +7093,34 @@ static int __perf_event_overflow(struct perf_event *event, perf_adjust_period(event, delta, hwc->last_period, true); } + return ret; +} + +int perf_event_account_interrupt(struct perf_event *event) +{ + return __perf_event_account_interrupt(event, 1); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + int ret = 0; + + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. + */ + if (unlikely(!is_sampling_event(event))) + return 0; + + ret = __perf_event_account_interrupt(event, throttle); + /* * XXX event_limit might not quite work as expected on inherited * events From 18e7a45af91acdde99d3aa1372cc40e1f8142f7b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Jan 2017 15:24:54 +0100 Subject: [PATCH 291/297] perf/x86: Reject non sampling events with precise_ip As Peter suggested [1] rejecting non sampling PEBS events, because they dont make any sense and could cause bugs in the NMI handler [2]. [1] http://lkml.kernel.org/r/20170103094059.GC3093@worktop [2] http://lkml.kernel.org/r/1482931866-6018-3-git-send-email-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vince Weaver Link: http://lkml.kernel.org/r/20170103142454.GA26251@krava Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 019c5887b698..1635c0c8df23 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -505,6 +505,10 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + + /* There's no sense in having PEBS for non sampling events: */ + if (!is_sampling_event(event)) + return -EINVAL; } /* * check that PEBS LBR correction does not conflict with From c7334ce814f7e5d8fc1f9b3126cda0640c2f81b3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 14 Jan 2017 14:09:03 +0100 Subject: [PATCH 292/297] Revert "driver core: Add deferred_probe attribute to devices in sysfs" This reverts commit 6751667a29d6fd64afb9ce30567ad616b68ed789. Rob Herring objected to it, and a replacement for it will be added using debugfs in the future. Cc: Ben Hutchings Reported-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- .../ABI/testing/sysfs-devices-deferred_probe | 12 ------------ drivers/base/base.h | 2 -- drivers/base/core.c | 7 ------- drivers/base/dd.c | 13 ------------- 4 files changed, 34 deletions(-) delete mode 100644 Documentation/ABI/testing/sysfs-devices-deferred_probe diff --git a/Documentation/ABI/testing/sysfs-devices-deferred_probe b/Documentation/ABI/testing/sysfs-devices-deferred_probe deleted file mode 100644 index 58553d7a321f..000000000000 --- a/Documentation/ABI/testing/sysfs-devices-deferred_probe +++ /dev/null @@ -1,12 +0,0 @@ -What: /sys/devices/.../deferred_probe -Date: August 2016 -Contact: Ben Hutchings -Description: - The /sys/devices/.../deferred_probe attribute is - present for all devices. If a driver detects during - probing a device that a related device is not yet - ready, it may defer probing of the first device. The - kernel will retry probing the first device after any - other device is successfully probed. This attribute - reads as 1 if probing of this device is currently - deferred, or 0 otherwise. diff --git a/drivers/base/base.h b/drivers/base/base.h index ada9dce34e6d..e19b1008e5fb 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -141,8 +141,6 @@ extern void device_unblock_probing(void); extern struct kset *devices_kset; extern void devices_kset_move_last(struct device *dev); -extern struct device_attribute dev_attr_deferred_probe; - #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) extern void module_add_driver(struct module *mod, struct device_driver *drv); extern void module_remove_driver(struct device_driver *drv); diff --git a/drivers/base/core.c b/drivers/base/core.c index 020ea7f05520..8c25e68e67d7 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1060,14 +1060,8 @@ static int device_add_attrs(struct device *dev) goto err_remove_dev_groups; } - error = device_create_file(dev, &dev_attr_deferred_probe); - if (error) - goto err_remove_online; - return 0; - err_remove_online: - device_remove_file(dev, &dev_attr_online); err_remove_dev_groups: device_remove_groups(dev, dev->groups); err_remove_type_groups: @@ -1085,7 +1079,6 @@ static void device_remove_attrs(struct device *dev) struct class *class = dev->class; const struct device_type *type = dev->type; - device_remove_file(dev, &dev_attr_deferred_probe); device_remove_file(dev, &dev_attr_online); device_remove_groups(dev, dev->groups); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index a8b258e5407b..a1fbf55c4d3a 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -53,19 +53,6 @@ static LIST_HEAD(deferred_probe_pending_list); static LIST_HEAD(deferred_probe_active_list); static atomic_t deferred_trigger_count = ATOMIC_INIT(0); -static ssize_t deferred_probe_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - bool value; - - mutex_lock(&deferred_probe_mutex); - value = !list_empty(&dev->p->deferred_probe); - mutex_unlock(&deferred_probe_mutex); - - return sprintf(buf, "%d\n", value); -} -DEVICE_ATTR_RO(deferred_probe); - /* * In some cases, like suspend to RAM or hibernation, It might be reasonable * to prohibit probing of devices as it could be unsafe. From 0100a3e67a9cef64d72cd3a1da86f3ddbee50363 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Mon, 12 Dec 2016 18:42:28 -0500 Subject: [PATCH 293/297] efi/x86: Prune invalid memory map entries and fix boot regression Some machines, such as the Lenovo ThinkPad W541 with firmware GNET80WW (2.28), include memory map entries with phys_addr=0x0 and num_pages=0. These machines fail to boot after the following commit, commit 8e80632fb23f ("efi/esrt: Use efi_mem_reserve() and avoid a kmalloc()") Fix this by removing such bogus entries from the memory map. Furthermore, currently the log output for this case (with efi=debug) looks like: [ 0.000000] efi: mem45: [Reserved | | | | | | | | | | | | ] range=[0x0000000000000000-0xffffffffffffffff] (0MB) This is clearly wrong, and also not as informative as it could be. This patch changes it so that if we find obviously invalid memory map entries, we print an error and skip those entries. It also detects the display of the address range calculation overflow, so the new output is: [ 0.000000] efi: [Firmware Bug]: Invalid EFI memory map entries: [ 0.000000] efi: mem45: [Reserved | | | | | | | | | | | | ] range=[0x0000000000000000-0x0000000000000000] (invalid) It also detects memory map sizes that would overflow the physical address, for example phys_addr=0xfffffffffffff000 and num_pages=0x0200000000000001, and prints: [ 0.000000] efi: [Firmware Bug]: Invalid EFI memory map entries: [ 0.000000] efi: mem45: [Reserved | | | | | | | | | | | | ] range=[phys_addr=0xfffffffffffff000-0x20ffffffffffffffff] (invalid) It then removes these entries from the memory map. Signed-off-by: Peter Jones Signed-off-by: Ard Biesheuvel [ardb: refactor for clarity with no functional changes, avoid PAGE_SHIFT] Signed-off-by: Matt Fleming [Matt: Include bugzilla info in commit log] Cc: # v4.9+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://bugzilla.kernel.org/show_bug.cgi?id=191121 Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 66 +++++++++++++++++++++++++++++++++++++ include/linux/efi.h | 1 + 2 files changed, 67 insertions(+) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 936a488d6cf6..274dfc481849 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -210,6 +210,70 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } +#define OVERFLOW_ADDR_SHIFT (64 - EFI_PAGE_SHIFT) +#define OVERFLOW_ADDR_MASK (U64_MAX << OVERFLOW_ADDR_SHIFT) +#define U64_HIGH_BIT (~(U64_MAX >> 1)) + +static bool __init efi_memmap_entry_valid(const efi_memory_desc_t *md, int i) +{ + u64 end = (md->num_pages << EFI_PAGE_SHIFT) + md->phys_addr - 1; + u64 end_hi = 0; + char buf[64]; + + if (md->num_pages == 0) { + end = 0; + } else if (md->num_pages > EFI_PAGES_MAX || + EFI_PAGES_MAX - md->num_pages < + (md->phys_addr >> EFI_PAGE_SHIFT)) { + end_hi = (md->num_pages & OVERFLOW_ADDR_MASK) + >> OVERFLOW_ADDR_SHIFT; + + if ((md->phys_addr & U64_HIGH_BIT) && !(end & U64_HIGH_BIT)) + end_hi += 1; + } else { + return true; + } + + pr_warn_once(FW_BUG "Invalid EFI memory map entries:\n"); + + if (end_hi) { + pr_warn("mem%02u: %s range=[0x%016llx-0x%llx%016llx] (invalid)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, end_hi, end); + } else { + pr_warn("mem%02u: %s range=[0x%016llx-0x%016llx] (invalid)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, end); + } + return false; +} + +static void __init efi_clean_memmap(void) +{ + efi_memory_desc_t *out = efi.memmap.map; + const efi_memory_desc_t *in = out; + const efi_memory_desc_t *end = efi.memmap.map_end; + int i, n_removal; + + for (i = n_removal = 0; in < end; i++) { + if (efi_memmap_entry_valid(in, i)) { + if (out != in) + memcpy(out, in, efi.memmap.desc_size); + out = (void *)out + efi.memmap.desc_size; + } else { + n_removal++; + } + in = (void *)in + efi.memmap.desc_size; + } + + if (n_removal > 0) { + u64 size = efi.memmap.nr_map - n_removal; + + pr_warn("Removing %d invalid memory map entries.\n", n_removal); + efi_memmap_install(efi.memmap.phys_map, size); + } +} + void __init efi_print_memmap(void) { efi_memory_desc_t *md; @@ -472,6 +536,8 @@ void __init efi_init(void) } } + efi_clean_memmap(); + if (efi_enabled(EFI_DBG)) efi_print_memmap(); } diff --git a/include/linux/efi.h b/include/linux/efi.h index 0c5420208c40..5b1af30ece55 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -103,6 +103,7 @@ typedef struct { #define EFI_PAGE_SHIFT 12 #define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) +#define EFI_PAGES_MAX (U64_MAX >> EFI_PAGE_SHIFT) typedef struct { u32 type; From a12f1ae61c489076a9aeb90bddca7722bf330df3 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 13 Dec 2016 12:09:56 -0800 Subject: [PATCH 294/297] aio: fix lock dep warning lockdep reports a warnning. file_start_write/file_end_write only acquire/release the lock for regular files. So checking the files in aio side too. [ 453.532141] ------------[ cut here ]------------ [ 453.533011] WARNING: CPU: 1 PID: 1298 at ../kernel/locking/lockdep.c:3514 lock_release+0x434/0x670 [ 453.533011] DEBUG_LOCKS_WARN_ON(depth <= 0) [ 453.533011] Modules linked in: [ 453.533011] CPU: 1 PID: 1298 Comm: fio Not tainted 4.9.0+ #964 [ 453.533011] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.0-1.fc24 04/01/2014 [ 453.533011] ffff8803a24b7a70 ffffffff8196cffb ffff8803a24b7ae8 0000000000000000 [ 453.533011] ffff8803a24b7ab8 ffffffff81091ee1 ffff8803a5dba700 00000dba00000008 [ 453.533011] ffffed0074496f59 ffff8803a5dbaf54 ffff8803ae0f8488 fffffffffffffdef [ 453.533011] Call Trace: [ 453.533011] [] dump_stack+0x67/0x9c [ 453.533011] [] __warn+0x111/0x130 [ 453.533011] [] warn_slowpath_fmt+0x97/0xb0 [ 453.533011] [] ? __warn+0x130/0x130 [ 453.533011] [] ? blk_finish_plug+0x29/0x60 [ 453.533011] [] lock_release+0x434/0x670 [ 453.533011] [] ? import_single_range+0xd4/0x110 [ 453.533011] [] ? rw_verify_area+0x65/0x140 [ 453.533011] [] ? aio_write+0x1f6/0x280 [ 453.533011] [] aio_write+0x229/0x280 [ 453.533011] [] ? aio_complete+0x640/0x640 [ 453.533011] [] ? debug_check_no_locks_freed+0x1a0/0x1a0 [ 453.533011] [] ? debug_lockdep_rcu_enabled.part.2+0x1a/0x30 [ 453.533011] [] ? debug_lockdep_rcu_enabled+0x35/0x40 [ 453.533011] [] ? __might_fault+0x7e/0xf0 [ 453.533011] [] do_io_submit+0x94c/0xb10 [ 453.533011] [] ? do_io_submit+0x23e/0xb10 [ 453.533011] [] ? SyS_io_destroy+0x270/0x270 [ 453.533011] [] ? mark_held_locks+0x23/0xc0 [ 453.533011] [] ? trace_hardirqs_on_thunk+0x1a/0x1c [ 453.533011] [] SyS_io_submit+0x10/0x20 [ 453.533011] [] entry_SYSCALL_64_fastpath+0x18/0xad [ 453.533011] [] ? trace_hardirqs_off_caller+0xc0/0x110 [ 453.533011] ---[ end trace b2fbe664d1cc0082 ]--- Cc: Dmitry Monakhov Cc: Jan Kara Cc: Christoph Hellwig Cc: Al Viro Reviewed-by: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Al Viro --- fs/aio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 4ab67e8cb776..873b4ca82ccb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1085,7 +1085,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) * Tell lockdep we inherited freeze protection from submission * thread. */ - __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); file_end_write(file); } @@ -1525,7 +1526,8 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, * by telling it the lock got released so that it doesn't * complain about held lock when we return to userspace. */ - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } kfree(iovec); return ret; From 4d22c75d4c7b5c5f4bd31054f09103ee490878fd Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 11 Jan 2017 13:25:00 -0600 Subject: [PATCH 295/297] coredump: Ensure proper size of sparse core files If the last section of a core file ends with an unmapped or zero page, the size of the file does not correspond with the last dump_skip() call. gdb complains that the file is truncated and can be confusing to users. After all of the vma sections are written, make sure that the file size is no smaller than the current file position. This problem can be demonstrated with gdb's bigcore testcase on the sparc architecture. Signed-off-by: Dave Kleikamp Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Al Viro --- fs/binfmt_elf.c | 1 + fs/coredump.c | 18 ++++++++++++++++++ include/linux/coredump.h | 1 + 3 files changed, 20 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 29a02daf08a9..422370293cfd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2298,6 +2298,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; } } + dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git a/fs/coredump.c b/fs/coredump.c index e525b6017cdf..ae6b05629ca1 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align) return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); + +/* + * Ensures that file size is big enough to contain the current file + * postion. This prevents gdb from complaining about a truncated file + * if the last "write" to the file was dump_skip. + */ +void dump_truncate(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + loff_t offset; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + offset = file->f_op->llseek(file, 0, SEEK_CUR); + if (i_size_read(file->f_mapping->host) < offset) + do_truncate(file->f_path.dentry, offset, 0, file); + } +} +EXPORT_SYMBOL(dump_truncate); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index d016a121a8c4..28ffa94aed6b 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -14,6 +14,7 @@ struct coredump_params; extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); +extern void dump_truncate(struct coredump_params *cprm); #ifdef CONFIG_COREDUMP extern void do_coredump(const siginfo_t *siginfo); #else From b9dc6f65bc5e232d1c05fe34b5daadc7e8bbf1fb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 14 Jan 2017 19:33:08 -0500 Subject: [PATCH 296/297] fix a fencepost error in pipe_advance() The logics in pipe_advance() used to release all buffers past the new position failed in cases when the number of buffers to release was equal to pipe->buffers. If that happened, none of them had been released, leaving pipe full. Worse, it was trivial to trigger and we end up with pipe full of uninitialized pages. IOW, it's an infoleak. Cc: stable@vger.kernel.org # v4.9 Reported-by: "Alan J. Wylie" Tested-by: "Alan J. Wylie" Signed-off-by: Al Viro --- lib/iov_iter.c | 66 ++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 25f572303801..e68604ae3ced 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -730,43 +730,50 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); -static void pipe_advance(struct iov_iter *i, size_t size) +static inline void pipe_truncate(struct iov_iter *i) { struct pipe_inode_info *pipe = i->pipe; - struct pipe_buffer *buf; - int idx = i->idx; - size_t off = i->iov_offset, orig_sz; - - if (unlikely(i->count < size)) - size = i->count; - orig_sz = size; - - if (size) { - if (off) /* make it relative to the beginning of buffer */ - size += off - pipe->bufs[idx].offset; - while (1) { - buf = &pipe->bufs[idx]; - if (size <= buf->len) - break; - size -= buf->len; - idx = next_idx(idx, pipe); - } - buf->len = size; - i->idx = idx; - off = i->iov_offset = buf->offset + size; - } - if (off) - idx = next_idx(idx, pipe); if (pipe->nrbufs) { - int unused = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - /* [curbuf,unused) is in use. Free [idx,unused) */ - while (idx != unused) { + size_t off = i->iov_offset; + int idx = i->idx; + int nrbufs = (idx - pipe->curbuf) & (pipe->buffers - 1); + if (off) { + pipe->bufs[idx].len = off - pipe->bufs[idx].offset; + idx = next_idx(idx, pipe); + nrbufs++; + } + while (pipe->nrbufs > nrbufs) { pipe_buf_release(pipe, &pipe->bufs[idx]); idx = next_idx(idx, pipe); pipe->nrbufs--; } } - i->count -= orig_sz; +} + +static void pipe_advance(struct iov_iter *i, size_t size) +{ + struct pipe_inode_info *pipe = i->pipe; + if (unlikely(i->count < size)) + size = i->count; + if (size) { + struct pipe_buffer *buf; + size_t off = i->iov_offset, left = size; + int idx = i->idx; + if (off) /* make it relative to the beginning of buffer */ + left += off - pipe->bufs[idx].offset; + while (1) { + buf = &pipe->bufs[idx]; + if (left <= buf->len) + break; + left -= buf->len; + idx = next_idx(idx, pipe); + } + i->idx = idx; + i->iov_offset = buf->offset + left; + } + i->count -= size; + /* ... and discard everything past that point */ + pipe_truncate(i); } void iov_iter_advance(struct iov_iter *i, size_t size) @@ -826,6 +833,7 @@ void iov_iter_pipe(struct iov_iter *i, int direction, size_t count) { BUG_ON(direction != ITER_PIPE); + WARN_ON(pipe->nrbufs == pipe->buffers); i->type = direction; i->pipe = pipe; i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); From 49def1853334396f948dcb4cedb9347abb318df5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Jan 2017 16:21:59 -0800 Subject: [PATCH 297/297] Linux 4.10-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5f1a84735ff6..96e2352d10a8 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Roaring Lionus # *DOCUMENTATION*