From 1c2bcc766be44467809f1798cd4ceacafe20a852 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 22 Feb 2017 17:25:42 +0100 Subject: [PATCH 001/297] batman-adv: Keep fragments equally sized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The batman-adv fragmentation packets have the design problem that they cannot be refragmented and cannot handle padding by the underlying link. The latter often leads to problems when networks are incorrectly configured and don't use a common MTU. The sender could for example fragment a 1271 byte frame (plus external ethernet header (14) and batadv unicast header (10)) to fit in a 1280 bytes large MTU of the underlying link (max. 1294 byte frames). This would create a 1294 bytes large frame (fragment 2) and a 55 bytes large frame (fragment 1). The extra 54 bytes are the fragment header (20) added to each fragment and the external ethernet header (14) for the second fragment. Let us assume that the next hop is then not able to transport 1294 bytes to its next hop. The 1294 byte large frame will be dropped but the 55 bytes large fragment will still be forwarded to its destination. Or let us assume that the underlying hardware requires that each frame has a minimum size (e.g. 60 bytes). Then it will pad the 55 bytes frame to 60 bytes. The receiver of the 60 bytes frame will no longer be able to correctly assemble the two frames together because it is not aware that 5 bytes of the 60 bytes frame are padding and don't belong to the reassembled frame. This can partly be avoided by splitting frames more equally. In this example, the 675 and 674 bytes large fragment frames could both potentially reach their destination without being too large or too small. Reported-by: Martin Weinelt Fixes: ee75ed88879a ("batman-adv: Fragment and send skbs larger than mtu") Signed-off-by: Sven Eckelmann Acked-by: Linus Lüssing Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 11149e5be4e0..106bda56ec98 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -404,7 +404,7 @@ out: * batadv_frag_create - create a fragment from skb * @skb: skb to create fragment from * @frag_head: header to use in new fragment - * @mtu: size of new fragment + * @fragment_size: size of new fragment * * Split the passed skb into two fragments: A new one with size matching the * passed mtu and the old one with the rest.
The new skb contains data from the @@ -414,11 +414,11 @@ out: */ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, struct batadv_frag_packet *frag_head, - unsigned int mtu) + unsigned int fragment_size) { struct sk_buff *skb_fragment; unsigned int header_size = sizeof(*frag_head); - unsigned int fragment_size = mtu - header_size; + unsigned int mtu = fragment_size + header_size; skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); if (!skb_fragment) @@ -456,7 +456,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, struct sk_buff *skb_fragment; unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; unsigned int header_size = sizeof(frag_header); - unsigned int max_fragment_size, max_packet_size; + unsigned int max_fragment_size, num_fragments; int ret; /* To avoid merge and refragmentation at next-hops we never send @@ -464,10 +464,15 @@ int batadv_frag_send_packet(struct sk_buff *skb, */ mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE); max_fragment_size = mtu - header_size; - max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; + + if (skb->len == 0 || max_fragment_size == 0) + return -EINVAL; + + num_fragments = (skb->len - 1) / max_fragment_size + 1; + max_fragment_size = (skb->len - 1) / num_fragments + 1; /* Don't even try to fragment, if we need more than 16 fragments */ - if (skb->len > max_packet_size) { + if (num_fragments > BATADV_FRAG_MAX_FRAGMENTS) { ret = -EAGAIN; goto free_skb; } @@ -507,7 +512,8 @@ int batadv_frag_send_packet(struct sk_buff *skb, goto put_primary_if; } - skb_fragment = batadv_frag_create(skb, &frag_header, mtu); + skb_fragment = batadv_frag_create(skb, &frag_header, + max_fragment_size); if (!skb_fragment) { ret = -ENOMEM; goto put_primary_if; From 1a9070ec91b37234fe915849b767c61584c64a44 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 4 Mar 2017 15:48:50 +0100 Subject: [PATCH 002/297] batman-adv: Initialize gw sel_class via batadv_algo The gateway selection class variable is shared between different algorithm versions. But the interpretation of the content is algorithm specific. The initialization is therefore also algorithm specific. But this was implemented incorrectly and the initialization for BATMAN_V always overwrote the value previously written for BATMAN_IV. This could only be avoided when BATMAN_V was disabled during compile time. Using a special batadv_algo hook for this initialization avoids this problem. Fixes: 50164d8f500f ("batman-adv: B.A.T.M.A.N. 
V - implement GW selection logic") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 11 +++++++++++ net/batman-adv/bat_v.c | 14 +++++++++++--- net/batman-adv/gateway_common.c | 5 +++++ net/batman-adv/soft-interface.c | 1 - net/batman-adv/types.h | 2 ++ 5 files changed, 29 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f00f666e2ccd..7bfd0d7ef49d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2477,6 +2477,16 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) batadv_iv_ogm_schedule(hard_iface); } +/** + * batadv_iv_init_sel_class - initialize GW selection class + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv) +{ + /* set default TQ difference threshold to 20 */ + atomic_set(&bat_priv->gw.sel_class, 20); +} + static struct batadv_gw_node * batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) { @@ -2823,6 +2833,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .del_if = batadv_iv_ogm_orig_del_if, }, .gw = { + .init_sel_class = batadv_iv_init_sel_class, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, #ifdef CONFIG_BATMAN_ADV_DEBUGFS diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 2ac612d7bab4..2e2471ca84e3 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -668,6 +668,16 @@ err_ifinfo1: return ret; } +/** + * batadv_v_init_sel_class - initialize GW selection class + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) +{ + /* set default throughput difference threshold to 5Mbps */ + atomic_set(&bat_priv->gw.sel_class, 50); +} + static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv, char *buff, size_t count) { @@ -1052,6 +1062,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { .dump = batadv_v_orig_dump, }, .gw = { + .init_sel_class = batadv_v_init_sel_class, .store_sel_class = batadv_v_store_sel_class, .show_sel_class = batadv_v_show_sel_class, .get_best_gw_node = batadv_v_gw_get_best_gw_node, @@ -1092,9 +1103,6 @@ int batadv_v_mesh_init(struct batadv_priv *bat_priv) if (ret < 0) return ret; - /* set default throughput difference threshold to 5Mbps */ - atomic_set(&bat_priv->gw.sel_class, 50); - return 0; } diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 21184810d89f..3e3f91ab694f 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -253,6 +253,11 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, */ void batadv_gw_init(struct batadv_priv *bat_priv) { + if (bat_priv->algo_ops->gw.init_sel_class) + bat_priv->algo_ops->gw.init_sel_class(bat_priv); + else + atomic_set(&bat_priv->gw.sel_class, 1); + batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1, NULL, BATADV_TVLV_GW, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 7b3494ae6ad9..2e0b3463ab4a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -820,7 +820,6 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); - 
atomic_set(&bat_priv->gw.sel_class, 20); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index e913aee28c98..5137d859694c 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1489,6 +1489,7 @@ struct batadv_algo_orig_ops { /** * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific) + * @init_sel_class: initialize GW selection class (optional) * @store_sel_class: parse and stores a new GW selection class (optional) * @show_sel_class: prints the current GW selection class (optional) * @get_best_gw_node: select the best GW from the list of available nodes @@ -1499,6 +1500,7 @@ struct batadv_algo_orig_ops { * @dump: dump gateways to a netlink socket (optional) */ struct batadv_algo_gw_ops { + void (*init_sel_class)(struct batadv_priv *bat_priv); ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, size_t count); ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff); From 9ce9f7999741f342eeffd036ab09213a2fa93040 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Mon, 13 Feb 2017 13:49:58 -0600 Subject: [PATCH 003/297] gpio: altera-a10sr: Set gpio_chip parent property Set the gpio_chip parent property since some recent functions such as devprop_gpiochip_set_names() can use it. Signed-off-by: Thor Thayer Signed-off-by: Linus Walleij --- drivers/gpio/gpio-altera-a10sr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-altera-a10sr.c b/drivers/gpio/gpio-altera-a10sr.c index 9e1a138fed53..16a8951b2bed 100644 --- a/drivers/gpio/gpio-altera-a10sr.c +++ b/drivers/gpio/gpio-altera-a10sr.c @@ -96,7 +96,7 @@ static int altr_a10sr_gpio_probe(struct platform_device *pdev) gpio->regmap = a10sr->regmap; gpio->gp = altr_a10sr_gc; - + gpio->gp.parent = pdev->dev.parent; gpio->gp.of_node = pdev->dev.of_node; ret = devm_gpiochip_add_data(&pdev->dev, &gpio->gp, gpio); From fa6256db033067b57318decdc1c583512a1f8f68 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 Feb 2017 02:02:06 +0300 Subject: [PATCH 004/297] gpio: mockup: return -EFAULT if copy_from_user() fails copy_from_user() returns the number of bytes remaining to be copied but we want to return negative error codes on failure.
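For illustration, a minimal sketch of the idiom this fix applies (example_write is a hypothetical handler, not code from this patch): copy_from_user() returns 0 on success or the number of bytes left uncopied, so any non-zero result has to be translated into -EFAULT instead of being returned directly:

static ssize_t example_write(struct file *file, const char __user *usr_buf,
			     size_t count, loff_t *ppos)
{
	char buf;

	/* copy_from_user() returns the number of bytes NOT copied
	 * (0 on success), never a negative errno; map any shortfall
	 * to -EFAULT before returning to the caller.
	 */
	if (copy_from_user(&buf, usr_buf, 1))
		return -EFAULT;

	return count;
}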
Fixes: 9202ba2397d1 ("gpio: mockup: implement event injecting over debugfs") Signed-off-by: Dan Carpenter Acked-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/gpio/gpio-mockup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c index 06dac72cb69c..d99338689213 100644 --- a/drivers/gpio/gpio-mockup.c +++ b/drivers/gpio/gpio-mockup.c @@ -197,7 +197,7 @@ static ssize_t gpio_mockup_event_write(struct file *file, struct seq_file *sfile; struct gpio_desc *desc; struct gpio_chip *gc; - int status, val; + int val; char buf; sfile = file->private_data; @@ -206,9 +206,8 @@ static ssize_t gpio_mockup_event_write(struct file *file, chip = priv->chip; gc = &chip->gc; - status = copy_from_user(&buf, usr_buf, 1); - if (status) - return status; + if (copy_from_user(&buf, usr_buf, 1)) + return -EFAULT; if (buf == '0') val = 0; From b115bebc07f282067eccc06fd5aa3060ab1426da Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Feb 2017 16:13:44 +0100 Subject: [PATCH 005/297] gpio: xgene: mark PM functions as __maybe_unused When CONFIG_PM_SLEEP is disabled, we get a warning about unused functions: drivers/gpio/gpio-xgene.c:155:12: warning: 'xgene_gpio_resume' defined but not used [-Wunused-function] static int xgene_gpio_resume(struct device *dev) ^~~~~~~~~~~~~~~~~ drivers/gpio/gpio-xgene.c:142:12: warning: 'xgene_gpio_suspend' defined but not used [-Wunused-function] static int xgene_gpio_suspend(struct device *dev) The warnings are harmless and can be avoided by simplifying the code and marking the functions as __maybe_unused. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Walleij --- drivers/gpio/gpio-xgene.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpio-xgene.c b/drivers/gpio/gpio-xgene.c index 40a8881c2ce8..f1c6ec17b90a 100644 --- a/drivers/gpio/gpio-xgene.c +++ b/drivers/gpio/gpio-xgene.c @@ -42,9 +42,7 @@ struct xgene_gpio { struct gpio_chip chip; void __iomem *base; spinlock_t lock; -#ifdef CONFIG_PM u32 set_dr_val[XGENE_MAX_GPIO_BANKS]; -#endif }; static int xgene_gpio_get(struct gpio_chip *gc, unsigned int offset) @@ -138,8 +136,7 @@ static int xgene_gpio_dir_out(struct gpio_chip *gc, return 0; } -#ifdef CONFIG_PM -static int xgene_gpio_suspend(struct device *dev) +static __maybe_unused int xgene_gpio_suspend(struct device *dev) { struct xgene_gpio *gpio = dev_get_drvdata(dev); unsigned long bank_offset; @@ -152,7 +149,7 @@ static int xgene_gpio_suspend(struct device *dev) return 0; } -static int xgene_gpio_resume(struct device *dev) +static __maybe_unused int xgene_gpio_resume(struct device *dev) { struct xgene_gpio *gpio = dev_get_drvdata(dev); unsigned long bank_offset; @@ -166,10 +163,6 @@ static int xgene_gpio_resume(struct device *dev) } static SIMPLE_DEV_PM_OPS(xgene_gpio_pm, xgene_gpio_suspend, xgene_gpio_resume); -#define XGENE_GPIO_PM_OPS (&xgene_gpio_pm) -#else -#define XGENE_GPIO_PM_OPS NULL -#endif static int xgene_gpio_probe(struct platform_device *pdev) { @@ -241,7 +234,7 @@ static struct platform_driver xgene_gpio_driver = { .name = "xgene-gpio", .of_match_table = xgene_gpio_of_match, .acpi_match_table = ACPI_PTR(xgene_gpio_acpi_match), - .pm = XGENE_GPIO_PM_OPS, + .pm = &xgene_gpio_pm, }, .probe = xgene_gpio_probe, }; From f759921cfbf4847319d197a6ed7c9534d593f8bc Mon Sep 17 00:00:00 2001 From: Phil Reid Date: Mon, 20 Feb 2017 09:41:45 +0800 Subject: [PATCH 006/297] gpio: altera: Use handle_level_irq when configured as a 
level_high When a threaded irq handler is attached to one of the gpio pins configured for a level irq, the altera_gpio_irq_leveL_high_handler does not mask the interrupt while it is being handled by the chained irq. This results in the threaded irq not getting enough cycles to complete quickly enough before the irq is disabled as faulty. handle_level_irq should be used in this situation instead of handle_simple_irq. In gpiochip_irqchip_add, set the default handler to handle_bad_irq as per Documentation/gpio/driver.txt. Then set the correct handler in the set_type callback. Signed-off-by: Phil Reid Signed-off-by: Linus Walleij --- drivers/gpio/gpio-altera.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/gpio-altera.c b/drivers/gpio/gpio-altera.c index 5bddbd507ca9..3fe6a21e05a5 100644 --- a/drivers/gpio/gpio-altera.c +++ b/drivers/gpio/gpio-altera.c @@ -90,21 +90,18 @@ static int altera_gpio_irq_set_type(struct irq_data *d, altera_gc = gpiochip_get_data(irq_data_get_irq_chip_data(d)); - if (type == IRQ_TYPE_NONE) + if (type == IRQ_TYPE_NONE) { + irq_set_handler_locked(d, handle_bad_irq); return 0; - if (type == IRQ_TYPE_LEVEL_HIGH && - altera_gc->interrupt_trigger == IRQ_TYPE_LEVEL_HIGH) + } + if (type == altera_gc->interrupt_trigger) { + if (type == IRQ_TYPE_LEVEL_HIGH) + irq_set_handler_locked(d, handle_level_irq); + else + irq_set_handler_locked(d, handle_simple_irq); return 0; - if (type == IRQ_TYPE_EDGE_RISING && - altera_gc->interrupt_trigger == IRQ_TYPE_EDGE_RISING) - return 0; - if (type == IRQ_TYPE_EDGE_FALLING && - altera_gc->interrupt_trigger == IRQ_TYPE_EDGE_FALLING) - return 0; - if (type == IRQ_TYPE_EDGE_BOTH && - altera_gc->interrupt_trigger == IRQ_TYPE_EDGE_BOTH) - return 0; - + } + irq_set_handler_locked(d, handle_bad_irq); return -EINVAL; } @@ -230,7 +227,6 @@ static void altera_gpio_irq_edge_handler(struct irq_desc *desc) chained_irq_exit(chip, desc); } - static void altera_gpio_irq_leveL_high_handler(struct irq_desc *desc) { struct altera_gpio_chip *altera_gc; @@ -310,7 +306,7 @@ static int altera_gpio_probe(struct platform_device *pdev) altera_gc->interrupt_trigger = reg; ret = gpiochip_irqchip_add(&altera_gc->mmchip.gc, &altera_irq_chip, 0, - handle_simple_irq, IRQ_TYPE_NONE); + handle_bad_irq, IRQ_TYPE_NONE); if (ret) { dev_err(&pdev->dev, "could not add irqchip\n"); From f2f10b7e722a75c6d75a7f7cd06b0eee3ae20f7c Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Fri, 17 Feb 2017 07:40:52 -0600 Subject: [PATCH 007/297] HID: chicony: Add support for another ASUS Zen AiO keyboard Add support for media keys on the keyboard that comes with the Asus V221ID and ZN241IC All In One computers. The keys to support here are WLAN, BRIGHTNESSDOWN and BRIGHTNESSUP. This device is not visibly branded as Chicony, and the USB Vendor ID suggests that it is a JESS device. However this seems like the right place to put it: the usage codes are identical to the currently supported devices, and this driver already supports the ASUS AIO keyboard AK1D. Signed-off-by: Daniel Drake Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 4 ++-- drivers/hid/hid-chicony.c | 1 + drivers/hid/hid-core.c | 1 + drivers/hid/hid-ids.h | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 1aeb80e52424..8eab3200ac9a 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -175,11 +175,11 @@ config HID_CHERRY Support for Cherry Cymotion keyboard.
config HID_CHICONY - tristate "Chicony Tactical pad" + tristate "Chicony devices" depends on HID default !EXPERT ---help--- - Support for Chicony Tactical pad. + Support for Chicony Tactical pad and special keys on Chicony keyboards. config HID_CORSAIR tristate "Corsair devices" diff --git a/drivers/hid/hid-chicony.c b/drivers/hid/hid-chicony.c index bc3cec199fee..f04ed9aabc3f 100644 --- a/drivers/hid/hid-chicony.c +++ b/drivers/hid/hid-chicony.c @@ -86,6 +86,7 @@ static const struct hid_device_id ch_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_WIRELESS2) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_AK1D) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_ACER_SWITCH12) }, + { HID_USB_DEVICE(USB_VENDOR_ID_JESS, USB_DEVICE_ID_JESS_ZEN_AIO_KBD) }, { } }; MODULE_DEVICE_TABLE(hid, ch_devices); diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index e9e87d337446..ae01ae601d74 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1910,6 +1910,7 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_MOUSE_A081) }, { HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_MOUSE_A0C2) }, { HID_USB_DEVICE(USB_VENDOR_ID_HUION, USB_DEVICE_ID_HUION_TABLET) }, + { HID_USB_DEVICE(USB_VENDOR_ID_JESS, USB_DEVICE_ID_JESS_ZEN_AIO_KBD) }, { HID_USB_DEVICE(USB_VENDOR_ID_JESS2, USB_DEVICE_ID_JESS2_COLOR_RUMBLE_PAD) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ION, USB_DEVICE_ID_ICADE) }, { HID_USB_DEVICE(USB_VENDOR_ID_KENSINGTON, USB_DEVICE_ID_KS_SLIMBLADE) }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 86c95d30ac80..b3df60da0297 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -557,6 +557,7 @@ #define USB_VENDOR_ID_JESS 0x0c45 #define USB_DEVICE_ID_JESS_YUREX 0x1010 +#define USB_DEVICE_ID_JESS_ZEN_AIO_KBD 0x5112 #define USB_VENDOR_ID_JESS2 0x0f30 #define USB_DEVICE_ID_JESS2_COLOR_RUMBLE_PAD 0x0111 From a687c5765b5ae19fe559e14615ddc87ebb46d409 Mon Sep 17 00:00:00 2001 From: Roderick Colenbrander Date: Fri, 24 Feb 2017 16:14:15 -0800 Subject: [PATCH 008/297] HID: sony: Fix input device leak when connecting a DS4 twice using USB/BT When a user connects a DS4 twice using USB and BT, we reject the second device connection after the setup work. We then perform a cleanup, but during cleanup we are not removing the touchpad device. This leads to leakage of an input device, which we would never remove. It can likely result in a kernel oops as well when the touchpad evdev node is accessed and the underlying HID device has been removed from the system. [jkosina@suse.cz: added stable annotation] Fixes: ac797b95f532 ("HID: sony: Make the DS4 touchpad a separate device") Cc: stable@vger.kernel.org Signed-off-by: Roderick Colenbrander Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index f405b07d0381..740996f9bdd4 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -2632,6 +2632,8 @@ err_stop: sony_leds_remove(sc); if (sc->quirks & SONY_BATTERY_SUPPORT) sony_battery_remove(sc); + if (sc->touchpad) + sony_unregister_touchpad(sc); sony_cancel_work_sync(sc); kfree(sc->output_report_dmabuf); sony_remove_dev_list(sc); From 68598d2ea886322f9b4b0058e5b288418622de95 Mon Sep 17 00:00:00 2001 From: "Dmitry V.
Levin" Date: Wed, 1 Mar 2017 02:12:50 +0300 Subject: [PATCH 009/297] btrfs: remove btrfs_err_str function from uapi/linux/btrfs.h btrfs_err_str function is not called from anywhere and is replicated in the userspace headers for btrfs-progs. It's removal also fixes the following linux/btrfs.h userspace compilation error: /usr/include/linux/btrfs.h: In function 'btrfs_err_str': /usr/include/linux/btrfs.h:740:11: error: 'NULL' undeclared (first use in this function) return NULL; Suggested-by: Jeff Mahoney Signed-off-by: Dmitry V. Levin Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index db4c253f8011..dcfc3a5a9cb1 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -713,33 +713,6 @@ enum btrfs_err_code { BTRFS_ERROR_DEV_ONLY_WRITABLE, BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS }; -/* An error code to error string mapping for the kernel -* error codes -*/ -static inline char *btrfs_err_str(enum btrfs_err_code err_code) -{ - switch (err_code) { - case BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET: - return "unable to go below two devices on raid1"; - case BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET: - return "unable to go below four devices on raid10"; - case BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET: - return "unable to go below two devices on raid5"; - case BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET: - return "unable to go below three devices on raid6"; - case BTRFS_ERROR_DEV_TGT_REPLACE: - return "unable to remove the dev_replace target dev"; - case BTRFS_ERROR_DEV_MISSING_NOT_FOUND: - return "no missing devices found to remove"; - case BTRFS_ERROR_DEV_ONLY_WRITABLE: - return "unable to remove the only writeable device"; - case BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS: - return "add/delete/balance/replace/resize operation "\ - "in progress"; - default: - return NULL; - } -} #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) From a04e54f2c35823ca32d56afcd5cea5b783e2f51a Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 3 Nov 2016 23:06:53 -0700 Subject: [PATCH 010/297] target/pscsi: Fix TYPE_TAPE + TYPE_MEDIMUM_CHANGER export The following fixes a divide by zero OOPs with TYPE_TAPE due to pscsi_tape_read_blocksize() failing causing a zero sd->sector_size being propigated up via dev_attrib.hw_block_size. It also fixes another long-standing bug where TYPE_TAPE and TYPE_MEDIMUM_CHANGER where using pscsi_create_type_other(), which does not call scsi_device_get() to take the device reference. Instead, rename pscsi_create_type_rom() to pscsi_create_type_nondisk() and use it for all cases. Finally, also drop a dump_stack() in pscsi_get_blocks() for non TYPE_DISK, which in modern target-core can get invoked via target_sense_desc_format() during CHECK_CONDITION. 
Reported-by: Malcolm Haak Cc: Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_pscsi.c | 47 ++++++++---------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index a8f8e53f2f57..44d92f23a3f0 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -154,7 +154,7 @@ static void pscsi_tape_read_blocksize(struct se_device *dev, buf = kzalloc(12, GFP_KERNEL); if (!buf) - return; + goto out_free; memset(cdb, 0, MAX_COMMAND_SIZE); cdb[0] = MODE_SENSE; @@ -169,9 +169,10 @@ static void pscsi_tape_read_blocksize(struct se_device *dev, * If MODE_SENSE still returns zero, set the default value to 1024. */ sdev->sector_size = (buf[9] << 16) | (buf[10] << 8) | (buf[11]); +out_free: if (!sdev->sector_size) sdev->sector_size = 1024; -out_free: + kfree(buf); } @@ -314,9 +315,10 @@ static int pscsi_add_device_to_list(struct se_device *dev, sd->lun, sd->queue_depth); } - dev->dev_attrib.hw_block_size = sd->sector_size; + dev->dev_attrib.hw_block_size = + min_not_zero((int)sd->sector_size, 512); dev->dev_attrib.hw_max_sectors = - min_t(int, sd->host->max_sectors, queue_max_hw_sectors(q)); + min_not_zero(sd->host->max_sectors, queue_max_hw_sectors(q)); dev->dev_attrib.hw_queue_depth = sd->queue_depth; /* @@ -339,8 +341,10 @@ static int pscsi_add_device_to_list(struct se_device *dev, /* * For TYPE_TAPE, attempt to determine blocksize with MODE_SENSE. */ - if (sd->type == TYPE_TAPE) + if (sd->type == TYPE_TAPE) { pscsi_tape_read_blocksize(dev, sd); + dev->dev_attrib.hw_block_size = sd->sector_size; + } return 0; } @@ -406,7 +410,7 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd) /* * Called with struct Scsi_Host->host_lock called. */ -static int pscsi_create_type_rom(struct se_device *dev, struct scsi_device *sd) +static int pscsi_create_type_nondisk(struct se_device *dev, struct scsi_device *sd) __releases(sh->host_lock) { struct pscsi_hba_virt *phv = dev->se_hba->hba_ptr; @@ -433,28 +437,6 @@ static int pscsi_create_type_rom(struct se_device *dev, struct scsi_device *sd) return 0; } -/* - * Called with struct Scsi_Host->host_lock called. 
- */ -static int pscsi_create_type_other(struct se_device *dev, - struct scsi_device *sd) - __releases(sh->host_lock) -{ - struct pscsi_hba_virt *phv = dev->se_hba->hba_ptr; - struct Scsi_Host *sh = sd->host; - int ret; - - spin_unlock_irq(sh->host_lock); - ret = pscsi_add_device_to_list(dev, sd); - if (ret) - return ret; - - pr_debug("CORE_PSCSI[%d] - Added Type: %s for %d:%d:%d:%llu\n", - phv->phv_host_id, scsi_device_type(sd->type), sh->host_no, - sd->channel, sd->id, sd->lun); - return 0; -} - static int pscsi_configure_device(struct se_device *dev) { struct se_hba *hba = dev->se_hba; @@ -542,11 +524,8 @@ static int pscsi_configure_device(struct se_device *dev) case TYPE_DISK: ret = pscsi_create_type_disk(dev, sd); break; - case TYPE_ROM: - ret = pscsi_create_type_rom(dev, sd); - break; default: - ret = pscsi_create_type_other(dev, sd); + ret = pscsi_create_type_nondisk(dev, sd); break; } @@ -611,8 +590,7 @@ static void pscsi_free_device(struct se_device *dev) else if (pdv->pdv_lld_host) scsi_host_put(pdv->pdv_lld_host); - if ((sd->type == TYPE_DISK) || (sd->type == TYPE_ROM)) - scsi_device_put(sd); + scsi_device_put(sd); pdv->pdv_sd = NULL; } @@ -1064,7 +1042,6 @@ static sector_t pscsi_get_blocks(struct se_device *dev) if (pdv->pdv_bd && pdv->pdv_bd->bd_part) return pdv->pdv_bd->bd_part->nr_sects; - dump_stack(); return 0; } From 13603685c1f12c67a7a2427f00b63f39a2b6f7c9 Mon Sep 17 00:00:00 2001 From: Max Lohrmann Date: Tue, 7 Mar 2017 22:09:56 -0800 Subject: [PATCH 011/297] target: Fix VERIFY_16 handling in sbc_parse_cdb As reported by Max, the Windows 2008 R2 chkdsk utility expects VERIFY_16 to be supported, and does not handle the returned CHECK_CONDITION properly, resulting in an infinite loop. The kernel will log huge amounts of this error: kernel: TARGET_CORE[iSCSI]: Unsupported SCSI Opcode 0x8f, sending CHECK_CONDITION. Signed-off-by: Max Lohrmann Cc: Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_sbc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index 68d8aef7ab78..c194063f169b 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -1105,9 +1105,15 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops) return ret; break; case VERIFY: + case VERIFY_16: size = 0; - sectors = transport_get_sectors_10(cdb); - cmd->t_task_lba = transport_lba_32(cdb); + if (cdb[0] == VERIFY) { + sectors = transport_get_sectors_10(cdb); + cmd->t_task_lba = transport_lba_32(cdb); + } else { + sectors = transport_get_sectors_16(cdb); + cmd->t_task_lba = transport_lba_64(cdb); + } cmd->execute_cmd = sbc_emulate_noop; goto check_lba; case REZERO_UNIT: From 7b4fdf77a450ec0fdcb2f677b080ddbf2c186544 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Mar 2017 21:44:00 +0100 Subject: [PATCH 012/297] netfilter: don't track fragmented packets Andrey reports a syzkaller splat caused by NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); in ipv4 nat. But this assertion (and the comment) is wrong: this function does see fragments when the IP_NODEFRAG setsockopt is used. As conntrack doesn't track packets without a complete l4 header, only the first fragment is tracked. Because applying nat to the first packet but not the rest makes no sense, this also turns off tracking of all fragments.
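For context, a hedged userspace sketch of how such fragments are generated in the first place (IP_NODEFRAG is Linux-specific, declared in <linux/in.h>; error handling omitted):

	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
	int one = 1;

	/* With IP_NODEFRAG set, locally generated packets skip the
	 * defrag hooks, so conntrack/NAT on LOCAL_OUT can legitimately
	 * see skbs for which ip_is_fragment() is true.
	 */
	setsockopt(fd, IPPROTO_IP, IP_NODEFRAG, &one, sizeof(one));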
Reported-by: Andrey Konovalov Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 4 ++++ net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index bc1486f2c064..2e14ed11a35c 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -165,6 +165,10 @@ static unsigned int ipv4_conntrack_local(void *priv, if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; + + if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */ + return NF_ACCEPT; + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index f8aad03d674b..6f5e8d01b876 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -255,11 +255,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); - /* We never see fragments: conntrack defrags on pre-routing - * and local-out, and nf_nat_out protects post-routing. - */ - NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); - ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to From 8e05ba7f848475bdc3aa546cf88418f7e51a6671 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Sat, 4 Mar 2017 18:00:02 +0800 Subject: [PATCH 013/297] netfilter: nf_nat_sctp: fix ICMP packet to be dropped accidentally According to RFC 792, the first 64 bits of the original SCTP datagram's data could be contained in an ICMP packet, such as:

 0                   1                   2                   3
 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|     Type      |     Code      |           Checksum            |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|                             unused                            |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|      Internet Header + 64 bits of Original Data Datagram      |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

However, according to RFC 4960, the SCTP datagram header is as below:

 0                   1                   2                   3
 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|       Source Port Number      |    Destination Port Number    |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|                        Verification Tag                       |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|                            Checksum                           |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

It means only the first three fields of the SCTP header, i.e. everything except the Checksum field, can be carried in an ICMP packet. At present in sctp_manip_pkt(), no matter whether the packet is ICMP or not, it always calculates the SCTP packet checksum. However, not only is the calculation of the checksum unnecessary for ICMP, but it also causes another fatal issue: the ICMP packet is dropped. The full SCTP header size is used as the writeable length passed to skb_make_writable() in sctp_manip_pkt(). But when it deals with an ICMP packet, skb_make_writable() directly returns false as the requested writeable length is bigger than skb->len. Subsequently the ICMP packet is dropped. Now we correct this misbehavior.
When sctp_manip_pkt() handles an ICMP packet, 8 bytes rather than the whole SCTP header size are used to check whether the writeable length of the skb would be exceeded. Meanwhile, as it's meaningless to calculate the checksum when the packet is ICMP, the computation of the checksum is skipped as well. Signed-off-by: Ying Xue Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_proto_sctp.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index 31d358691af0..804e8a0ab36e 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -33,8 +33,16 @@ sctp_manip_pkt(struct sk_buff *skb, enum nf_nat_manip_type maniptype) { sctp_sctphdr_t *hdr; + int hdrsize = 8; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + /* This could be an inner header returned in icmp packet; in such + * cases we cannot update the checksum field since it is outside + * of the 8 bytes of transport layer headers we are guaranteed. + */ + if (skb->len >= hdroff + sizeof(*hdr)) + hdrsize = sizeof(*hdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); @@ -47,6 +55,9 @@ sctp_manip_pkt(struct sk_buff *skb, hdr->dest = tuple->dst.u.sctp.port; } + if (hdrsize < sizeof(*hdr)) + return true; + if (skb->ip_summed != CHECKSUM_PARTIAL) { hdr->checksum = sctp_compute_cksum(skb, hdroff); skb->ip_summed = CHECKSUM_NONE; From 568af6de058cb2b0c5b98d98ffcf37cdc6bc38a7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Mar 2017 19:53:47 +0100 Subject: [PATCH 014/297] netfilter: nf_tables: set pktinfo->thoff at AH header if found Phil Sutter reports that IPv6 AH header matching is broken. From userspace, nft generates bytecode that expects to find the AH header at NFT_PAYLOAD_TRANSPORT_HEADER both for IPv4 and IPv6. However, pktinfo->thoff is set to the inner header after the AH header in IPv6, while in IPv4 pktinfo->thoff indeed points to the AH header. This behaviour is inconsistent. This patch fixes this problem by updating ipv6_find_hdr() to get the IP6_FH_F_AUTH flag so this function stops at the AH header, so both IPv4 and IPv6 pktinfo->thoff point to the AH header. Without this fix, matching encapsulated headers is also inconsistent: 1) A packet that looks like IPv4 + AH + TCP dport 22 will *not* match. 2) A packet that looks like IPv6 + AH + TCP dport 22 will match.
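To make the flag semantics concrete, a sketch of the lookup the patch changes (mirroring the declarations used in the diff below): with IP6_FH_F_AUTH set, ipv6_find_hdr() stops at the AH header instead of skipping over it:

	unsigned int flags = IP6_FH_F_AUTH;
	unsigned short frag_off;
	int protohdr, thoff = 0;

	/* For an IPv6 + AH + TCP packet this yields IPPROTO_AH with
	 * thoff at the AH header; with a NULL flags pointer it would
	 * return IPPROTO_TCP with thoff already past the AH header.
	 */
	protohdr = ipv6_find_hdr(skb, &thoff, -1, &frag_off, &flags);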
Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv6.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index d150b5066201..97983d1c05e4 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -9,12 +9,13 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { + unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; nft_set_pktinfo(pkt, skb, state); - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) { nft_set_pktinfo_proto_unspec(pkt, skb); return; @@ -32,6 +33,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_IPV6) + unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; @@ -50,7 +52,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, if (pkt_len + sizeof(*ip6h) > skb->len) return -1; - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) return -1; From 89cf83d4e065ff9fbd2ddc674489c8058eeca758 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 16 Feb 2017 12:54:41 +0000 Subject: [PATCH 015/297] drm/i915: Squelch any ktime/jiffie rounding errors for wait-ioctl We wait upon jiffies, but report the time elapsed using a high-resolution timer. This discrepancy can lead to us timing out the wait prior to us reporting the elapsed time as complete. This restores the squelching lost in commit e95433c73a11 ("drm/i915: Rearrange i915_wait_request() accounting with callers"). Fixes: e95433c73a11 ("drm/i915: Rearrange i915_wait_request() accounting with callers") Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Joonas Lahtinen Cc: # v4.10-rc1+ Cc: stable@vger.kernel.org Link: http://patchwork.freedesktop.org/patch/msgid/20170216125441.30923-1-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen (cherry picked from commit c1d2061b28c2aa25ec39b60d9c248e6beebd7315) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 6908123162d1..c45af09555dc 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3029,6 +3029,16 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start)); if (args->timeout_ns < 0) args->timeout_ns = 0; + + /* + * Apparently ktime isn't accurate enough and occasionally has a + * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch + * things up to make the test happy. We allow up to 1 jiffy. + * + * This is a regression from the timespec->ktime conversion. 
+ */ + if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns)) + args->timeout_ns = 0; } i915_gem_object_put(obj); From 1d972d6021a1388021df51a58248e68372ce2b5d Mon Sep 17 00:00:00 2001 From: Ander Conselvan de Oliveira Date: Thu, 23 Feb 2017 09:15:57 +0200 Subject: [PATCH 016/297] drm/i915/glk: Fix watermark computations for third sprite plane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Geminilake has a third sprite plane (or fourth universal plane) that is independent from the cursor. Make sure that for_each_plane_id_on_crtc() is aware of that extra plane so that the watermark code takes it into account. Fixes: e9c9882556fc ("drm/i915/glk: Configure number of sprite planes properly") Cc: Ander Conselvan de Oliveira Cc: Rodrigo Vivi Cc: Daniel Vetter Cc: Jani Nikula Cc: intel-gfx@lists.freedesktop.org Cc: Signed-off-by: Ander Conselvan de Oliveira Reviewed-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/20170223071600.14356-2-ander.conselvan.de.oliveira@intel.com (cherry picked from commit 19c3164db457e0fc65d4501fd354506228576241) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_drv.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0a4b42d31391..7febe6eecf72 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -293,6 +293,7 @@ enum plane_id { PLANE_PRIMARY, PLANE_SPRITE0, PLANE_SPRITE1, + PLANE_SPRITE2, PLANE_CURSOR, I915_MAX_PLANES, }; From b717a0392530ae8da0da041abe5c3a6098b55660 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 24 Feb 2017 11:43:06 +0000 Subject: [PATCH 017/297] drm/i915/fbdev: Stop repeating tile configuration on stagnation If we cease making progress in finding matching outputs for a tiled configuration, stop looping over the remaining unconfigured outputs. v2: Use conn_seq (instead of pass) to only apply tile configuration on first pass. Fixes: b0ee9e7fa5b4 ("drm/fb: add support for tiled monitor configurations. 
(v2)") Signed-off-by: Chris Wilson Cc: Tomasz Lis Cc: Dave Airlie Cc: Daniel Vetter Cc: Jani Nikula Cc: Sean Paul Cc: # v3.19+ Reviewed-by: Tomasz Lis Link: http://patchwork.freedesktop.org/patch/msgid/20170224114306.4400-1-chris@chris-wilson.co.uk (cherry picked from commit 754a76591b12c88f57ad8b4ca533a5c9566a1922) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_fbdev.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c index 1b8ba2e77539..2d449fb5d1d2 100644 --- a/drivers/gpu/drm/i915/intel_fbdev.c +++ b/drivers/gpu/drm/i915/intel_fbdev.c @@ -357,14 +357,13 @@ static bool intel_fb_initial_config(struct drm_fb_helper *fb_helper, bool *enabled, int width, int height) { struct drm_i915_private *dev_priv = to_i915(fb_helper->dev); - unsigned long conn_configured, mask; + unsigned long conn_configured, conn_seq, mask; unsigned int count = min(fb_helper->connector_count, BITS_PER_LONG); int i, j; bool *save_enabled; bool fallback = true; int num_connectors_enabled = 0; int num_connectors_detected = 0; - int pass = 0; save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL); if (!save_enabled) @@ -374,6 +373,7 @@ static bool intel_fb_initial_config(struct drm_fb_helper *fb_helper, mask = BIT(count) - 1; conn_configured = 0; retry: + conn_seq = conn_configured; for (i = 0; i < count; i++) { struct drm_fb_helper_connector *fb_conn; struct drm_connector *connector; @@ -387,7 +387,7 @@ retry: if (conn_configured & BIT(i)) continue; - if (pass == 0 && !connector->has_tile) + if (conn_seq == 0 && !connector->has_tile) continue; if (connector->status == connector_status_connected) @@ -498,10 +498,8 @@ retry: conn_configured |= BIT(i); } - if ((conn_configured & mask) != mask) { - pass++; + if ((conn_configured & mask) != mask && conn_configured != conn_seq) goto retry; - } /* * If the BIOS didn't enable everything it could, fall back to have the From 8c9923707f30ff56d9fd242053594b18f38d8036 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 27 Feb 2017 12:26:54 +0000 Subject: [PATCH 018/297] drm/i915: Remove the vma from the drm_mm if binding fails As we track whether a vma has been inserted into the drm_mm using the vma->flags, if we fail to bind the vma into the GTT we do not update those bits and will attempt to reinsert the vma into the drm_mm on future passes. To prevent that, we want to unwind i915_vma_insert() if we fail in our attempt to bind. 
Fixes: 59bfa1248e22 ("drm/i915: Start passing around i915_vma from execbuffer") Testcase: igt/drv_selftest/live_gtt Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Joonas Lahtinen Cc: # v4.9+ Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20170227122654.27651-3-chris@chris-wilson.co.uk (cherry picked from commit 31c7effa39f21f0fea1b3250ae9ff32b9c7e1ae5) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_vma.c | 57 +++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 155906e84812..df20e9bc1c0f 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -512,10 +512,36 @@ err_unpin: return ret; } +static void +i915_vma_remove(struct i915_vma *vma) +{ + struct drm_i915_gem_object *obj = vma->obj; + + GEM_BUG_ON(!drm_mm_node_allocated(&vma->node)); + GEM_BUG_ON(vma->flags & (I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND)); + + drm_mm_remove_node(&vma->node); + list_move_tail(&vma->vm_link, &vma->vm->unbound_list); + + /* Since the unbound list is global, only move to that list if + * no more VMAs exist. + */ + if (--obj->bind_count == 0) + list_move_tail(&obj->global_link, + &to_i915(obj->base.dev)->mm.unbound_list); + + /* And finally now the object is completely decoupled from this vma, + * we can drop its hold on the backing storage and allow it to be + * reaped by the shrinker. + */ + i915_gem_object_unpin_pages(obj); + GEM_BUG_ON(atomic_read(&obj->mm.pages_pin_count) < obj->bind_count); +} + int __i915_vma_do_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) { - unsigned int bound = vma->flags; + const unsigned int bound = vma->flags; int ret; lockdep_assert_held(&vma->vm->i915->drm.struct_mutex); @@ -524,18 +550,18 @@ int __i915_vma_do_pin(struct i915_vma *vma, if (WARN_ON(bound & I915_VMA_PIN_OVERFLOW)) { ret = -EBUSY; - goto err; + goto err_unpin; } if ((bound & I915_VMA_BIND_MASK) == 0) { ret = i915_vma_insert(vma, size, alignment, flags); if (ret) - goto err; + goto err_unpin; } ret = i915_vma_bind(vma, vma->obj->cache_level, flags); if (ret) - goto err; + goto err_remove; if ((bound ^ vma->flags) & I915_VMA_GLOBAL_BIND) __i915_vma_set_map_and_fenceable(vma); @@ -544,7 +570,12 @@ int __i915_vma_do_pin(struct i915_vma *vma, GEM_BUG_ON(i915_vma_misplaced(vma, size, alignment, flags)); return 0; -err: +err_remove: + if ((bound & I915_VMA_BIND_MASK) == 0) { + GEM_BUG_ON(vma->pages); + i915_vma_remove(vma); + } +err_unpin: __i915_vma_unpin(vma); return ret; } @@ -657,9 +688,6 @@ int i915_vma_unbind(struct i915_vma *vma) } vma->flags &= ~(I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND); - drm_mm_remove_node(&vma->node); - list_move_tail(&vma->vm_link, &vma->vm->unbound_list); - if (vma->pages != obj->mm.pages) { GEM_BUG_ON(!vma->pages); sg_free_table(vma->pages); @@ -667,18 +695,7 @@ int i915_vma_unbind(struct i915_vma *vma) } vma->pages = NULL; - /* Since the unbound list is global, only move to that list if - * no more VMAs exist. */ - if (--obj->bind_count == 0) - list_move_tail(&obj->global_link, - &to_i915(obj->base.dev)->mm.unbound_list); - - /* And finally now the object is completely decoupled from this vma, - * we can drop its hold on the backing storage and allow it to be - * reaped by the shrinker. 
- */ - i915_gem_object_unpin_pages(obj); - GEM_BUG_ON(atomic_read(&obj->mm.pages_pin_count) < obj->bind_count); + i915_vma_remove(vma); destroy: if (unlikely(i915_vma_is_closed(vma))) From 34dc8993eef63681b062871413a9484008a2a78f Mon Sep 17 00:00:00 2001 From: Mika Kuoppala Date: Wed, 15 Feb 2017 15:52:59 +0200 Subject: [PATCH 019/297] drm/i915: Avoid tweaking evaluation thresholds on Baytrail v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Certain Baytrails, namely the 4 cpu core variants, have been plagued by spurious system hangs, mostly occurring with light loads. Multiple bisects by various people point to a commit which changes the reclocking strategy for Baytrail to follow its bigger brethren: commit 8fb55197e64d ("drm/i915: Agressive downclocking on Baytrail") There is also a review comment attached to this commit from Deepak S on avoiding punit access on Cherryview, and thus it was excluded from the common reclocking path. By taking the same approach and omitting the punit access by not tweaking the thresholds when the hardware has been asked to move to a different frequency, considerable gains in stability have been observed. With a J1900 box, a light render/video load would usually end up in a system hang in less than 12 hours. With this patch applied, the cumulative uptime has now been 34 days without issues. To provoke the system hang, light loads on both render and bsd engines in parallel have been used: glxgears >/dev/null 2>/dev/null & mpv --vo=vaapi --hwdec=vaapi --loop=inf vid.mp4 So far, the author has not witnessed a system hang with the above load and this patch applied. Reports from the tenacious people at kernel bugzilla are also promising. Considering that the punit access frequency with this patch is considerably less, there is a possibility that this will push the, still unknown, root cause past the triggering point on most loads. But as we can now reliably reproduce the hang independently, we can reduce the pain that users are having and use static thresholds until a root cause is found. v3: don't break debugfs and simplification (Chris Wilson) References: https://bugzilla.kernel.org/show_bug.cgi?id=109051 Cc: Chris Wilson Cc: Ville Syrjälä Cc: Len Brown Cc: Daniel Vetter Cc: Jani Nikula Cc: fritsch@xbmc.org Cc: miku@iki.fi Cc: Ezequiel Garcia CC: Michal Feix Cc: Hans de Goede Cc: Deepak S Cc: Jarkko Nikula Cc: # v4.2+ Acked-by: Daniel Vetter Acked-by: Chris Wilson Signed-off-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/1487166779-26945-1-git-send-email-mika.kuoppala@intel.com (cherry picked from commit 6067a27d1f0184596d51decbac1c1fdc4acb012f) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_pm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 249623d45be0..65cd4c56c9dd 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -4891,6 +4891,12 @@ static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val) break; } + /* When byt can survive without system hang with dynamic + * sw freq adjustments, this restriction can be lifted.
+ */ + if (IS_VALLEYVIEW(dev_priv)) + goto skip_hw_write; + I915_WRITE(GEN6_RP_UP_EI, GT_INTERVAL_FROM_US(dev_priv, ei_up)); I915_WRITE(GEN6_RP_UP_THRESHOLD, @@ -4911,6 +4917,7 @@ static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val) GEN6_RP_UP_BUSY_AVG | GEN6_RP_DOWN_IDLE_AVG); +skip_hw_write: dev_priv->rps.power = new_power; dev_priv->rps.up_threshold = threshold_up; dev_priv->rps.down_threshold = threshold_down; From d253371c4c2f5fc2d884ef25f64decd7549aff5a Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 Feb 2017 16:32:10 +0200 Subject: [PATCH 020/297] drm/i915/gen9: Increase PCODE request timeout to 50ms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 2c7d0602c815277f7cb7c932b091288710d8aba7 Author: Imre Deak Date: Mon Dec 5 18:27:37 2016 +0200 drm/i915/gen9: Fix PCODE polling during CDCLK change notification there is still one report of the CDCLK-change request timing out on a KBL machine, see the Reference link. On that machine the maximum time the request took to succeed was 34ms, so increase the timeout to 50ms. v2: - Change timeout from 100 to 50 ms to maintain the current 50 ms limit for atomic waits in the driver. (Chris, Tvrtko) Reference: https://bugs.freedesktop.org/show_bug.cgi?id=99345 Cc: Ville Syrjälä Cc: Chris Wilson Cc: Tvrtko Ursulin Cc: Signed-off-by: Imre Deak Acked-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1487946730-17162-1-git-send-email-imre.deak@intel.com (cherry picked from commit 0129936ddda26afd5d9d207c4e86b2425952579f) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_pm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 65cd4c56c9dd..940bab22d464 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -7923,10 +7923,10 @@ static bool skl_pcode_try_request(struct drm_i915_private *dev_priv, u32 mbox, * @timeout_base_ms: timeout for polling with preemption enabled * * Keep resending the @request to @mbox until PCODE acknowledges it, PCODE - * reports an error or an overall timeout of @timeout_base_ms+10 ms expires. + * reports an error or an overall timeout of @timeout_base_ms+50 ms expires. * The request is acknowledged once the PCODE reply dword equals @reply after * applying @reply_mask. Polling is first attempted with preemption enabled - * for @timeout_base_ms and if this times out for another 10 ms with + * for @timeout_base_ms and if this times out for another 50 ms with * preemption disabled. * * Returns 0 on success, %-ETIMEDOUT in case of a timeout, <0 in case of some @@ -7962,14 +7962,15 @@ int skl_pcode_request(struct drm_i915_private *dev_priv, u32 mbox, u32 request, * worst case) _and_ PCODE was busy for some reason even after a * (queued) request and @timeout_base_ms delay. As a workaround retry * the poll with preemption disabled to maximize the number of - * requests. Increase the timeout from @timeout_base_ms to 10ms to + * requests. Increase the timeout from @timeout_base_ms to 50ms to * account for interrupts that could reduce the number of these - * requests. + * requests, and for any quirks of the PCODE firmware that delays + * the request completion. 
*/ DRM_DEBUG_KMS("PCODE timeout, retrying with preemption disabled\n"); WARN_ON_ONCE(timeout_base_ms > 3); preempt_disable(); - ret = wait_for_atomic(COND, 10); + ret = wait_for_atomic(COND, 50); preempt_enable(); out: From 38230243ef316ac696956d75dc78a22e3aa789b9 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Tue, 28 Feb 2017 15:28:47 +0100 Subject: [PATCH 021/297] drm/i915: Move updating color management to before vblank evasion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This cannot be done reliably during vblank evasion since the color management registers are not double buffered. The original commit, which moved it to always run during vblank evasion, was wrong, so revert it to before vblank evasion again. Signed-off-by: Maarten Lankhorst Fixes: 20a34e78f0d7 ("drm/i915: Update color management during vblank evasion.") Cc: stable@vger.kernel.org # v4.7+ Link: http://patchwork.freedesktop.org/patch/msgid/1488292128-14540-1-git-send-email-maarten.lankhorst@linux.intel.com Reviewed-by: Ville Syrjälä (cherry picked from commit 567f0792a6ad11c0c2620944b8eeb777359fb85a) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 01341670738f..9a8b6a13233d 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -14946,17 +14946,19 @@ static void intel_begin_crtc_commit(struct drm_crtc *crtc, to_intel_atomic_state(old_crtc_state->state); bool modeset = needs_modeset(crtc->state); + if (!modeset && + (intel_cstate->base.color_mgmt_changed || + intel_cstate->update_pipe)) { + intel_color_set_csc(crtc->state); + intel_color_load_luts(crtc->state); + } + /* Perform vblank evasion around commit operation */ intel_pipe_update_start(intel_crtc); if (modeset) goto out; - if (crtc->state->color_mgmt_changed || to_intel_crtc_state(crtc->state)->update_pipe) { - intel_color_set_csc(crtc->state); - intel_color_load_luts(crtc->state); - } - if (intel_cstate->update_pipe) intel_update_pipe_config(intel_crtc, old_intel_cstate); else if (INTEL_GEN(dev_priv) >= 9) From 0d9dc306e15b59bf50db87ebcb1e2248586d4733 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 7 Mar 2017 13:20:31 +0000 Subject: [PATCH 022/297] drm/i915: Store a permanent error in obj->mm.pages Once the object has been truncated, it is unrecoverable. To facilitate detection of this state, store the error in obj->mm.pages. This is required for the next patch which should be applied to v4.10 (via stable), so we also need to mark this patch for backporting. In that regard, let's consider this to be a fix/improvement too. v2: Avoid dereferencing the ERR_PTR when freeing the object.
Fixes: 1233e2db199d ("drm/i915: Move object backing storage manipulation to its own locking") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: # v4.10+ Link: http://patchwork.freedesktop.org/patch/msgid/20170307132031.32461-1-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen (cherry picked from commit 4e5462ee843c883790e9609cf560d88960ea4227) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c45af09555dc..3591e8656ff9 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2119,6 +2119,7 @@ i915_gem_object_truncate(struct drm_i915_gem_object *obj) */ shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1); obj->mm.madv = __I915_MADV_PURGED; + obj->mm.pages = ERR_PTR(-EFAULT); } /* Try to discard unwanted pages */ @@ -2218,7 +2219,9 @@ void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj, __i915_gem_object_reset_page_iter(obj); - obj->ops->put_pages(obj, pages); + if (!IS_ERR(pages)) + obj->ops->put_pages(obj, pages); + unlock: mutex_unlock(&obj->mm.lock); } @@ -2437,7 +2440,7 @@ int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj) if (err) return err; - if (unlikely(!obj->mm.pages)) { + if (unlikely(IS_ERR_OR_NULL(obj->mm.pages))) { err = ____i915_gem_object_get_pages(obj); if (err) goto unlock; @@ -2515,7 +2518,7 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, pinned = true; if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) { - if (unlikely(!obj->mm.pages)) { + if (unlikely(IS_ERR_OR_NULL(obj->mm.pages))) { ret = ____i915_gem_object_get_pages(obj); if (ret) goto err_unlock; From 4e6fdafa7ac395ad47a80a0e7b4fd1e11550f862 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 7 Mar 2017 12:03:38 +0000 Subject: [PATCH 023/297] drm/i915: Use pagecache write to prepopulate shmemfs from pwrite-ioctl Before we instantiate/pin the backing store for our use, we can prepopulate the shmemfs filp efficiently using a write into the pagecache. We avoid the penalty of instantiating all the pages, important if the user is just writing to a few and never uses the object on the GPU, and using a direct write into shmemfs allows it to avoid the cost of retrieving a page (mostly the clear-before-use, but in theory we could curtail swapin) before it is overwritten. This can be extended later to provide additional specialisation for other backends (other than shmemfs). For now it provides a defense against very large write-only allocations from exhausting all of system memory. v2: Smelling fixes. 
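For reference, a hedged userspace sketch of the ioctl path this patch services (assumes an open i915 DRM fd, a GEM handle, and a data buffer; error handling omitted):

	struct drm_i915_gem_pwrite pw = {
		.handle   = handle,	/* from DRM_IOCTL_I915_GEM_CREATE */
		.offset   = 0,
		.size     = len,
		.data_ptr = (__u64)(uintptr_t)data,
	};

	/* With this patch, a pwrite into a never-instantiated object is
	 * serviced through the shmemfs pagecache instead of forcing all
	 * backing pages to be allocated and cleared first.
	 */
	ioctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pw);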
Fixes: fe115628d567 ("drm/i915: Implement pwrite without struct-mutex") References: https://bugs.freedesktop.org/show_bug.cgi?id=99107 Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Joonas Lahtinen Cc: Mika Kuoppala Cc: # v4.10+ Reviewed-by: Joonas Lahtinen Reviewed-by: Tvrtko Ursulin Link: http://patchwork.freedesktop.org/patch/msgid/20170307120338.7277-2-chris@chris-wilson.co.uk (cherry picked from commit 7c55e2c5772dcf3cbacd0fa2bcfeefae416b73f7) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 78 ++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_gem_object.h | 3 + 2 files changed, 81 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 3591e8656ff9..10777da73039 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1434,6 +1434,12 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, trace_i915_gem_object_pwrite(obj, args->offset, args->size); + ret = -ENODEV; + if (obj->ops->pwrite) + ret = obj->ops->pwrite(obj, args); + if (ret != -ENODEV) + goto err; + ret = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL, @@ -2566,6 +2572,75 @@ err_unlock: goto out_unlock; } +static int +i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj, + const struct drm_i915_gem_pwrite *arg) +{ + struct address_space *mapping = obj->base.filp->f_mapping; + char __user *user_data = u64_to_user_ptr(arg->data_ptr); + u64 remain, offset; + unsigned int pg; + + /* Before we instantiate/pin the backing store for our use, we + * can prepopulate the shmemfs filp efficiently using a write into + * the pagecache. We avoid the penalty of instantiating all the + * pages, important if the user is just writing to a few and never + * uses the object on the GPU, and using a direct write into shmemfs + * allows it to avoid the cost of retrieving a page (either swapin + * or clearing-before-use) before it is overwritten. + */ + if (READ_ONCE(obj->mm.pages)) + return -ENODEV; + + /* Before the pages are instantiated the object is treated as being + * in the CPU domain. The pages will be clflushed as required before + * use, and we can freely write into the pages directly. If userspace + * races pwrite with any other operation; corruption will ensue - + * that is userspace's prerogative! 
+ */ + + remain = arg->size; + offset = arg->offset; + pg = offset_in_page(offset); + + do { + unsigned int len, unwritten; + struct page *page; + void *data, *vaddr; + int err; + + len = PAGE_SIZE - pg; + if (len > remain) + len = remain; + + err = pagecache_write_begin(obj->base.filp, mapping, + offset, len, 0, + &page, &data); + if (err < 0) + return err; + + vaddr = kmap(page); + unwritten = copy_from_user(vaddr + pg, user_data, len); + kunmap(page); + + err = pagecache_write_end(obj->base.filp, mapping, + offset, len, len - unwritten, + page, data); + if (err < 0) + return err; + + if (unwritten) + return -EFAULT; + + remain -= len; + user_data += len; + offset += len; + pg = 0; + } while (remain); + + return 0; +} + static bool ban_context(const struct i915_gem_context *ctx) { return (i915_gem_context_is_bannable(ctx) && @@ -3987,8 +4062,11 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, static const struct drm_i915_gem_object_ops i915_gem_object_ops = { .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE | I915_GEM_OBJECT_IS_SHRINKABLE, + .get_pages = i915_gem_object_get_pages_gtt, .put_pages = i915_gem_object_put_pages_gtt, + + .pwrite = i915_gem_object_pwrite_gtt, }; struct drm_i915_gem_object * diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h index bf90b07163d1..76b80a0be797 100644 --- a/drivers/gpu/drm/i915/i915_gem_object.h +++ b/drivers/gpu/drm/i915/i915_gem_object.h @@ -54,6 +54,9 @@ struct drm_i915_gem_object_ops { struct sg_table *(*get_pages)(struct drm_i915_gem_object *); void (*put_pages)(struct drm_i915_gem_object *, struct sg_table *); + int (*pwrite)(struct drm_i915_gem_object *, + const struct drm_i915_gem_pwrite *); + int (*dmabuf_export)(struct drm_i915_gem_object *); void (*release)(struct drm_i915_gem_object *); }; From edd06b8353772dca7afcd4640dafa83b521edd55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 7 Mar 2017 22:54:19 +0200 Subject: [PATCH 024/297] drm/i915: Nuke debug messages from the pipe update critical section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit printks are slow so we should not be doing them from the vblank evade critical section. These could explain why we sometimes seem to blow past our 100 usec deadline. The problem has been there ever since commit bfd16b2a23dc ("drm/i915: Make updating pipe without modeset atomic.") but it may not have been readily visible until commit e1edbd44e23b ("drm/i915: Complain if we take too long under vblank evasion.") increased our chances of noticing it. Cc: stable@vger.kernel.org Cc: Maarten Lankhorst Fixes: bfd16b2a23dc ("drm/i915: Make updating pipe without modeset atomic.") Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/20170307205419.19447-1-ville.syrjala@linux.intel.com Reviewed-by: Maarten Lankhorst (cherry picked from commit c3f8ad57a01a31397e5a0349a226a32f35ddc19c) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 9a8b6a13233d..b3e0cd133b49 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -3669,10 +3669,6 @@ static void intel_update_pipe_config(struct intel_crtc *crtc, /* drm_atomic_helper_update_legacy_modeset_state might not be called. 
*/ crtc->base.mode = crtc->base.state->mode; - DRM_DEBUG_KMS("Updating pipe size %ix%i -> %ix%i\n", - old_crtc_state->pipe_src_w, old_crtc_state->pipe_src_h, - pipe_config->pipe_src_w, pipe_config->pipe_src_h); - /* * Update pipe size and adjust fitter if needed: the reason for this is * that in compute_mode_changes we check the native mode (not the pfit @@ -4796,23 +4792,17 @@ static void skylake_pfit_enable(struct intel_crtc *crtc) struct intel_crtc_scaler_state *scaler_state = &crtc->config->scaler_state; - DRM_DEBUG_KMS("for crtc_state = %p\n", crtc->config); - if (crtc->config->pch_pfit.enabled) { int id; - if (WARN_ON(crtc->config->scaler_state.scaler_id < 0)) { - DRM_ERROR("Requesting pfit without getting a scaler first\n"); + if (WARN_ON(crtc->config->scaler_state.scaler_id < 0)) return; - } id = scaler_state->scaler_id; I915_WRITE(SKL_PS_CTRL(pipe, id), PS_SCALER_EN | PS_FILTER_MEDIUM | scaler_state->scalers[id].mode); I915_WRITE(SKL_PS_WIN_POS(pipe, id), crtc->config->pch_pfit.pos); I915_WRITE(SKL_PS_WIN_SZ(pipe, id), crtc->config->pch_pfit.size); - - DRM_DEBUG_KMS("for crtc_state = %p scaler_id = %d\n", crtc->config, id); } } From 5a8cf90d743f2d05433c6109f6c1b9b904b0cdb7 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 2 Feb 2017 20:47:41 +0000 Subject: [PATCH 025/297] drm/i915: Drain the freed state from the tail of the next commit If we have any residual freed atomic state from earlier commits, flush the freed list after performing the current modeset. This prevents the freed list from ever-growing if userspace manages to starve the kernel threads (i.e. we are never able to run our free state worker and eventually the system may even oom). Fixes: 6f0f02dc56f1 ("drm/i915: Move atomic state free from out of fence release") Testcase: igt/kms_cursor/legacy/all-pipes-single-bo Reported-by: Maarten Lankhorst Signed-off-by: Chris Wilson Cc: Maarten Lankhorst Cc: Joonas Lahtinen Cc: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20170202204741.18231-1-chris@chris-wilson.co.uk Reviewed-by: Maarten Lankhorst (cherry picked from commit ba318c61a9719577b6f451c055f364e4116874b2) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 34 +++++++++++++++++----------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index b3e0cd133b49..3282b0f4b134 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -14369,6 +14369,24 @@ static void skl_update_crtcs(struct drm_atomic_state *state, } while (progress); } +static void intel_atomic_helper_free_state(struct drm_i915_private *dev_priv) +{ + struct intel_atomic_state *state, *next; + struct llist_node *freed; + + freed = llist_del_all(&dev_priv->atomic_helper.free_list); + llist_for_each_entry_safe(state, next, freed, freed) + drm_atomic_state_put(&state->base); +} + +static void intel_atomic_helper_free_state_worker(struct work_struct *work) +{ + struct drm_i915_private *dev_priv = + container_of(work, typeof(*dev_priv), atomic_helper.free_work); + + intel_atomic_helper_free_state(dev_priv); +} + static void intel_atomic_commit_tail(struct drm_atomic_state *state) { struct drm_device *dev = state->dev; @@ -14535,6 +14553,8 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state) * can happen also when the device is completely off. 
*/ intel_uncore_arm_unclaimed_mmio_detection(dev_priv); + + intel_atomic_helper_free_state(dev_priv); } static void intel_atomic_commit_work(struct work_struct *work) @@ -16591,18 +16611,6 @@ fail: drm_modeset_acquire_fini(&ctx); } -static void intel_atomic_helper_free_state(struct work_struct *work) -{ - struct drm_i915_private *dev_priv = - container_of(work, typeof(*dev_priv), atomic_helper.free_work); - struct intel_atomic_state *state, *next; - struct llist_node *freed; - - freed = llist_del_all(&dev_priv->atomic_helper.free_list); - llist_for_each_entry_safe(state, next, freed, freed) - drm_atomic_state_put(&state->base); -} - int intel_modeset_init(struct drm_device *dev) { struct drm_i915_private *dev_priv = to_i915(dev); @@ -16623,7 +16631,7 @@ int intel_modeset_init(struct drm_device *dev) dev->mode_config.funcs = &intel_mode_funcs; INIT_WORK(&dev_priv->atomic_helper.free_work, - intel_atomic_helper_free_state); + intel_atomic_helper_free_state_worker); intel_init_quirks(dev); From 6d399783e9d4e9bd44931501948059d24ad96ff8 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 23 Feb 2017 12:26:41 -0800 Subject: [PATCH 026/297] md/raid10: submit bio directly to replacement disk Commit 57c67df(md/raid10: submit IO from originating thread instead of md thread) submits bio directly for normal disks but not for replacement disks. There is no point we shouldn't do this for replacement disks. Cc: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 063c43d83b72..1443305613c5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1477,11 +1477,24 @@ retry_write: mbio->bi_bdev = (void*)rdev; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } } From 99b3d74ec05c4a4c57766a90d65b53d78ab06404 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 23 Feb 2017 12:31:10 -0800 Subject: [PATCH 027/297] md: delete dead code Nobody is using mddev_check_plugged(), so delete the dead code Signed-off-by: Shaohua Li --- drivers/md/md.c | 8 -------- drivers/md/md.h | 6 ------ 2 files changed, 14 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 548d1b8014f8..82bd1f3d2b19 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -440,14 +440,6 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -void md_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct mddev *mddev = cb->data; - md_wakeup_thread(mddev->thread); - kfree(cb); -} -EXPORT_SYMBOL(md_unplug); - static inline struct mddev *mddev_get(struct mddev *mddev) { atomic_inc(&mddev->active); diff --git a/drivers/md/md.h b/drivers/md/md.h index b8859cbf84b6..dde8ecb760c8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -676,16 +676,10 @@ extern void mddev_resume(struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern void 
md_unplug(struct blk_plug_cb *cb, bool from_schedule); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); -static inline int mddev_check_plugged(struct mddev *mddev) -{ - return !!blk_check_plugged(md_unplug, mddev, - sizeof(struct blk_plug_cb)); -} static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { From 9c8043f337f14d1743006dfc59c03e80a42e3884 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 24 Feb 2017 11:15:12 +0800 Subject: [PATCH 028/297] md-cluster: free md_cluster_info if node leave cluster To avoid memory leak, we need to free the cinfo which is allocated when node join cluster. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 2b13117fb918..ba7edcdd09ce 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -974,6 +974,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); dlm_release_lockspace(cinfo->lockspace, 2); + kfree(cinfo); return 0; } From 75df023f4f2188c21181996e28234fef9351ef45 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 24 Feb 2017 11:15:13 +0800 Subject: [PATCH 029/297] md-cluster: remove useless memset from gather_all_resync_info This memset is not needed. The lvb is already zeroed because it was recently allocated by lockres_init, which uses kzalloc(), and read_resync_info() doesn't need it to be zero anyway. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index ba7edcdd09ce..321ecac23027 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -777,7 +777,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) bm_lockres->flags |= DLM_LKF_NOQUEUE; ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (ret == -EAGAIN) { - memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); s = read_resync_info(mddev, bm_lockres); if (s) { pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", From c94836342192b05d599d6aa3397f732f7a238689 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 24 Feb 2017 11:15:23 +0800 Subject: [PATCH 030/297] md: move funcs from pers->resize to update_size raid1_resize and raid5_resize should also check the mddev->queue if run underneath dm-raid. And both set_capacity and revalidate_disk are used in pers->resize such as raid1, raid10 and raid5. So move them from personality file to common code. 
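To make the shape of the change concrete before the diff, here is a compilable toy model of the hoisted logic. All types and names are stand-ins rather than md's real ones; only the two ideas mirror the patch: the capacity update moves from each personality into the common resize path, and it is guarded by the queue pointer, which is NULL when the array runs underneath dm-raid.

#include <stdio.h>

struct gendisk { unsigned long long capacity; };

struct mddev {
	struct gendisk *gendisk;	/* NULL when stacked under dm-raid */
	void *queue;			/* ditto */
	unsigned long long array_sectors;
	int (*resize)(struct mddev *, unsigned long long);
};

static int update_size(struct mddev *mddev, unsigned long long num_sectors)
{
	int rv = mddev->resize(mddev, num_sectors);

	/* Common code: every personality used to duplicate this. */
	if (!rv && mddev->queue) {
		mddev->gendisk->capacity = mddev->array_sectors;
		printf("capacity set to %llu sectors\n", mddev->array_sectors);
	}
	return rv;
}

static int fake_resize(struct mddev *mddev, unsigned long long sectors)
{
	mddev->array_sectors = sectors;
	return 0;
}

int main(void)
{
	struct gendisk disk = { 0 };
	struct mddev native = { &disk, &disk, 0, fake_resize };
	struct mddev stacked = { NULL, NULL, 0, fake_resize };

	update_size(&native, 2048);	/* resizes and updates capacity */
	update_size(&stacked, 2048);	/* resizes, skips block-layer update */
	return 0;
}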
Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 8 ++++++-- drivers/md/raid1.c | 2 -- drivers/md/raid10.c | 4 ---- drivers/md/raid5.c | 2 -- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 82bd1f3d2b19..bd15a18485c8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6525,8 +6525,12 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) - revalidate_disk(mddev->gendisk); + if (!rv) { + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } + } return rv; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fbc2d7851b49..10c3865e1186 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3246,8 +3246,6 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, newsize); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1443305613c5..c4db6d1fb6a2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3956,10 +3956,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, size); - if (mddev->queue) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - } if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { mddev->recovery_cp = oldsize; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4fb09b3fcb41..6bfedfcf41c1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7605,8 +7605,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, newsize); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; From 1b3bae49fba52f1ec499c36c53bc07761a9f6c4d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Mar 2017 07:31:28 +1100 Subject: [PATCH 031/297] md: don't impose the MD_SB_DISKS limit on arrays without metadata. These arrays, created with "mdadm --build" don't benefit from a limit. The default will be used, which is '0' and is interpreted as "don't impose a limit". Reported-by: ian_bruce@mail.ru Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index bd15a18485c8..cd89ad3c3a0d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6450,11 +6450,10 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->layout = info->layout; mddev->chunk_sectors = info->chunk_size >> 9; - mddev->max_disks = MD_SB_DISKS; - if (mddev->persistent) { - mddev->flags = 0; - mddev->sb_flags = 0; + mddev->max_disks = MD_SB_DISKS; + mddev->flags = 0; + mddev->sb_flags = 0; } set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); From 61eb2b43b99ebdc9bc6bc83d9792257b243e7cb3 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 28 Feb 2017 13:00:20 -0800 Subject: [PATCH 032/297] md/raid1/10: fix potential deadlock Neil Brown pointed out a potential deadlock in raid 10 code with bio_split/chain. 
The raid1 code could have the same issue, but recent barrier rework makes it less likely to happen. The deadlock happens in the sequence below: 1. generic_make_request(bio), this will set current->bio_list 2. raid10_make_request will split bio to bio1 and bio2 3. __make_request(bio1), wait_barrier, add underlying disk bio to current->bio_list 4. __make_request(bio2), wait_barrier If raise_barrier happens between 3 & 4, since wait_barrier runs at 3, raise_barrier waits for IO completion from 3. And since raise_barrier sets barrier, 4 waits for raise_barrier. But IO from 3 can't be dispatched because raid10_make_request() hasn't finished yet. The solution is to adjust the IO ordering. Quoting Neil: " It is much safer to: if (need to split) { split = bio_split(bio, ...) bio_chain(...) make_request_fn(split); generic_make_request(bio); } else make_request_fn(mddev, bio); This way we first process the initial section of the bio (in 'split') which will queue some requests to the underlying devices. These requests will be queued in generic_make_request. Then we queue the remainder of the bio, which will be added to the end of the generic_make_request queue. Then we return. generic_make_request() will pop the lower-level device requests off the queue and handle them first. Then it will process the remainder of the original bio once the first section has been fully processed. " Note, this only happens in the read path. In the write path, the bio is flushed to the underlying disks either by blk flush (from schedule) or offloaded to raid1/10d. It's queued in current->bio_list. Cc: Coly Li Cc: stable@vger.kernel.org (v3.14+, only the raid10 part) Suggested-by: NeilBrown Reviewed-by: Jack Wang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 25 +++++++++++++++++++++++-- drivers/md/raid10.c | 18 ++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 10c3865e1186..c33e96e33b8e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1587,9 +1587,30 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) split = bio; } - if (bio_data_dir(split) == READ) + if (bio_data_dir(split) == READ) { raid1_read_request(mddev, split); - else + + /* + * If a bio is split, the first part of the bio will + * pass the barrier but the bio is queued in + * current->bio_list (see generic_make_request). If + * there is a raise_barrier() called here, the second + * part of the bio can't pass the barrier. But since the first + * part of the bio isn't dispatched to the underlying disks yet, + * the barrier is never released, hence raise_barrier + * will always wait. We have a deadlock. + * Note, this only happens in the read path. For the write + * path, the first part of the bio is dispatched in a + * schedule() call (because of blk plug) or offloaded + * to raid1d. + * Quitting from the function immediately can change + * the bio order queued in bio_list and avoid the deadlock. + */ + if (split != bio) { + generic_make_request(bio); + break; + } + } else raid1_write_request(mddev, split); } while (split != bio); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c4db6d1fb6a2..b1b1f982a722 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1584,7 +1584,25 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) split = bio; } + /* + * If a bio is split, the first part of the bio will pass + * the barrier but the bio is queued in current->bio_list (see + * generic_make_request).
If there is a raise_barrier() called + * here, the second part of the bio can't pass the barrier. But since + * the first part of the bio isn't dispatched to the underlying disks + * yet, the barrier is never released, hence raise_barrier will + * always wait. We have a deadlock. + * Note, this only happens in the read path. For the write path, the + * first part of the bio is dispatched in a schedule() call + * (because of blk plug) or offloaded to raid10d. + * Quitting from the function immediately can change the bio + * order queued in bio_list and avoid the deadlock. + */ __make_request(mddev, split); + if (split != bio && bio_data_dir(bio) == READ) { + generic_make_request(bio); + break; + } } while (split != bio); /* In case raid10d snuck in to freeze_array */ From a1016e94cce9fb6ea56d7602263783e2d95d6e92 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 9 Mar 2017 17:14:32 +0000 Subject: [PATCH 033/297] ARM: wire up statx syscall Wire up the new statx syscall for ARM. Signed-off-by: Russell King --- arch/arm/tools/syscall.tbl | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 3c2cb5d5adfa..0bb0e9c6376c 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -411,3 +411,4 @@ 394 common pkey_mprotect sys_pkey_mprotect 395 common pkey_alloc sys_pkey_alloc 396 common pkey_free sys_pkey_free +397 common statx sys_statx From 9a8b0a230aca55ee142fd76f4765f1da1799da93 Mon Sep 17 00:00:00 2001 From: Mihail Atanassov Date: Wed, 15 Feb 2017 14:00:15 +0000 Subject: [PATCH 034/297] drm: mali-dp: Remove mclk rate management The rate of mclk depends on the use-case. If no downscaling is required, then mclk == pxlclk is a valid option; with downscaling however, the rate at which mclk runs determines how much a plane can be downscaled before composition. This is a system integration + power management issue that is better suited to firmware than to this driver. Signed-off-by: Mihail Atanassov Signed-off-by: Liviu Dudau --- drivers/gpu/drm/arm/malidp_crtc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/arm/malidp_crtc.c b/drivers/gpu/drm/arm/malidp_crtc.c index 08e6a71f5d05..294b53697334 100644 --- a/drivers/gpu/drm/arm/malidp_crtc.c +++ b/drivers/gpu/drm/arm/malidp_crtc.c @@ -63,8 +63,7 @@ static void malidp_crtc_enable(struct drm_crtc *crtc) clk_prepare_enable(hwdev->pxlclk); - /* mclk needs to be set to the same or higher rate than pxlclk */ - clk_set_rate(hwdev->mclk, crtc->state->adjusted_mode.crtc_clock * 1000); + /* We rely on firmware to set mclk to a sensible level. */ clk_set_rate(hwdev->pxlclk, crtc->state->adjusted_mode.crtc_clock * 1000); hwdev->modeset(hwdev, &vm); From d1479f6108006555fe33d7cfe8db4f95ad614b9a Mon Sep 17 00:00:00 2001 From: Mihail Atanassov Date: Thu, 9 Feb 2017 11:32:00 +0000 Subject: [PATCH 035/297] drm: mali-dp: Fix smart layer not going to composition Use rectangle 1 as a generic plane. Existing code already sets the smart layer bounding box size + offset. The rectangles' offsets are relative to the bounding box, so there is no need to set R1's offset (reset value is 0), just its size, which is the same as the bounding box.
Signed-off-by: Mihail Atanassov Signed-off-by: Liviu Dudau --- drivers/gpu/drm/arm/malidp_hw.c | 2 +- drivers/gpu/drm/arm/malidp_planes.c | 18 ++++++++++++++++-- drivers/gpu/drm/arm/malidp_regs.h | 1 + 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/arm/malidp_hw.c b/drivers/gpu/drm/arm/malidp_hw.c index 488aedf5b58d..9f5513006eee 100644 --- a/drivers/gpu/drm/arm/malidp_hw.c +++ b/drivers/gpu/drm/arm/malidp_hw.c @@ -83,7 +83,7 @@ static const struct malidp_layer malidp550_layers[] = { { DE_VIDEO1, MALIDP550_DE_LV1_BASE, MALIDP550_DE_LV1_PTR_BASE, MALIDP_DE_LV_STRIDE0 }, { DE_GRAPHICS1, MALIDP550_DE_LG_BASE, MALIDP550_DE_LG_PTR_BASE, MALIDP_DE_LG_STRIDE }, { DE_VIDEO2, MALIDP550_DE_LV2_BASE, MALIDP550_DE_LV2_PTR_BASE, MALIDP_DE_LV_STRIDE0 }, - { DE_SMART, MALIDP550_DE_LS_BASE, MALIDP550_DE_LS_PTR_BASE, 0 }, + { DE_SMART, MALIDP550_DE_LS_BASE, MALIDP550_DE_LS_PTR_BASE, MALIDP550_DE_LS_R1_STRIDE }, }; #define MALIDP_DE_DEFAULT_PREFETCH_START 5 diff --git a/drivers/gpu/drm/arm/malidp_planes.c b/drivers/gpu/drm/arm/malidp_planes.c index 414aada10fe5..d5aec082294c 100644 --- a/drivers/gpu/drm/arm/malidp_planes.c +++ b/drivers/gpu/drm/arm/malidp_planes.c @@ -37,6 +37,8 @@ #define LAYER_V_VAL(x) (((x) & 0x1fff) << 16) #define MALIDP_LAYER_COMP_SIZE 0x010 #define MALIDP_LAYER_OFFSET 0x014 +#define MALIDP550_LS_ENABLE 0x01c +#define MALIDP550_LS_R1_IN_SIZE 0x020 /* * This 4-entry look-up-table is used to determine the full 8-bit alpha value @@ -242,6 +244,11 @@ static void malidp_de_plane_update(struct drm_plane *plane, LAYER_V_VAL(plane->state->crtc_y), mp->layer->base + MALIDP_LAYER_OFFSET); + if (mp->layer->id == DE_SMART) + malidp_hw_write(mp->hwdev, + LAYER_H_VAL(src_w) | LAYER_V_VAL(src_h), + mp->layer->base + MALIDP550_LS_R1_IN_SIZE); + /* first clear the rotation bits */ val = malidp_hw_read(mp->hwdev, mp->layer->base + MALIDP_LAYER_CONTROL); val &= ~LAYER_ROT_MASK; @@ -330,9 +337,16 @@ int malidp_de_planes_init(struct drm_device *drm) plane->hwdev = malidp->dev; plane->layer = &map->layers[i]; - /* Skip the features which the SMART layer doesn't have */ - if (id == DE_SMART) + if (id == DE_SMART) { + /* + * Enable the first rectangle in the SMART layer to be + * able to use it as a drm plane. + */ + malidp_hw_write(malidp->dev, 1, + plane->layer->base + MALIDP550_LS_ENABLE); + /* Skip the features which the SMART layer doesn't have. */ continue; + } drm_plane_create_rotation_property(&plane->base, DRM_ROTATE_0, flags); malidp_hw_write(malidp->dev, MALIDP_ALPHA_LUT, diff --git a/drivers/gpu/drm/arm/malidp_regs.h b/drivers/gpu/drm/arm/malidp_regs.h index aff6d4a84e99..b816067a65c5 100644 --- a/drivers/gpu/drm/arm/malidp_regs.h +++ b/drivers/gpu/drm/arm/malidp_regs.h @@ -84,6 +84,7 @@ /* Stride register offsets relative to Lx_BASE */ #define MALIDP_DE_LG_STRIDE 0x18 #define MALIDP_DE_LV_STRIDE0 0x18 +#define MALIDP550_DE_LS_R1_STRIDE 0x28 /* macros to set values into registers */ #define MALIDP_DE_H_FRONTPORCH(x) (((x) & 0xfff) << 0) From cb6950b7152fb3760942f9cb16bd2a35e5a1bfd1 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Tue, 7 Mar 2017 00:34:57 +0530 Subject: [PATCH 036/297] arm64: kprobes: remove kprobe_exceptions_notify Commit fc62d0207ae0 ("kprobes: Introduce weak variant of kprobe_exceptions_notify()") introduces a generic empty version of the function for architectures that don't need special handling, like arm64. As such, remove the arch/arm64/ specific handler. Signed-off-by: Naveen N. 
Rao Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/kprobes.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 2a07aae5b8a2..c5c45942fb6e 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -372,12 +372,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) return 0; } -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return NOTIFY_DONE; -} - static void __kprobes kprobe_handler(struct pt_regs *regs) { struct kprobe *p, *cur_kprobe; From b0de0ccc8b9edd8846828e0ecdc35deacdf186b0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 6 Mar 2017 19:06:40 +0000 Subject: [PATCH 037/297] arm64: kasan: avoid bad virt_to_pfn() Booting a v4.11-rc1 kernel with DEBUG_VIRTUAL and KASAN enabled produces the following splat (trimmed for brevity): [ 0.000000] virt_to_phys used for non-linear address: ffff200008080000 (0xffff200008080000) [ 0.000000] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/physaddr.c:14 __virt_to_phys+0x48/0x70 [ 0.000000] PC is at __virt_to_phys+0x48/0x70 [ 0.000000] LR is at __virt_to_phys+0x48/0x70 [ 0.000000] Call trace: [ 0.000000] [] __virt_to_phys+0x48/0x70 [ 0.000000] [] kasan_init+0x1c0/0x498 [ 0.000000] [] setup_arch+0x2fc/0x948 [ 0.000000] [] start_kernel+0xb8/0x570 [ 0.000000] [] __primary_switched+0x6c/0x74 This is because we use virt_to_pfn() on a kernel image address when trying to figure out its nid, so that we can allocate its shadow from the same node. As with other recent changes, this patch uses lm_alias() to solve this. We could instead use NUMA_NO_NODE, as x86 does for all shadow allocations, though we'll likely want the "real" memory shadow to be backed from its corresponding nid anyway, so we may as well be consistent and find the nid for the image shadow. Cc: Catalin Marinas Cc: Will Deacon Acked-by: Laura Abbott Signed-off-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/mm/kasan_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 55d1e9205543..687a358a3733 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -162,7 +162,7 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); vmemmap_populate(kimg_shadow_start, kimg_shadow_end, - pfn_to_nid(virt_to_pfn(_text))); + pfn_to_nid(virt_to_pfn(lm_alias(_text)))); /* * vmemmap_populate() has populated the shadow region that covers the From 5c2a625937ba49bc691089370638223d310cda9a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 8 Mar 2017 16:27:04 -0800 Subject: [PATCH 038/297] arm64: support keyctl() system call in 32-bit mode As is the case for a number of other architectures that have a 32-bit compat mode, enable KEYS_COMPAT if both COMPAT and KEYS are enabled. This allows AArch32 programs to use the keyctl() system call when running on an AArch64 kernel. 
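A quick way to see the effect from userspace: built as an AArch32 binary (with an arm 32-bit toolchain) and run on an arm64 kernel, the program below fails with ENOSYS without KEYS_COMPAT and succeeds with it. It is a hedged sketch using raw syscall(2) to avoid a libkeyutils dependency; the constants come from the standard linux/keyctl.h UAPI header.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/keyctl.h>

int main(void)
{
	/* Resolve the ID of the calling process's session keyring. */
	long id = syscall(SYS_keyctl, KEYCTL_GET_KEYRING_ID,
			  KEY_SPEC_SESSION_KEYRING, 0 /* don't create */);

	if (id < 0) {
		perror("keyctl");	/* ENOSYS without KEYS_COMPAT */
		return 1;
	}
	printf("session keyring id: %ld\n", id);
	return 0;
}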
Signed-off-by: Eric Biggers Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a39029b5414e..f21e9a76ff67 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1063,6 +1063,10 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC +config KEYS_COMPAT + def_bool y + depends on COMPAT && KEYS + endmenu menu "Power management options" From 14088540ad63c648e5cdf490412033f792d16b6b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 10 Mar 2017 17:44:18 +0000 Subject: [PATCH 039/297] arm64: use const cap for system_uses_ttbr0_pan() Since commit 4b65a5db362783ab ("arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1"), system_uses_ttbr0_pan() has used cpus_have_cap() to determine whether PAN is present. Since commit a4023f682739439b ("arm64: Add hypervisor safe helper for checking constant capabilities"), which was introduced around the same time, cpus_have_cap() doesn't try to use a static key, and must always perform a load, test, and conditional branch (likely a tbnz for the latter two). Elsewhere, we moved to using cpus_have_const_cap(), which can use a static key (i.e. a non-conditional branch), which is patched at runtime when the feature is detected. This patch makes system_uses_ttbr0_pan() use cpus_have_const_cap(). The static key is likely a win for hot paths like the uaccess primitives, and this makes our usage consistent regardless. Signed-off-by: Mark Rutland Reviewed-by: Suzuki K Poulose Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 05310ad8c5ab..f31c48d0cd68 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -251,7 +251,7 @@ static inline bool system_supports_fpsimd(void) static inline bool system_uses_ttbr0_pan(void) { return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && - !cpus_have_cap(ARM64_HAS_PAN); + !cpus_have_const_cap(ARM64_HAS_PAN); } #endif /* __ASSEMBLY__ */ From 0e4c0e6ea7d4a988a5ae2791c7cb5769b5256dad Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 17 Feb 2017 15:25:08 +0100 Subject: [PATCH 040/297] arm64: kernel: Update kerneldoc for cpu_suspend() rename Commit af391b15f7b56ce1 ("arm64: kernel: rename __cpu_suspend to keep it aligned with arm") renamed cpu_suspend() to arm_cpuidle_suspend(), but forgot to update the kerneldoc header.
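For reference, kerneldoc is keyed on the function name in the "/**" header, so a rename must touch the comment too. The corrected shape, as applied by the diff below, looks like this; the signature line and the Return: wording here are illustrative, not copied from the kernel source.

/**
 * arm_cpuidle_suspend() - function to enter a low-power idle state
 * @arg: argument to pass to CPU suspend operations
 *
 * Return: 0 on success, or a negative error code otherwise.
 */
int arm_cpuidle_suspend(int arg);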
Fixes: af391b15f7b56ce1 ("arm64: kernel: rename __cpu_suspend to keep it aligned with arm") Signed-off-by: Geert Uytterhoeven Signed-off-by: Will Deacon --- arch/arm64/kernel/cpuidle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index 75a0f8acef66..fd691087dc9a 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -30,7 +30,7 @@ int arm_cpuidle_init(unsigned int cpu) } /** - * cpu_suspend() - function to enter a low-power idle state + * arm_cpuidle_suspend() - function to enter a low-power idle state * @arg: argument to pass to CPU suspend operations * * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU From d8a8ed9758241e138933c67e40db2db2790eca19 Mon Sep 17 00:00:00 2001 From: Tom St Denis Date: Thu, 9 Mar 2017 13:21:07 -0500 Subject: [PATCH 041/297] drm/amd/amdgpu: Disable GFX_PG on Carrizo until compute issues solved Currently compute jobs will stall if GFX_PG is enabled. Until this is resolved we'll disable GFX_PG. Signed-off-by: Tom St Denis Reviewed-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index 50bdb24ef8d6..4a785d6acfb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -1051,7 +1051,7 @@ static int vi_common_early_init(void *handle) /* rev0 hardware requires workarounds to support PG */ adev->pg_flags = 0; if (adev->rev_id != 0x00) { - adev->pg_flags |= AMD_PG_SUPPORT_GFX_PG | + adev->pg_flags |= AMD_PG_SUPPORT_GFX_SMG | AMD_PG_SUPPORT_GFX_PIPELINE | AMD_PG_SUPPORT_CP | From 607523d19c9d67ba4cf7bdaced644f11ed04992c Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 10 Mar 2017 12:13:04 +1000 Subject: [PATCH 042/297] drm/amdgpu: fix parser init error path to avoid crash in parser fini MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we don't reset the chunk info in the error path, the subsequent fini path will double free. Reviewed-by: Christian König Signed-off-by: Dave Airlie Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index d2d0f60ff36d..99424cb8020b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -240,6 +240,8 @@ free_partial_kdata: for (; i >= 0; i--) drm_free_large(p->chunks[i].kdata); kfree(p->chunks); + p->chunks = NULL; + p->nchunks = 0; put_ctx: amdgpu_ctx_put(p->ctx); free_chunk: From 3fb632e40d7667d8bedfabc28850ac06d5493f54 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Mar 2017 11:27:23 +0800 Subject: [PATCH 043/297] md: fix super_offset endianness in super_1_rdev_size_change The sb->super_offset should be big-endian, but the rdev->sb_start is in host byte order, so fix this by adding cpu_to_le64. 
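The bug class here is worth a standalone illustration: an on-disk field declared little-endian must only ever be assigned through cpu_to_le64(). Below is a hedged userspace sketch; the helpers mimic the kernel's (which use a sparse __bitwise type to catch exactly this mistake at build time), and the struct is a stand-in, not md's real superblock.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <endian.h>

typedef uint64_t __le64;	/* kernel: a __bitwise type checked by sparse */

#define cpu_to_le64(x)	htole64(x)
#define le64_to_cpu(x)	le64toh(x)

struct superblock {
	__le64 super_offset;	/* stored little-endian on disk */
};

int main(void)
{
	struct superblock sb;
	uint64_t sb_start = 8;	/* host byte order, as rdev->sb_start is */

	/* Correct: convert before storing. An unconverted store would lay
	 * the bytes out backwards on a big-endian host. */
	sb.super_offset = cpu_to_le64(sb_start);
	printf("on-disk value reads back as %llu\n",
	       (unsigned long long)le64_to_cpu(sb.super_offset));
	return 0;
}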
Signed-off-by: Jason Yan Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index cd89ad3c3a0d..6e76d97a8fc3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1879,7 +1879,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); - sb->super_offset = rdev->sb_start; + sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, From 1345921393ba23b60d3fcf15933e699232ad25ae Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Mar 2017 11:49:12 +0800 Subject: [PATCH 044/297] md: fix incorrect use of lexx_to_cpu in does_sb_need_changing The sb->layout is of type __le32, so we should use le32_to_cpu. Signed-off-by: Jason Yan Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6e76d97a8fc3..f6ae1d67bcd0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2287,7 +2287,7 @@ static bool does_sb_need_changing(struct mddev *mddev) /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || - (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; From 0134ed4fb9e78672ee9f7b18007114404c81e63f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 10 Mar 2017 13:24:22 -0700 Subject: [PATCH 045/297] device-dax: fix pmd/pte fault fallback handling Jeff Moyer reports: With a device dax alignment of 4KB or 2MB, I get sigbus when running the attached fio job file for the current kernel (4.11.0-rc1+). If I specify an alignment of 1GB, it works. I turned on debug output, and saw that it was failing in the huge fault code. dax dax1.0: dax_open dax dax1.0: dax_mmap dax dax1.0: dax_dev_huge_fault: fio: write (0x7f08f0a00000 - dax dax1.0: __dax_dev_pud_fault: phys_to_pgoff(0xffffffffcf60 dax dax1.0: dax_release fio config for reproduce: [global] ioengine=dev-dax direct=0 filename=/dev/dax0.0 bs=2m [write] rw=write [read] stonewall rw=read The driver fails to fall back when taking a fault that is larger than the device alignment, or when handling a larger fault after a smaller mapping is already established. While we could support larger mappings for a device with a smaller alignment, that change is too large for the immediate fix. The simplest change is to force fallback until the fault size matches the alignment.
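The fallback rule can be distilled into a standalone predicate, shown below as a hedged sketch: a huge-page fault is serviced only when its size matches the device alignment, larger faults fall back so the MM retries with a smaller mapping, and smaller ones are refused. Note that the PTE path in the patch simply rejects any mismatch; this sketch follows the PMD/PUD shape, and the enum values are illustrative stand-ins for the VM_FAULT_* codes.

#include <stdio.h>

enum { FAULT_OK, FAULT_SIGBUS, FAULT_FALLBACK };

static int check_fault_size(unsigned long fault_size, unsigned long align)
{
	if (fault_size < align)
		return FAULT_SIGBUS;	/* device can't map this small */
	if (fault_size > align)
		return FAULT_FALLBACK;	/* retry with a smaller mapping */
	return FAULT_OK;
}

int main(void)
{
	const unsigned long pmd = 2UL << 20, pud = 1UL << 30;

	/* 2MB-aligned device: PMD faults fit, PUD faults fall back. */
	printf("pmd fault on 2M device: %d\n", check_fault_size(pmd, pmd));
	printf("pud fault on 2M device: %d\n", check_fault_size(pud, pmd));
	return 0;
}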
Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap") Cc: Reported-by: Jeff Moyer Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/dax/dax.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 8d9829ff2a78..a284dc532e46 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -427,6 +427,7 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) int rc = VM_FAULT_SIGBUS; phys_addr_t phys; pfn_t pfn; + unsigned int fault_size = PAGE_SIZE; if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; @@ -437,6 +438,9 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } + if (fault_size != dax_region->align) + return VM_FAULT_SIGBUS; + phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); if (phys == -1) { dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, @@ -464,6 +468,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) phys_addr_t phys; pgoff_t pgoff; pfn_t pfn; + unsigned int fault_size = PMD_SIZE; if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; @@ -480,6 +485,16 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } + if (fault_size < dax_region->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dax_region->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pmd_addr < vmf->vma->vm_start || + (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + pgoff = linear_page_index(vmf->vma, pmd_addr); phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { From 70b085b06c4560a69e95607f77bb4c2b2e41943c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 10 Mar 2017 13:24:27 -0700 Subject: [PATCH 046/297] device-dax: fix pud fault fallback handling Jeff Moyer reports: With a device dax alignment of 4KB or 2MB, I get sigbus when running the attached fio job file for the current kernel (4.11.0-rc1+). If I specify an alignment of 1GB, it works. I turned on debug output, and saw that it was failing in the huge fault code. dax dax1.0: dax_open dax dax1.0: dax_mmap dax dax1.0: dax_dev_huge_fault: fio: write (0x7f08f0a00000 - dax dax1.0: __dax_dev_pud_fault: phys_to_pgoff(0xffffffffcf60) dax dax1.0: dax_release fio config for reproduce: [global] ioengine=dev-dax direct=0 filename=/dev/dax0.0 bs=2m [write] rw=write [read] stonewall rw=read The driver fails to fallback when taking a fault that is larger than the device alignment, or handling a larger fault when a smaller mapping is already established. While we could support larger mappings for a device with a smaller alignment, that change is too large for the immediate fix. The simplest change is to force fallback until the fault size matches the alignment. 
Reported-by: Jeff Moyer Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/dax/dax.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index a284dc532e46..523fecec7bda 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -518,6 +518,8 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) phys_addr_t phys; pgoff_t pgoff; pfn_t pfn; + unsigned int fault_size = PUD_SIZE; + if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; @@ -534,6 +536,16 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } + if (fault_size < dax_region->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dax_region->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pud_addr < vmf->vma->vm_start || + (pud_addr + PUD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + pgoff = linear_page_index(vmf->vma, pud_addr); phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); if (phys == -1) { From 52084f89b38cdd896b59627c629915ef1a7bf615 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 9 Mar 2017 16:56:01 -0700 Subject: [PATCH 047/297] device-dax: fix debug output typo The debug output for the return value of pgoff_to_phys() in the fault handlers has 'phys' and 'pgoff' incorrectly swapped. Reported-by: Jeff Moyer Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/dax/dax.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 523fecec7bda..80c6db279ae1 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -443,7 +443,7 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); if (phys == -1) { - dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, vmf->pgoff); return VM_FAULT_SIGBUS; } @@ -498,7 +498,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) pgoff = linear_page_index(vmf->vma, pmd_addr); phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { - dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); return VM_FAULT_SIGBUS; } @@ -549,7 +549,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) pgoff = linear_page_index(vmf->vma, pud_addr); phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); if (phys == -1) { - dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); return VM_FAULT_SIGBUS; } From c962cff17dfa11f4a8227ac16de2b28aea3312e4 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 3 Mar 2017 16:02:23 +0800 Subject: [PATCH 048/297] Revert "x86/acpi: Set persistent cpuid <-> nodeid mapping when booting" Revert: dc6db24d2476 ("x86/acpi: Set persistent cpuid <-> nodeid mapping when booting") The mapping of "cpuid <-> nodeid" is established at boot time via ACPI tables to keep associations of workqueues and other node related items consistent across cpu hotplug. But ACPI tables are unreliable, and failures with that boot time mapping have been reported on machines where the ACPI table and the physical information retrieved at actual hotplug are inconsistent. Revert the mapping implementation so it can be replaced with a less error-prone approach.
Signed-off-by: Dou Liyang Tested-by: Xiaolong Ye Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: guzheng1@huawei.com Cc: izumi.taku@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1488528147-2279-2-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 2 +- drivers/acpi/acpi_processor.c | 5 --- drivers/acpi/bus.c | 1 - drivers/acpi/processor_core.c | 73 ----------------------------------- include/linux/acpi.h | 3 -- 5 files changed, 1 insertion(+), 83 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ae32838cac5f..f6b0e87d2388 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -710,7 +710,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include -int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 4467a8089ab8..5d208a99d0c9 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -182,11 +182,6 @@ int __weak arch_register_cpu(int cpu) void __weak arch_unregister_cpu(int cpu) {} -int __weak acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) -{ - return -ENODEV; -} - static int acpi_processor_hotadd_init(struct acpi_processor *pr) { unsigned long long sta; diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 80cb5eb75b63..34fbe027e73a 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -1249,7 +1249,6 @@ static int __init acpi_init(void) acpi_wakeup_device_init(); acpi_debugger_init(); acpi_setup_sb_notify_handler(); - acpi_set_processor_mapping(); return 0; } diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 611a5585a902..a84386204659 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -278,79 +278,6 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id) } EXPORT_SYMBOL_GPL(acpi_get_cpuid); -#ifdef CONFIG_ACPI_HOTPLUG_CPU -static bool __init -map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) -{ - int type, id; - u32 acpi_id; - acpi_status status; - acpi_object_type acpi_type; - unsigned long long tmp; - union acpi_object object = { 0 }; - struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; - - status = acpi_get_type(handle, &acpi_type); - if (ACPI_FAILURE(status)) - return false; - - switch (acpi_type) { - case ACPI_TYPE_PROCESSOR: - status = acpi_evaluate_object(handle, NULL, NULL, &buffer); - if (ACPI_FAILURE(status)) - return false; - acpi_id = object.processor.proc_id; - - /* validate the acpi_id */ - if(acpi_processor_validate_proc_id(acpi_id)) - return false; - break; - case ACPI_TYPE_DEVICE: - status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); - if (ACPI_FAILURE(status)) - return false; - acpi_id = tmp; - break; - default: - return false; - } - - type = (acpi_type == ACPI_TYPE_DEVICE) ? 
1 : 0; - - *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false); - id = acpi_map_cpuid(*phys_id, acpi_id); - - if (id < 0) - return false; - *cpuid = id; - return true; -} - -static acpi_status __init -set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context, - void **rv) -{ - phys_cpuid_t phys_id; - int cpu_id; - - if (!map_processor(handle, &phys_id, &cpu_id)) - return AE_ERROR; - - acpi_map_cpu2node(handle, cpu_id, phys_id); - return AE_OK; -} - -void __init acpi_set_processor_mapping(void) -{ - /* Set persistent cpu <-> node mapping for all processors. */ - acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, set_processor_node_mapping, - NULL, NULL, NULL); -} -#else -void __init acpi_set_processor_mapping(void) {} -#endif /* CONFIG_ACPI_HOTPLUG_CPU */ - #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base, u64 *phys_addr, int *ioapic_id) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 673acda012af..63a7519b00cc 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -294,11 +294,8 @@ bool acpi_processor_validate_proc_id(int proc_id); int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu); int acpi_unmap_cpu(int cpu); -int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ -void acpi_set_processor_mapping(void); - #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif From 09c3f2bd5c7e5f18687663acb6adc6b167484ca5 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 3 Mar 2017 16:02:24 +0800 Subject: [PATCH 049/297] Revert"x86/acpi: Enable MADT APIs to return disabled apicids" Revert: 8ad893faf2ea ("x86/acpi: Enable MADT APIs to return disabled apicids") Remove the leftovers of the boot time 'cpuid <-> nodeid' mapping approach. 
Signed-off-by: Dou Liyang Tested-by: Xiaolong Ye Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: guzheng1@huawei.com Cc: izumi.taku@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1488528147-2279-3-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/processor_core.c | 60 +++++++++++++---------------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index a84386204659..b933061b6b60 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -32,12 +32,12 @@ static struct acpi_table_madt *get_madt_table(void) } static int map_lapic_id(struct acpi_subtable_header *entry, - u32 acpi_id, phys_cpuid_t *apic_id, bool ignore_disabled) + u32 acpi_id, phys_cpuid_t *apic_id) { struct acpi_madt_local_apic *lapic = container_of(entry, struct acpi_madt_local_apic, header); - if (ignore_disabled && !(lapic->lapic_flags & ACPI_MADT_ENABLED)) + if (!(lapic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (lapic->processor_id != acpi_id) @@ -48,13 +48,12 @@ static int map_lapic_id(struct acpi_subtable_header *entry, } static int map_x2apic_id(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id, - bool ignore_disabled) + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id) { struct acpi_madt_local_x2apic *apic = container_of(entry, struct acpi_madt_local_x2apic, header); - if (ignore_disabled && !(apic->lapic_flags & ACPI_MADT_ENABLED)) + if (!(apic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (device_declaration && (apic->uid == acpi_id)) { @@ -66,13 +65,12 @@ static int map_x2apic_id(struct acpi_subtable_header *entry, } static int map_lsapic_id(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id, - bool ignore_disabled) + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id) { struct acpi_madt_local_sapic *lsapic = container_of(entry, struct acpi_madt_local_sapic, header); - if (ignore_disabled && !(lsapic->lapic_flags & ACPI_MADT_ENABLED)) + if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (device_declaration) { @@ -89,13 +87,12 @@ static int map_lsapic_id(struct acpi_subtable_header *entry, * Retrieve the ARM CPU physical identifier (MPIDR) */ static int map_gicc_mpidr(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr, - bool ignore_disabled) + int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr) { struct acpi_madt_generic_interrupt *gicc = container_of(entry, struct acpi_madt_generic_interrupt, header); - if (ignore_disabled && !(gicc->flags & ACPI_MADT_ENABLED)) + if (!(gicc->flags & ACPI_MADT_ENABLED)) return -ENODEV; /* device_declaration means Device object in DSDT, in the @@ -112,7 +109,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry, } static phys_cpuid_t map_madt_entry(struct acpi_table_madt *madt, - int type, u32 acpi_id, bool ignore_disabled) + int type, u32 acpi_id) { unsigned long madt_end, entry; phys_cpuid_t phys_id = PHYS_CPUID_INVALID; /* CPU hardware ID */ @@ -130,20 +127,16 @@ static phys_cpuid_t map_madt_entry(struct acpi_table_madt *madt, struct acpi_subtable_header *header = (struct acpi_subtable_header *)entry; if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) { - if (!map_lapic_id(header, acpi_id, &phys_id, - ignore_disabled)) + if (!map_lapic_id(header, acpi_id, &phys_id)) break; } else if (header->type == 
ACPI_MADT_TYPE_LOCAL_X2APIC) { - if (!map_x2apic_id(header, type, acpi_id, &phys_id, - ignore_disabled)) + if (!map_x2apic_id(header, type, acpi_id, &phys_id)) break; } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) { - if (!map_lsapic_id(header, type, acpi_id, &phys_id, - ignore_disabled)) + if (!map_lsapic_id(header, type, acpi_id, &phys_id)) break; } else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) { - if (!map_gicc_mpidr(header, type, acpi_id, &phys_id, - ignore_disabled)) + if (!map_gicc_mpidr(header, type, acpi_id, &phys_id)) break; } entry += header->length; @@ -161,15 +154,14 @@ phys_cpuid_t __init acpi_map_madt_entry(u32 acpi_id) if (!madt) return PHYS_CPUID_INVALID; - rv = map_madt_entry(madt, 1, acpi_id, true); + rv = map_madt_entry(madt, 1, acpi_id); acpi_put_table((struct acpi_table_header *)madt); return rv; } -static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id, - bool ignore_disabled) +static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; @@ -190,38 +182,30 @@ static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id, header = (struct acpi_subtable_header *)obj->buffer.pointer; if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) - map_lapic_id(header, acpi_id, &phys_id, ignore_disabled); + map_lapic_id(header, acpi_id, &phys_id); else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) - map_lsapic_id(header, type, acpi_id, &phys_id, ignore_disabled); + map_lsapic_id(header, type, acpi_id, &phys_id); else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) - map_x2apic_id(header, type, acpi_id, &phys_id, ignore_disabled); + map_x2apic_id(header, type, acpi_id, &phys_id); else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) - map_gicc_mpidr(header, type, acpi_id, &phys_id, - ignore_disabled); + map_gicc_mpidr(header, type, acpi_id, &phys_id); exit: kfree(buffer.pointer); return phys_id; } -static phys_cpuid_t __acpi_get_phys_id(acpi_handle handle, int type, - u32 acpi_id, bool ignore_disabled) +phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id) { phys_cpuid_t phys_id; - phys_id = map_mat_entry(handle, type, acpi_id, ignore_disabled); + phys_id = map_mat_entry(handle, type, acpi_id); if (invalid_phys_cpuid(phys_id)) - phys_id = map_madt_entry(get_madt_table(), type, acpi_id, - ignore_disabled); + phys_id = map_madt_entry(get_madt_table(), type, acpi_id); return phys_id; } -phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id) -{ - return __acpi_get_phys_id(handle, type, acpi_id, true); -} - int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id) { #ifdef CONFIG_SMP From 2b85b3d22920db7473e5fed5719e7955c0ec323e Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 3 Mar 2017 16:02:25 +0800 Subject: [PATCH 050/297] x86/acpi: Restore the order of CPU IDs The following commits: f7c28833c2 ("x86/acpi: Enable acpi to register all possible cpus at boot time") and 8f54969dc8 ("x86/acpi: Introduce persistent storage for cpuid <-> apicid mapping") ... registered all the possible CPUs at boot time via ACPI tables to make the mapping of cpuid <-> apicid fixed. Both enabled and disabled CPUs could have a logical CPU ID after boot time. But ACPI tables are unreliable: the number and order of Local APIC entries depend on the firmware and are often inconsistent with the physical devices.
Even if they are consistent, The disabled CPUs which take up some logical CPU IDs will also make the order discontinuous. Revert the part of disabled CPUs registration, keep the allocation logic of logical CPU IDs and also keep some code location changes. Signed-off-by: Dou Liyang Tested-by: Xiaolong Ye Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: guzheng1@huawei.com Cc: izumi.taku@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1488528147-2279-4-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 7 ++++++- arch/x86/kernel/apic/apic.c | 26 +++++++------------------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f6b0e87d2388..b2879cc23db4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -179,10 +179,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = __generic_processor_info(id, ver, enabled); + cpu = generic_processor_info(id, ver); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index aee7deddabd0..8ccb7ef512e0 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2063,7 +2063,7 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -int __generic_processor_info(int apicid, int version, bool enabled) +int generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2121,11 +2121,9 @@ int __generic_processor_info(int apicid, int version, bool enabled) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - if (enabled) { - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); - } + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2177,23 +2175,13 @@ int __generic_processor_info(int apicid, int version, bool enabled) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - - if (enabled) { - num_processors++; - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - } else { - disabled_cpus++; - } + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; return cpu; } -int generic_processor_info(int apicid, int version) -{ - return __generic_processor_info(apicid, version, true); -} - int hard_smp_processor_id(void) { return read_apic_id(); From 8c8cb30f49b86333d8e036e1945cf1a78c03577e Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 3 Mar 2017 16:02:26 +0800 Subject: [PATCH 051/297] acpi/processor: Implement DEVICE operator for processor enumeration ACPI allows to declare processors either with the PROCESSOR or with the DEVICE operator. The current implementation handles only the PROCESSOR operator. On a system which uses the DEVICE operator for processor enumeration the evaluation fails. Check for the ACPI type of the ACPI handle and evaluate PROCESSOR and DEVICE types separately. 
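To make the PROCESSOR/DEVICE split concrete, here is a minimal, self-contained userspace sketch of the dispatch idea: pick the CPU id from a Processor-style object's proc_id or from a Device-style object's _UID, depending on the handle's type. The types and names below are simplified stand-ins, not the ACPICA API.

#include <stdio.h>

/* Simplified stand-ins for ACPI object types (hypothetical, not ACPICA). */
enum obj_type { OBJ_PROCESSOR, OBJ_DEVICE, OBJ_OTHER };

struct obj {
	enum obj_type type;
	unsigned long proc_id;	/* valid for OBJ_PROCESSOR */
	unsigned long uid;	/* valid for OBJ_DEVICE (_UID) */
};

/* Return the CPU id, or -1 for objects that do not describe a CPU. */
static long obj_cpu_id(const struct obj *o)
{
	switch (o->type) {
	case OBJ_PROCESSOR:
		return (long)o->proc_id;	/* PROCESSOR operator */
	case OBJ_DEVICE:
		return (long)o->uid;		/* DEVICE operator, id in _UID */
	default:
		return -1;			/* not a processor object */
	}
}

int main(void)
{
	struct obj a = { OBJ_PROCESSOR, 2, 0 };
	struct obj b = { OBJ_DEVICE, 0, 3 };

	printf("%ld %ld\n", obj_cpu_id(&a), obj_cpu_id(&b)); /* 2 3 */
	return 0;
}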
Signed-off-by: Dou Liyang Tested-by: Xiaolong Ye Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: guzheng1@huawei.com Cc: izumi.taku@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1488528147-2279-5-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/acpi_processor.c | 39 ++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 5d208a99d0c9..9a98d7e00200 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -633,25 +633,50 @@ static acpi_status __init acpi_processor_ids_walk(acpi_handle handle, void **rv) { acpi_status status; + acpi_object_type acpi_type; + unsigned long long uid; union acpi_object object = { 0 }; struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; - status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + status = acpi_get_type(handle, &acpi_type); if (ACPI_FAILURE(status)) - acpi_handle_info(handle, "Not get the processor object\n"); - else - processor_validated_ids_update(object.processor.proc_id); + return false; + + switch (acpi_type) { + case ACPI_TYPE_PROCESSOR: + status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) + goto err; + uid = object.processor.proc_id; + break; + + case ACPI_TYPE_DEVICE: + status = acpi_evaluate_integer(handle, "_UID", NULL, &uid); + if (ACPI_FAILURE(status)) + goto err; + break; + default: + goto err; + } + + processor_validated_ids_update(uid); + return true; + +err: + acpi_handle_info(handle, "Invalid processor object\n"); + return false; - return AE_OK; } -static void __init acpi_processor_check_duplicates(void) +void __init acpi_processor_check_duplicates(void) { - /* Search all processor nodes in ACPI namespace */ + /* check the correctness for all processors in ACPI namespace */ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, acpi_processor_ids_walk, NULL, NULL, NULL); + acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, acpi_processor_ids_walk, + NULL, NULL); } bool __init acpi_processor_validate_proc_id(int proc_id) From a77d6cd968497792e072b74dff45b891ba778ddb Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 3 Mar 2017 16:02:27 +0800 Subject: [PATCH 052/297] acpi/processor: Check for duplicate processor ids at hotplug time The check for duplicate processor ids happens at boot time based on the ACPI table contents, but the final sanity checks for a processor happen at hotplug time. At hotplug time, where the physical information is available, which might differ from the ACPI table information, a check for duplicate processor ids is missing. Add it to the hotplug checks and rename the function so it better reflects its purpose. 
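The two-phase scheme reduces to: record which ids were seen more than once while walking the ACPI tables at boot, then consult that record again when a CPU is actually hot-plugged. A minimal sketch of such a duplicate-id table follows; array sizes and names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

#define MAX_IDS 8	/* stands in for NR_CPUS */

static int duplicate_ids[MAX_IDS];
static int nr_duplicates;

/* Called once per ACPI-declared processor at boot time. */
static void record_id(int id, const int *seen, int nseen)
{
	for (int i = 0; i < nseen; i++)
		if (seen[i] == id) {
			duplicate_ids[nr_duplicates++] = id;
			return;
		}
}

/* Called again at hotplug time: reject CPUs with a non-unique id. */
static bool is_duplicate_id(int id)
{
	for (int i = 0; i < nr_duplicates; i++)
		if (duplicate_ids[i] == id)
			return true;
	return false;
}

int main(void)
{
	int seen[] = { 1, 2, 3 };

	record_id(2, seen, 3);	/* id 2 was already declared once */
	printf("%d %d\n", is_duplicate_id(2), is_duplicate_id(4)); /* 1 0 */
	return 0;
}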
Signed-off-by: Dou Liyang Tested-by: Xiaolong Ye Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: guzheng1@huawei.com Cc: izumi.taku@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1488528147-2279-6-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/acpi_processor.c | 13 ++++++++++--- include/linux/acpi.h | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 9a98d7e00200..0143135b3abe 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -280,6 +280,13 @@ static int acpi_processor_get_info(struct acpi_device *device) pr->acpi_id = value; } + if (acpi_duplicate_processor_id(pr->acpi_id)) { + dev_err(&device->dev, + "Failed to get unique processor _UID (0x%x)\n", + pr->acpi_id); + return -ENODEV; + } + pr->phys_id = acpi_get_phys_id(pr->handle, device_declaration, pr->acpi_id); if (invalid_phys_cpuid(pr->phys_id)) @@ -580,7 +587,7 @@ static struct acpi_scan_handler processor_container_handler = { static int nr_unique_ids __initdata; /* The number of the duplicate processor IDs */ -static int nr_duplicate_ids __initdata; +static int nr_duplicate_ids; /* Used to store the unique processor IDs */ static int unique_processor_ids[] __initdata = { @@ -588,7 +595,7 @@ static int unique_processor_ids[] __initdata = { }; /* Used to store the duplicate processor IDs */ -static int duplicate_processor_ids[] __initdata = { +static int duplicate_processor_ids[] = { [0 ... NR_CPUS - 1] = -1, }; @@ -679,7 +686,7 @@ void __init acpi_processor_check_duplicates(void) NULL, NULL); } -bool __init acpi_processor_validate_proc_id(int proc_id) +bool acpi_duplicate_processor_id(int proc_id) { int i; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 63a7519b00cc..9b05886f9773 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -287,7 +287,7 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) } /* Validate the processor object's proc_id */ -bool acpi_processor_validate_proc_id(int proc_id); +bool acpi_duplicate_processor_id(int proc_id); #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ From 33d8c15559df4f0bce25d7e16ebb5879e249f2e7 Mon Sep 17 00:00:00 2001 From: Romain Izard Date: Thu, 9 Mar 2017 17:58:38 +0100 Subject: [PATCH 053/297] Revert "clocksource/drivers/tcb_clksrc: Use 32 bit tcb as sched_clock" This reverts commit 7b9f1d16e6d1 ("clocksource/drivers/tcb_clksrc: Use 32 bit tcb as sched_clock"). In the current state, the kernel warns against a late registration of the new sched_clock, the printk clock resets after only a few minutes, and it seems that scheduling can be affected as well. 
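The "printk clock resets after only a few minutes" symptom is consistent with a 32-bit counter wrapping. As a rough sanity check (the 16 MHz rate below is an assumed example; the real divided_rate is board-specific):

#include <stdio.h>

int main(void)
{
	/* Assumed example rate; the actual divided_rate varies per board. */
	double rate_hz = 16000000.0;
	double wrap_s = 4294967296.0 / rate_hz;	/* 2^32 ticks until wrap */

	printf("32-bit counter wraps every %.0f s (~%.1f min)\n",
	       wrap_s, wrap_s / 60.0);	/* ~268 s, about 4.5 minutes */
	return 0;
}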
Signed-off-by: Romain Izard Signed-off-by: Daniel Lezcano --- drivers/clocksource/tcb_clksrc.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index 745844ee973e..d4ca9962a759 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -10,7 +10,6 @@ #include #include #include -#include /* @@ -57,14 +56,9 @@ static u64 tc_get_cycles(struct clocksource *cs) return (upper << 16) | lower; } -static u32 tc_get_cv32(void) -{ - return __raw_readl(tcaddr + ATMEL_TC_REG(0, CV)); -} - static u64 tc_get_cycles32(struct clocksource *cs) { - return tc_get_cv32(); + return __raw_readl(tcaddr + ATMEL_TC_REG(0, CV)); } static struct clocksource clksrc = { @@ -75,11 +69,6 @@ static struct clocksource clksrc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static u64 notrace tc_read_sched_clock(void) -{ - return tc_get_cv32(); -} - #ifdef CONFIG_GENERIC_CLOCKEVENTS struct tc_clkevt_device { @@ -350,9 +339,6 @@ static int __init tcb_clksrc_init(void) clksrc.read = tc_get_cycles32; /* setup ony channel 0 */ tcb_setup_single_chan(tc, best_divisor_idx); - - /* register sched_clock on chips with single 32 bit counter */ - sched_clock_register(tc_read_sched_clock, 32, divided_rate); } else { /* tclib will give us three clocks no matter what the * underlying platform supports. From f5fe1b51905df7cfe4fdfd85c5fb7bc5b71a094f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 10 Mar 2017 17:00:47 +1100 Subject: [PATCH 054/297] blk: Ensure users for current->bio_list can see the full list. Commit 79bd99596b73 ("blk: improve order of bio handling in generic_make_request()") changed current->bio_list so that it did not contain *all* of the queued bios, but only those submitted by the currently running make_request_fn. There are two places which walk the list and requeue selected bios, and others that check if the list is empty. These are no longer correct. So redefine current->bio_list to point to an array of two lists, which contain all queued bios, and adjust various code to test or walk both lists. Signed-off-by: NeilBrown Fixes: 79bd99596b73 ("blk: improve order of bio handling in generic_make_request()") Signed-off-by: Jens Axboe --- block/bio.c | 12 +++++++++--- block/blk-core.c | 30 ++++++++++++++++++------------ drivers/md/dm.c | 27 +++++++++++++++------------ drivers/md/raid10.c | 3 ++- 4 files changed, 44 insertions(+), 28 deletions(-) diff --git a/block/bio.c b/block/bio.c index 5eec5e08417f..e75878f8b14a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -376,10 +376,14 @@ static void punt_bios_to_rescuer(struct bio_set *bs) bio_list_init(&punt); bio_list_init(&nopunt); - while ((bio = bio_list_pop(current->bio_list))) + while ((bio = bio_list_pop(&current->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + current->bio_list[0] = nopunt; - *current->bio_list = nopunt; + bio_list_init(&nopunt); + while ((bio = bio_list_pop(&current->bio_list[1]))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); @@ -466,7 +470,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * we retry with the original gfp_flags.
*/ - if (current->bio_list && !bio_list_empty(current->bio_list)) + if (current->bio_list && + (!bio_list_empty(&current->bio_list[0]) || + !bio_list_empty(&current->bio_list[1]))) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); diff --git a/block/blk-core.c b/block/blk-core.c index 0eeb99ef654f..d772c221cc17 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1973,7 +1973,14 @@ end_io: */ blk_qc_t generic_make_request(struct bio *bio) { - struct bio_list bio_list_on_stack; + /* + * bio_list_on_stack[0] contains bios submitted by the current + * make_request_fn. + * bio_list_on_stack[1] contains bios that were submitted before + * the current make_request_fn, but that haven't been processed + * yet. + */ + struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) @@ -1990,7 +1997,7 @@ blk_qc_t generic_make_request(struct bio *bio) * should be added at the tail */ if (current->bio_list) { - bio_list_add(current->bio_list, bio); + bio_list_add(&current->bio_list[0], bio); goto out; } @@ -2009,18 +2016,17 @@ blk_qc_t generic_make_request(struct bio *bio) * bio_list, and call into ->make_request() again. */ BUG_ON(bio->bi_next); - bio_list_init(&bio_list_on_stack); - current->bio_list = &bio_list_on_stack; + bio_list_init(&bio_list_on_stack[0]); + current->bio_list = bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (likely(blk_queue_enter(q, false) == 0)) { - struct bio_list hold; struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ - hold = bio_list_on_stack; - bio_list_init(&bio_list_on_stack); + bio_list_on_stack[1] = bio_list_on_stack[0]; + bio_list_init(&bio_list_on_stack[0]); ret = q->make_request_fn(q, bio); blk_queue_exit(q); @@ -2030,19 +2036,19 @@ blk_qc_t generic_make_request(struct bio *bio) */ bio_list_init(&lower); bio_list_init(&same); - while ((bio = bio_list_pop(&bio_list_on_stack)) != NULL) + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* now assemble so we handle the lowest level first */ - bio_list_merge(&bio_list_on_stack, &lower); - bio_list_merge(&bio_list_on_stack, &same); - bio_list_merge(&bio_list_on_stack, &hold); + bio_list_merge(&bio_list_on_stack[0], &lower); + bio_list_merge(&bio_list_on_stack[0], &same); + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } else { bio_io_error(bio); } - bio = bio_list_pop(current->bio_list); + bio = bio_list_pop(&bio_list_on_stack[0]); } while (bio); current->bio_list = NULL; /* deactivate */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f4ffd1eb8f44..dfb75979e455 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -989,26 +989,29 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) struct dm_offload *o = container_of(cb, struct dm_offload, cb); struct bio_list list; struct bio *bio; + int i; INIT_LIST_HEAD(&o->cb.list); if (unlikely(!current->bio_list)) return; - list = *current->bio_list; - bio_list_init(current->bio_list); + for (i = 0; i < 2; i++) { + list = current->bio_list[i]; + bio_list_init(&current->bio_list[i]); - while ((bio = bio_list_pop(&list))) { - struct bio_set *bs = bio->bi_pool; - if (unlikely(!bs) || bs == fs_bio_set) { - bio_list_add(current->bio_list, bio); - continue; + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(&current->bio_list[i],
bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); } - - spin_lock(&bs->rescue_lock); - bio_list_add(&bs->rescue_list, bio); - queue_work(bs->rescue_workqueue, &bs->rescue_work); - spin_unlock(&bs->rescue_lock); } } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 063c43d83b72..0536658c9d40 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -974,7 +974,8 @@ static void wait_barrier(struct r10conf *conf) !conf->barrier || (atomic_read(&conf->nr_pending) && current->bio_list && - !bio_list_empty(current->bio_list)), + (!bio_list_empty(&current->bio_list[0]) || + !bio_list_empty(&current->bio_list[1]))), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting) From 80354c29025833acd72ddac1ffa21c6cb50128cd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sun, 12 Mar 2017 17:07:44 +0200 Subject: [PATCH 055/297] x86/platform/intel-mid: Correct MSI IRQ line for watchdog device The interrupt line used for the watchdog is 12, according to the official Intel Edison BSP code. And indeed after fixing it we start getting an interrupt and thus the watchdog starts working again: [ 191.699951] Kernel panic - not syncing: Kernel Watchdog Signed-off-by: Andy Shevchenko Cc: Borislav Petkov Cc: David Cohen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 78a3bb9e408b ("x86: intel-mid: add watchdog platform code for Merrifield") Link: http://lkml.kernel.org/r/20170312150744.45493-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 86edd1e941eb..9e304e2ea4f5 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -19,7 +19,7 @@ #include #include -#define TANGIER_EXT_TIMER0_MSI 15 +#define TANGIER_EXT_TIMER0_MSI 12 static struct platform_device wdt_dev = { .name = "intel_mid_wdt", From 9fa1d7537242bd580ffa99c4725a0407096aad26 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Tue, 28 Feb 2017 10:11:45 +0200 Subject: [PATCH 056/297] drm/omap: fix dmabuf mmap for dma_alloc'ed buffers omap_gem_dmabuf_mmap() returns an error (with a WARN) when called for a buffer which is allocated with dma_alloc_*(). This prevents dmabuf mmap from working on SoCs without DMM, e.g. AM4 and OMAP3. I could not find any reason for omap_gem_dmabuf_mmap() rejecting such buffers, and just removing the if() fixes the limitation. Signed-off-by: Tomi Valkeinen --- drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c index af267c35d813..ee5883f59be5 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c +++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c @@ -147,9 +147,6 @@ static int omap_gem_dmabuf_mmap(struct dma_buf *buffer, struct drm_gem_object *obj = buffer->priv; int ret = 0; - if (WARN_ON(!obj->filp)) - return -EINVAL; - ret = drm_gem_mmap_obj(obj, omap_gem_mmap_size(obj), vma); if (ret < 0) return ret; From 337ba7fbf0fbc12242359c4af114878618b90951 Mon Sep 17 00:00:00 2001 From: "Dmitry V.
Levin" Date: Sat, 25 Feb 2017 16:29:50 +0300 Subject: [PATCH 057/297] uapi: fix drm/omap_drm.h userspace compilation errors Consistently use types from linux/types.h like in other uapi drm/*_drm.h header files to fix the following drm/omap_drm.h userspace compilation errors: /usr/include/drm/omap_drm.h:36:2: error: unknown type name 'uint64_t' uint64_t param; /* in */ /usr/include/drm/omap_drm.h:37:2: error: unknown type name 'uint64_t' uint64_t value; /* in (set_param), out (get_param) */ /usr/include/drm/omap_drm.h:56:2: error: unknown type name 'uint32_t' uint32_t bytes; /* (for non-tiled formats) */ /usr/include/drm/omap_drm.h:58:3: error: unknown type name 'uint16_t' uint16_t width; /usr/include/drm/omap_drm.h:59:3: error: unknown type name 'uint16_t' uint16_t height; /usr/include/drm/omap_drm.h:65:2: error: unknown type name 'uint32_t' uint32_t flags; /* in */ /usr/include/drm/omap_drm.h:66:2: error: unknown type name 'uint32_t' uint32_t handle; /* out */ /usr/include/drm/omap_drm.h:67:2: error: unknown type name 'uint32_t' uint32_t __pad; /usr/include/drm/omap_drm.h:77:2: error: unknown type name 'uint32_t' uint32_t handle; /* buffer handle (in) */ /usr/include/drm/omap_drm.h:78:2: error: unknown type name 'uint32_t' uint32_t op; /* mask of omap_gem_op (in) */ /usr/include/drm/omap_drm.h:82:2: error: unknown type name 'uint32_t' uint32_t handle; /* buffer handle (in) */ /usr/include/drm/omap_drm.h:83:2: error: unknown type name 'uint32_t' uint32_t op; /* mask of omap_gem_op (in) */ /usr/include/drm/omap_drm.h:88:2: error: unknown type name 'uint32_t' uint32_t nregions; /usr/include/drm/omap_drm.h:89:2: error: unknown type name 'uint32_t' uint32_t __pad; /usr/include/drm/omap_drm.h:93:2: error: unknown type name 'uint32_t' uint32_t handle; /* buffer handle (in) */ /usr/include/drm/omap_drm.h:94:2: error: unknown type name 'uint32_t' uint32_t pad; /usr/include/drm/omap_drm.h:95:2: error: unknown type name 'uint64_t' uint64_t offset; /* mmap offset (out) */ /usr/include/drm/omap_drm.h:102:2: error: unknown type name 'uint32_t' uint32_t size; /* virtual size for mmap'ing (out) */ /usr/include/drm/omap_drm.h:103:2: error: unknown type name 'uint32_t' uint32_t __pad; Fixes: ef6503e89194 ("drm: Kbuild: add omap_drm.h to the installed headers") Signed-off-by: Dmitry V. Levin Signed-off-by: Tomi Valkeinen --- include/uapi/drm/omap_drm.h | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/include/uapi/drm/omap_drm.h b/include/uapi/drm/omap_drm.h index 407cb55df6ac..7fb97863c945 100644 --- a/include/uapi/drm/omap_drm.h +++ b/include/uapi/drm/omap_drm.h @@ -33,8 +33,8 @@ extern "C" { #define OMAP_PARAM_CHIPSET_ID 1 /* ie. 
0x3430, 0x4430, etc */ struct drm_omap_param { - uint64_t param; /* in */ - uint64_t value; /* in (set_param), out (get_param) */ + __u64 param; /* in */ + __u64 value; /* in (set_param), out (get_param) */ }; #define OMAP_BO_SCANOUT 0x00000001 /* scanout capable (phys contiguous) */ @@ -53,18 +53,18 @@ struct drm_omap_param { #define OMAP_BO_TILED (OMAP_BO_TILED_8 | OMAP_BO_TILED_16 | OMAP_BO_TILED_32) union omap_gem_size { - uint32_t bytes; /* (for non-tiled formats) */ + __u32 bytes; /* (for non-tiled formats) */ struct { - uint16_t width; - uint16_t height; + __u16 width; + __u16 height; } tiled; /* (for tiled formats) */ }; struct drm_omap_gem_new { union omap_gem_size size; /* in */ - uint32_t flags; /* in */ - uint32_t handle; /* out */ - uint32_t __pad; + __u32 flags; /* in */ + __u32 handle; /* out */ + __u32 __pad; }; /* mask of operations: */ @@ -74,33 +74,33 @@ enum omap_gem_op { }; struct drm_omap_gem_cpu_prep { - uint32_t handle; /* buffer handle (in) */ - uint32_t op; /* mask of omap_gem_op (in) */ + __u32 handle; /* buffer handle (in) */ + __u32 op; /* mask of omap_gem_op (in) */ }; struct drm_omap_gem_cpu_fini { - uint32_t handle; /* buffer handle (in) */ - uint32_t op; /* mask of omap_gem_op (in) */ + __u32 handle; /* buffer handle (in) */ + __u32 op; /* mask of omap_gem_op (in) */ /* TODO maybe here we pass down info about what regions are touched * by sw so we can be clever about cache ops? For now a placeholder, * set to zero and we just do full buffer flush.. */ - uint32_t nregions; - uint32_t __pad; + __u32 nregions; + __u32 __pad; }; struct drm_omap_gem_info { - uint32_t handle; /* buffer handle (in) */ - uint32_t pad; - uint64_t offset; /* mmap offset (out) */ + __u32 handle; /* buffer handle (in) */ + __u32 pad; + __u64 offset; /* mmap offset (out) */ /* note: in case of tiled buffers, the user virtual size can be * different from the physical size (ie. how many pages are needed * to back the object) which is returned in DRM_IOCTL_GEM_OPEN.. * This size here is the one that should be used if you want to * mmap() the buffer: */ - uint32_t size; /* virtual size for mmap'ing (out) */ - uint32_t __pad; + __u32 size; /* virtual size for mmap'ing (out) */ + __u32 __pad; }; #define DRM_OMAP_GET_PARAM 0x00 From fd89b23a4632d3cbdee398048497e026edadfb71 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 6 Mar 2017 00:02:52 +0800 Subject: [PATCH 058/297] netfilter: nft_set_bitmap: fetch the element key based on the set->klen Currently we just treat the element key as a u32 integer, regardless of the set key length. This is incorrect; for example, the tcp port number is only 16 bits. So when we use the nft_payload expr to get the tcp dport and store it to dreg, the dport will be stored at bits 0~15, and bits 16~31 will be padded with zero. So reg->data[dreg] will look like this: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | tcp dport | 0 | +-+-+-+-+-+-+-+-+-+-+-+-+ But on big-endian systems, if we treat this register as a u32 integer, the element key will be larger than 65535, so the following lookup in the bitmap set will cause an out-of-bound access. Another issue is that if we add an element with a comment to a bitmap set (although the comment will be ignored eventually), the element will vanish strangely.
Because we treat the element key as a u32 integer, the comment becomes part of the element key; the element key will then also be larger than 65535 and an out-of-bound access will happen: # nft add element t s { 1 comment test } Since set->klen is 1 or 2, it's fine to treat the element key as a u8 or u16 integer. Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_bitmap.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 152d226552c1..9b024e22717b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -45,9 +45,17 @@ struct nft_bitmap { u8 bitmap[]; }; -static inline void nft_bitmap_location(u32 key, u32 *idx, u32 *off) +static inline void nft_bitmap_location(const struct nft_set *set, + const void *key, + u32 *idx, u32 *off) { - u32 k = (key << 1); + u32 k; + + if (set->klen == 2) + k = *(u16 *)key; + else + k = *(u8 *)key; + k <<= 1; *idx = k / BITS_PER_BYTE; *off = k % BITS_PER_BYTE; @@ -69,7 +77,7 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_cur(net); u32 idx, off; - nft_bitmap_location(*key, &idx, &off); + nft_bitmap_location(set, key, &idx, &off); return nft_bitmap_active(priv->bitmap, idx, off, genmask); } @@ -83,7 +91,7 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) return -EEXIST; @@ -102,7 +110,7 @@ static void nft_bitmap_remove(const struct net *net, u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); /* Enter 00 state. */ priv->bitmap[idx] &= ~(genmask << off); } @@ -116,7 +124,7 @@ static void nft_bitmap_activate(const struct net *net, u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); } @@ -128,7 +136,7 @@ static bool nft_bitmap_flush(const struct net *net, u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); /* Enter 10 state, similar to deactivation. */ priv->bitmap[idx] &= ~(genmask << off); @@ -161,10 +169,9 @@ static void *nft_bitmap_deactivate(const struct net *net, struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); struct nft_set_ext *ext; - u32 idx, off, key = 0; + u32 idx, off; - memcpy(&key, elem->key.val.data, set->klen); - nft_bitmap_location(key, &idx, &off); + nft_bitmap_location(set, elem->key.val.data, &idx, &off); if (!nft_bitmap_active(priv->bitmap, idx, off, genmask)) return NULL; From 10596608c4d62cb8c1c2b806debcbd32fe657e71 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 8 Mar 2017 22:54:18 +0800 Subject: [PATCH 059/297] netfilter: nf_tables: fix mismatch in big-endian system Currently, there are two different methods to store a u16 integer to the u32 data register.
For example: u32 *dest = &regs->data[priv->dreg]; 1. *dest = 0; *(u16 *) dest = val_u16; 2. *dest = val_u16; For method 1, the u16 value will be stored like this, on both big-endian and little-endian systems: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | Value | 0 | +-+-+-+-+-+-+-+-+-+-+-+-+ For method 2, on little-endian systems, the u16 value will be the same as listed above. But on big-endian systems, the u16 value will be stored like this: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | 0 | Value | +-+-+-+-+-+-+-+-+-+-+-+-+ So later we use "memcmp(&regs->data[priv->sreg], data, 2);" to do the comparison in the nft_cmp, nft_lookup exprs and so on; method 2 will get the wrong result on big-endian systems, as bits 0~15 will always be zero. For a similar reason, when loading a u16 value from the u32 data register, we should use "*(u16 *) sreg;" instead of "(u16)*sreg;"; the second method will get the wrong value on big-endian systems. So introduce some wrapper functions to store/load a u8 or u16 integer to/from the u32 data register, and use them in the right places. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 29 +++++++++++++++++++++ net/ipv4/netfilter/nft_masq_ipv4.c | 8 +++--- net/ipv4/netfilter/nft_redir_ipv4.c | 8 +++--- net/ipv6/netfilter/nft_masq_ipv6.c | 8 +++--- net/ipv6/netfilter/nft_redir_ipv6.c | 8 +++--- net/netfilter/nft_ct.c | 18 +++++++------ net/netfilter/nft_meta.c | 40 +++++++++++++++-------------- net/netfilter/nft_nat.c | 8 +++--- 8 files changed, 80 insertions(+), 47 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2aa8a9d80fbe..70c5ca0c60b1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -103,6 +103,35 @@ struct nft_regs { }; }; +/* Store/load an u16 or u8 integer to/from the u32 data register. + * + * Note, when using concatenations, register allocation happens at 32-bit + * level. So for store instruction, pad the rest part with zero to avoid + * garbage values.
+ */ + +static inline void nft_reg_store16(u32 *dreg, u16 val) +{ + *dreg = 0; + *(u16 *)dreg = val; +} + +static inline void nft_reg_store8(u32 *dreg, u8 val) +{ + *dreg = 0; + *(u8 *)dreg = val; +} + +static inline u16 nft_reg_load16(u32 *sreg) +{ + return *(u16 *)sreg; +} + +static inline u8 nft_reg_load8(u32 *sreg) +{ + return *(u8 *)sreg; +} + static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index a0ea8aad1bf1..f18677277119 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -26,10 +26,10 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)&regs->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)&regs->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_max]); } regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index 1650ed23c15d..5120be1d3118 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -26,10 +26,10 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, memset(&mr, 0, sizeof(mr)); if (priv->sreg_proto_min) { - mr.range[0].min.all = - *(__be16 *)&regs->data[priv->sreg_proto_min]; - mr.range[0].max.all = - *(__be16 *)&regs->data[priv->sreg_proto_max]; + mr.range[0].min.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_min]); + mr.range[0].max.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_max]); mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 6c5b5b1830a7..4146536e9c15 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -27,10 +27,10 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)&regs->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)&regs->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_max]); } regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, nft_out(pkt)); diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index f5ac080fc084..a27e424f690d 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -26,10 +26,10 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)&regs->data[priv->sreg_proto_min], - range.max_proto.all = - *(__be16 *)&regs->data[priv->sreg_proto_max], + range.min_proto.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_max]); range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index bf548a7a71ec..91585b5e5307 ---
a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -83,7 +83,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, switch (priv->key) { case NFT_CT_DIRECTION: - *dest = CTINFO2DIR(ctinfo); + nft_reg_store8(dest, CTINFO2DIR(ctinfo)); return; case NFT_CT_STATUS: *dest = ct->status; @@ -151,20 +151,22 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; } case NFT_CT_L3PROTOCOL: - *dest = nf_ct_l3num(ct); + nft_reg_store8(dest, nf_ct_l3num(ct)); return; case NFT_CT_PROTOCOL: - *dest = nf_ct_protonum(ct); + nft_reg_store8(dest, nf_ct_protonum(ct)); return; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + u16 zoneid; if (priv->dir < IP_CT_DIR_MAX) - *dest = nf_ct_zone_id(zone, priv->dir); + zoneid = nf_ct_zone_id(zone, priv->dir); else - *dest = zone->id; + zoneid = zone->id; + nft_reg_store16(dest, zoneid); return; } #endif @@ -183,10 +185,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_PROTO_SRC: - *dest = (__force __u16)tuple->src.u.all; + nft_reg_store16(dest, (__force u16)tuple->src.u.all); return; case NFT_CT_PROTO_DST: - *dest = (__force __u16)tuple->dst.u.all; + nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; default: break; @@ -205,7 +207,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; - u16 value = regs->data[priv->sreg]; + u16 value = nft_reg_load16(&regs->data[priv->sreg]); struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e1f5ca9b423b..7b60e01f38ff 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -45,16 +45,15 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = skb->len; break; case NFT_META_PROTOCOL: - *dest = 0; - *(__be16 *)dest = skb->protocol; + nft_reg_store16(dest, (__force u16)skb->protocol); break; case NFT_META_NFPROTO: - *dest = nft_pf(pkt); + nft_reg_store8(dest, nft_pf(pkt)); break; case NFT_META_L4PROTO: if (!pkt->tprot_set) goto err; - *dest = pkt->tprot; + nft_reg_store8(dest, pkt->tprot); break; case NFT_META_PRIORITY: *dest = skb->priority; @@ -85,14 +84,12 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_IIFTYPE: if (in == NULL) goto err; - *dest = 0; - *(u16 *)dest = in->type; + nft_reg_store16(dest, in->type); break; case NFT_META_OIFTYPE: if (out == NULL) goto err; - *dest = 0; - *(u16 *)dest = out->type; + nft_reg_store16(dest, out->type); break; case NFT_META_SKUID: sk = skb_to_full_sk(skb); @@ -142,19 +139,19 @@ void nft_meta_get_eval(const struct nft_expr *expr, #endif case NFT_META_PKTTYPE: if (skb->pkt_type != PACKET_LOOPBACK) { - *dest = skb->pkt_type; + nft_reg_store8(dest, skb->pkt_type); break; } switch (nft_pf(pkt)) { case NFPROTO_IPV4: if (ipv4_is_multicast(ip_hdr(skb)->daddr)) - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); else - *dest = PACKET_BROADCAST; + nft_reg_store8(dest, PACKET_BROADCAST); break; case NFPROTO_IPV6: - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); break; case NFPROTO_NETDEV: switch (skb->protocol) { @@ -168,14 +165,14 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; if (ipv4_is_multicast(iph->daddr)) - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); else - *dest = PACKET_BROADCAST; + nft_reg_store8(dest,
PACKET_BROADCAST); break; } case htons(ETH_P_IPV6): - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); break; default: WARN_ON_ONCE(1); @@ -230,7 +227,9 @@ void nft_meta_set_eval(const struct nft_expr *expr, { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; - u32 value = regs->data[meta->sreg]; + u32 *sreg = &regs->data[meta->sreg]; + u32 value = *sreg; + u8 pkt_type; switch (meta->key) { case NFT_META_MARK: @@ -240,9 +239,12 @@ void nft_meta_set_eval(const struct nft_expr *expr, skb->priority = value; break; case NFT_META_PKTTYPE: - if (skb->pkt_type != value && - skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type)) - skb->pkt_type = value; + pkt_type = nft_reg_load8(sreg); + + if (skb->pkt_type != pkt_type && + skb_pkt_type_ok(pkt_type) && + skb_pkt_type_ok(skb->pkt_type)) + skb->pkt_type = pkt_type; break; case NFT_META_NFTRACE: skb->nf_trace = !!value; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 19a7bf3236f9..439e0bd152a0 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -65,10 +65,10 @@ static void nft_nat_eval(const struct nft_expr *expr, } if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)&regs->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)&regs->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg_proto_max]); range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } From 4ca60d08cbe65f501baad64af50fceba79c19fbb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Mar 2017 23:22:30 +0100 Subject: [PATCH 060/297] netfilter: bridge: honor frag_max_size when refragmenting Consider a bridge with an MTU of 9000, but an end host sending smaller packets to another host with an MTU below 9000. In this case, after reassembly, bridge+defrag would refragment, and then attempt to send the reassembled packet as long as it was below 9k. Instead, we have to cap it at the largest fragment size seen. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 95087e6e8258..3c5185021c1c 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -721,18 +721,20 @@ static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge; - unsigned int mtu_reserved; + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + unsigned int mtu, mtu_reserved; mtu_reserved = nf_bridge_mtu_reduction(skb); + mtu = skb->dev->mtu; - if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { + if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) + mtu = nf_bridge->frag_max_size; + + if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } - nf_bridge = nf_bridge_info_get(skb); - /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting.
*/ From 170a1fb9c01bc40b7e8fd57a32ac9a0e131ec5b6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 11 Mar 2017 00:25:26 -0500 Subject: [PATCH 061/297] netfilter: Force fake conntrack entry to be at least 8 bytes aligned Since the nfct and nfctinfo have been combined, the nf_conn structure must be at least 8 bytes aligned, as the 3 LSB bits are used for the nfctinfo. But there's a fake nf_conn structure to denote untracked connections, which is created by a PER_CPU construct. This does not guarantee that it will be 8 bytes aligned and can break the logic in determining the correct nfctinfo. I triggered this on a 32bit machine with the following error: BUG: unable to handle kernel NULL pointer dereference at 00000af4 IP: nf_ct_deliver_cached_events+0x1b/0xfb *pdpt = 0000000031962001 *pde = 0000000000000000 Oops: 0000 [#1] SMP [Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ipv6 crc_ccitt ppdev r8169 parport_pc parport OK ] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-test+ #75 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 task: c126ec00 task.stack: c1258000 EIP: nf_ct_deliver_cached_events+0x1b/0xfb EFLAGS: 00010202 CPU: 0 EAX: 0021cd01 EBX: 00000000 ECX: 27b0c767 EDX: 32bcb17a ESI: f34135c0 EDI: f34135c0 EBP: f2debd60 ESP: f2debd3c DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: 00000af4 CR3: 309a0440 CR4: 001406f0 Call Trace: ? ipv6_skip_exthdr+0xac/0xcb ipv6_confirm+0x10c/0x119 [nf_conntrack_ipv6] nf_hook_slow+0x22/0xc7 nf_hook+0x9a/0xad [ipv6] ? ip6t_do_table+0x356/0x379 [ip6_tables] ? ip6_fragment+0x9e9/0x9e9 [ipv6] ip6_output+0xee/0x107 [ipv6] ? ip6_fragment+0x9e9/0x9e9 [ipv6] dst_output+0x36/0x4d [ipv6] NF_HOOK.constprop.37+0xb2/0xba [ipv6] ? icmp6_dst_alloc+0x2c/0xfd [ipv6] ? local_bh_enable+0x14/0x14 [ipv6] mld_sendpack+0x1c5/0x281 [ipv6] ? mark_held_locks+0x40/0x5c mld_ifc_timer_expire+0x1f6/0x21e [ipv6] call_timer_fn+0x135/0x283 ? detach_if_pending+0x55/0x55 ? mld_dad_timer_expire+0x3e/0x3e [ipv6] __run_timers+0x111/0x14b ? mld_dad_timer_expire+0x3e/0x3e [ipv6] run_timer_softirq+0x1c/0x36 __do_softirq+0x185/0x37c ? test_ti_thread_flag.constprop.19+0xd/0xd do_softirq_own_stack+0x22/0x28 irq_exit+0x5a/0xa4 smp_apic_timer_interrupt+0x2a/0x34 apic_timer_interrupt+0x37/0x3c By using DEFINE/DECLARE_PER_CPU_ALIGNED we can enforce at least 8 byte alignment as all cache line sizes are at least 8 bytes or more. 
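A minimal sketch of the pointer-tagging scheme this message describes: with at least 8-byte alignment guaranteed, the 3 LSBs of the pointer are free to carry the ctinfo. Only the bit trick mirrors the kernel's; the names below are illustrative.

#include <stdint.h>
#include <stdio.h>

struct conn { int state; };		/* stand-in for struct nf_conn */

#define INFO_MASK 7UL			/* 3 LSBs carry the ctinfo */

static uintptr_t pack(const struct conn *ct, unsigned int info)
{
	return (uintptr_t)ct | (info & INFO_MASK);
}

static struct conn *unpack(uintptr_t v, unsigned int *info)
{
	*info = v & INFO_MASK;
	return (struct conn *)(v & ~INFO_MASK);
}

int main(void)
{
	/* Without the forced 8-byte alignment, a misaligned object would
	 * have nonzero low bits and the tag would corrupt the pointer. */
	static _Alignas(8) struct conn untracked = { 42 };
	unsigned int info;
	uintptr_t v = pack(&untracked, 5);
	struct conn *ct = unpack(v, &info);

	printf("state=%d info=%u\n", ct->state, info);	/* state=42 info=5 */
	return 0;
}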
Fixes: a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage area") Signed-off-by: Steven Rostedt (VMware) Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f540f9ad2af4..19605878da47 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -244,7 +244,7 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, u32 seq); /* Fake conntrack entry for untracked connections */ -DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +DECLARE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); static inline struct nf_conn *nf_ct_untracked_get(void) { return raw_cpu_ptr(&nf_conntrack_untracked); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 071b97fcbefb..ffb78e5f7b70 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -181,7 +181,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; seqcount_t nf_conntrack_generation __read_mostly; -DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used + * for the nfctinfo. We cheat by (ab)using the PER CPU cache line + * alignment to enforce this. + */ +DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); static unsigned int nf_conntrack_hash_rnd __read_mostly; From e920dde5160887d07b738f5a7f593b1fa9b1e32e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Mar 2017 18:32:31 +0100 Subject: [PATCH 062/297] netfilter: nft_set_bitmap: keep a list of dummy elements Element comments may come without any prior set flag, so we have to keep a list of dummy struct nft_set_ext to keep this information around. This is only useful for set dumps to userspace. From the packet path, this set type relies on the bitmap representation. This patch simplifies the logic since we don't need to allocate the dummy nft_set_ext structure anymore on the fly at the cost of increasing memory consumption because of the list of dummy struct nft_set_ext. Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_bitmap.c | 138 +++++++++++++++------------------ 1 file changed, 62 insertions(+), 76 deletions(-) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 9b024e22717b..8ebbc2940f4c 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -15,6 +15,11 @@ #include #include +struct nft_bitmap_elem { + struct list_head head; + struct nft_set_ext ext; +}; + /* This bitmap uses two bits to represent one element. These two bits determine * the element state in the current and the future generation. * @@ -41,8 +46,9 @@ * restore its previous state. 
*/ struct nft_bitmap { - u16 bitmap_size; - u8 bitmap[]; + struct list_head list; + u16 bitmap_size; + u8 bitmap[]; }; static inline void nft_bitmap_location(const struct nft_set *set, @@ -82,21 +88,43 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, return nft_bitmap_active(priv->bitmap, idx, off, genmask); } +static struct nft_bitmap_elem * +nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, + u8 genmask) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *be; + + list_for_each_entry_rcu(be, &priv->list, head) { + if (memcmp(nft_set_ext_key(&be->ext), + nft_set_ext_key(&this->ext), set->klen) || + !nft_set_elem_active(&be->ext, genmask)) + continue; + + return be; + } + return NULL; +} + static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **_ext) + struct nft_set_ext **ext) { struct nft_bitmap *priv = nft_set_priv(set); - struct nft_set_ext *ext = elem->priv; + struct nft_bitmap_elem *new = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); - if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) + be = nft_bitmap_elem_find(set, new, genmask); + if (be) { + *ext = &be->ext; return -EEXIST; + } + nft_bitmap_location(set, nft_set_ext_key(&new->ext), &idx, &off); /* Enter 01 state. */ priv->bitmap[idx] |= (genmask << off); + list_add_tail_rcu(&new->head, &priv->list); return 0; } @@ -106,13 +134,14 @@ static void nft_bitmap_remove(const struct net *net, const struct nft_set_elem *elem) { struct nft_bitmap *priv = nft_set_priv(set); - struct nft_set_ext *ext = elem->priv; + struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 00 state. */ priv->bitmap[idx] &= ~(genmask << off); + list_del_rcu(&be->head); } static void nft_bitmap_activate(const struct net *net, @@ -120,73 +149,52 @@ static void nft_bitmap_activate(const struct net *net, const struct nft_set_elem *elem) { struct nft_bitmap *priv = nft_set_priv(set); - struct nft_set_ext *ext = elem->priv; + struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); + nft_set_elem_change_active(net, set, &be->ext); } static bool nft_bitmap_flush(const struct net *net, - const struct nft_set *set, void *ext) + const struct nft_set *set, void *_be) { struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); + struct nft_bitmap_elem *be = _be; u32 idx, off; - nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 10 state, similar to deactivation. 
*/ priv->bitmap[idx] &= ~(genmask << off); + nft_set_elem_change_active(net, set, &be->ext); return true; } -static struct nft_set_ext *nft_bitmap_ext_alloc(const struct nft_set *set, - const struct nft_set_elem *elem) -{ - struct nft_set_ext_tmpl tmpl; - struct nft_set_ext *ext; - - nft_set_ext_prepare(&tmpl); - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); - - ext = kzalloc(tmpl.len, GFP_KERNEL); - if (!ext) - return NULL; - - nft_set_ext_init(ext, &tmpl); - memcpy(nft_set_ext_key(ext), elem->key.val.data, set->klen); - - return ext; -} - static void *nft_bitmap_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *this = elem->priv, *be; u8 genmask = nft_genmask_next(net); - struct nft_set_ext *ext; u32 idx, off; nft_bitmap_location(set, elem->key.val.data, &idx, &off); - if (!nft_bitmap_active(priv->bitmap, idx, off, genmask)) - return NULL; - - /* We have no real set extension since this is a bitmap, allocate this - * dummy object that is released from the commit/abort path. - */ - ext = nft_bitmap_ext_alloc(set, elem); - if (!ext) + be = nft_bitmap_elem_find(set, this, genmask); + if (!be) return NULL; /* Enter 10 state. */ priv->bitmap[idx] &= ~(genmask << off); + nft_set_elem_change_active(net, set, &be->ext); - return ext; + return be; } static void nft_bitmap_walk(const struct nft_ctx *ctx, @@ -194,47 +202,23 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, struct nft_set_iter *iter) { const struct nft_bitmap *priv = nft_set_priv(set); - struct nft_set_ext_tmpl tmpl; + struct nft_bitmap_elem *be; struct nft_set_elem elem; - struct nft_set_ext *ext; - int idx, off; - u16 key; - nft_set_ext_prepare(&tmpl); - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + list_for_each_entry_rcu(be, &priv->list, head) { + if (iter->count < iter->skip) + goto cont; + if (!nft_set_elem_active(&be->ext, iter->genmask)) + goto cont; - for (idx = 0; idx < priv->bitmap_size; idx++) { - for (off = 0; off < BITS_PER_BYTE; off += 2) { - if (iter->count < iter->skip) - goto cont; + elem.priv = be; - if (!nft_bitmap_active(priv->bitmap, idx, off, - iter->genmask)) - goto cont; + iter->err = iter->fn(ctx, set, iter, &elem); - ext = kzalloc(tmpl.len, GFP_KERNEL); - if (!ext) { - iter->err = -ENOMEM; - return; - } - nft_set_ext_init(ext, &tmpl); - key = ((idx * BITS_PER_BYTE) + off) >> 1; - memcpy(nft_set_ext_key(ext), &key, set->klen); - - elem.priv = ext; - iter->err = iter->fn(ctx, set, iter, &elem); - - /* On set flush, this dummy extension object is released - * from the commit/abort path. 
- */ - if (!iter->flush) - kfree(ext); - - if (iter->err < 0) - return; + if (iter->err < 0) + return; cont: - iter->count++; - } + iter->count++; } } @@ -265,6 +249,7 @@ static int nft_bitmap_init(const struct nft_set *set, { struct nft_bitmap *priv = nft_set_priv(set); + INIT_LIST_HEAD(&priv->list); priv->bitmap_size = nft_bitmap_size(set->klen); return 0; @@ -290,6 +275,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, static struct nft_set_ops nft_bitmap_ops __read_mostly = { .privsize = nft_bitmap_privsize, + .elemsize = offsetof(struct nft_bitmap_elem, ext), .estimate = nft_bitmap_estimate, .init = nft_bitmap_init, .destroy = nft_bitmap_destroy, From 0067d4b020ea07a58540acb2c5fcd3364bf326e0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 13 Mar 2017 16:10:11 +0200 Subject: [PATCH 063/297] blk-mq: Fix tagset reinit in the presence of cpu hot-unplug In case a CPU was unplugged, we need to make sure not to assume that the tags for that CPU are still allocated. So check for null tags when reinitializing a tagset. Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e48bc2c72615..9d97bfc4d465 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -295,6 +295,9 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) for (i = 0; i < set->nr_hw_queues; i++) { struct blk_mq_tags *tags = set->tags[i]; + if (!tags) + continue; + for (j = 0; j < tags->nr_tags; j++) { if (!tags->static_rqs[j]) continue; From 4a3a485b1ed0e109718cc8c9d094fa0f552de9b2 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Fri, 10 Mar 2017 12:09:49 -0800 Subject: [PATCH 064/297] writeback: fix memory leak in wb_queue_work() When WB_registered flag is not set, wb_queue_work() skips queuing the work, but does not perform the necessary clean up. In particular, if work->auto_free is true, it should free the memory. The leak condition can be reproduced by following these steps: mount /dev/sdb /mnt/sdb /* In qemu console: device_del sdb */ umount /dev/sdb The above will result in a wb_queue_work() call on an unregistered wb and thus leak memory.
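The shape of the fix: when queuing is refused, run the same completion path the worker would have run, so auto_free memory is never orphaned. A self-contained model of that pattern (not the writeback code itself) follows.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	bool auto_free;
	struct work *next;
};

static struct work *queue_head;
static bool registered;			/* stands in for WB_registered */

static void finish_work(struct work *w)
{
	if (w->auto_free)
		free(w);		/* cleanup shared with the worker */
}

static void queue_work_item(struct work *w)
{
	if (registered) {
		w->next = queue_head;	/* normal path: enqueue */
		queue_head = w;
	} else {
		finish_work(w);		/* would previously just leak */
	}
}

int main(void)
{
	struct work *w = malloc(sizeof(*w));

	w->auto_free = true;
	registered = false;		/* device already unregistered */
	queue_work_item(w);		/* freed instead of leaked */
	printf("no leak\n");
	return 0;
}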
Reported-by: John Sperbeck Signed-off-by: Tahsin Erdogan Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ef600591d96f..63ee2940775c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -173,19 +173,33 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } +static void finish_writeback_work(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + struct wb_completion *done = work->done; + + if (work->auto_free) + kfree(work); + if (done && atomic_dec_and_test(&done->cnt)) + wake_up_all(&wb->bdi->wb_waitq); +} + static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { trace_writeback_queue(wb, work); - spin_lock_bh(&wb->work_lock); - if (!test_bit(WB_registered, &wb->state)) - goto out_unlock; if (work->done) atomic_inc(&work->done->cnt); - list_add_tail(&work->list, &wb->work_list); - mod_delayed_work(bdi_wq, &wb->dwork, 0); -out_unlock: + + spin_lock_bh(&wb->work_lock); + + if (test_bit(WB_registered, &wb->state)) { + list_add_tail(&work->list, &wb->work_list); + mod_delayed_work(bdi_wq, &wb->dwork, 0); + } else + finish_writeback_work(wb, work); + spin_unlock_bh(&wb->work_lock); } @@ -1873,16 +1887,9 @@ static long wb_do_writeback(struct bdi_writeback *wb) set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { - struct wb_completion *done = work->done; - trace_writeback_exec(wb, work); - wrote += wb_writeback(wb, work); - - if (work->auto_free) - kfree(work); - if (done && atomic_dec_and_test(&done->cnt)) - wake_up_all(&wb->bdi->wb_waitq); + finish_writeback_work(wb, work); } /* From 6f1f622019f95d79d6e2f8bb3781144ad0aff75f Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 6 Mar 2017 17:49:42 +0800 Subject: [PATCH 065/297] nfs4: fix a typo of NFS_ATTR_FATTR_GROUP_NAME This typo cause a memory leak, and a bad client's group id. unreferenced object 0xffff96d8073998d0 (size 8): comm "kworker/0:3", pid 34224, jiffies 4295361338 (age 761.752s) hex dump (first 8 bytes): 30 00 39 07 d8 96 ff ff 0.9..... 
backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc+0x140/0x220 [] xdr_stream_decode_string_dup+0x7c/0x110 [sunrpc] [] decode_getfattr_attrs+0x940/0x1630 [nfsv4] [] decode_getfattr_generic.constprop.108+0x9b/0x100 [nfsv4] [] nfs4_xdr_dec_open+0xcf/0x100 [nfsv4] [] rpcauth_unwrap_resp+0xa7/0xe0 [sunrpc] [] call_decode+0x1e0/0x810 [sunrpc] [] __rpc_execute+0x8d/0x420 [sunrpc] [] rpc_async_schedule+0x12/0x20 [sunrpc] [] process_one_work+0x197/0x430 [] worker_thread+0x4e/0x4a0 [] kthread+0x101/0x140 [] ret_from_fork+0x2c/0x40 [] 0xffffffffffffffff Fixes: 686a816ab6 ("NFSv4: Clean up owner/group attribute decode") Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f0369e362753..80ce289eea05 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3942,7 +3942,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, group_name->data); - return NFS_ATTR_FATTR_OWNER_NAME; + return NFS_ATTR_FATTR_GROUP_NAME; } else { len = xdr_stream_decode_opaque_inline(xdr, (void **)&p, XDR_MAX_NETOBJ); From 366a1569bff3fe14abfdf9285e31e05e091745f5 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 6 Mar 2017 22:29:14 +0800 Subject: [PATCH 066/297] NFSv4: fix a reference leak caused WARNING messages Because nfs4_opendata_access() closes the state when access is denied, the state isn't leaked. Rather than reverting commit a974deee47, clean up the strange state close. [ 1615.094218] ------------[ cut here ]------------ [ 1615.094607] WARNING: CPU: 0 PID: 23702 at lib/list_debug.c:31 __list_add_valid+0x8e/0xa0 [ 1615.094913] list_add double add: new=ffff9d7901d9f608, prev=ffff9d7901d9f608, next=ffff9d7901ee8dd0. [ 1615.095458] Modules linked in: nfsv4(E) nfs(E) nfsd(E) tun bridge stp llc fuse ip_set nfnetlink vmw_vsock_vmci_transport vsock f2fs snd_seq_midi snd_seq_midi_event fscrypto coretemp ppdev crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_rapl_perf vmw_balloon snd_ens1371 joydev gameport snd_ac97_codec ac97_bus snd_seq snd_pcm snd_rawmidi snd_timer snd_seq_device snd soundcore nfit parport_pc parport acpi_cpufreq tpm_tis tpm_tis_core tpm i2c_piix4 vmw_vmci shpchp auth_rpcgss nfs_acl lockd(E) grace sunrpc(E) xfs libcrc32c vmwgfx drm_kms_helper ttm drm crc32c_intel mptspi e1000 serio_raw scsi_transport_spi mptscsih mptbase ata_generic pata_acpi fjes [last unloaded: nfs] [ 1615.097663] CPU: 0 PID: 23702 Comm: fstest Tainted: G W E 4.11.0-rc1+ #517 [ 1615.098015] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 1615.098807] Call Trace: [ 1615.099183] dump_stack+0x63/0x86 [ 1615.099578] __warn+0xcb/0xf0 [ 1615.099967] warn_slowpath_fmt+0x5f/0x80 [ 1615.100370] __list_add_valid+0x8e/0xa0 [ 1615.100760] nfs4_put_state_owner+0x75/0xc0 [nfsv4] [ 1615.101136] __nfs4_close+0x109/0x140 [nfsv4] [ 1615.101524] nfs4_close_state+0x15/0x20 [nfsv4] [ 1615.101949] nfs4_close_context+0x21/0x30 [nfsv4] [ 1615.102691] __put_nfs_open_context+0xb8/0x110 [nfs] [ 1615.103155] put_nfs_open_context+0x10/0x20 [nfs] [ 1615.103586] nfs4_file_open+0x13b/0x260 [nfsv4] [ 1615.103978] do_dentry_open+0x20a/0x2f0 [ 1615.104369] ? nfs4_copy_file_range+0x30/0x30 [nfsv4] [ 1615.104739] vfs_open+0x4c/0x70 [ 1615.105106] ? may_open+0x5a/0x100 [ 1615.105469] path_openat+0x623/0x1420 [ 1615.105823] do_filp_open+0x91/0x100 [ 1615.106174] ?
__alloc_fd+0x3f/0x170 [ 1615.106568] do_sys_open+0x130/0x220 [ 1615.106920] ? __put_cred+0x3d/0x50 [ 1615.107256] SyS_open+0x1e/0x20 [ 1615.107588] entry_SYSCALL_64_fastpath+0x1a/0xa9 [ 1615.107922] RIP: 0033:0x7fab599069b0 [ 1615.108247] RSP: 002b:00007ffcf0600d78 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 [ 1615.108575] RAX: ffffffffffffffda RBX: 00007fab59bcfae0 RCX: 00007fab599069b0 [ 1615.108896] RDX: 0000000000000200 RSI: 0000000000000200 RDI: 00007ffcf060255e [ 1615.109211] RBP: 0000000000040010 R08: 0000000000000000 R09: 0000000000000016 [ 1615.109515] R10: 00000000000006a1 R11: 0000000000000246 R12: 0000000000041000 [ 1615.109806] R13: 0000000000040010 R14: 0000000000001000 R15: 0000000000002710 [ 1615.110152] ---[ end trace 96ed63b1306bf2f3 ]--- Fixes: a974deee47 ("NFSv4: Fix memory and state leak in...") Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1b183686c6d4..c1f5369cd339 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2258,8 +2258,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred, if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) return 0; - /* even though OPEN succeeded, access is denied. Close the file */ - nfs4_close_state(state, fmode); return -EACCES; } From aac66bf5f916f645bd57029490a72c3f91f2c274 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 6 Mar 2017 23:54:01 +0000 Subject: [PATCH 067/297] drm/i915: use correct node for handling cache domain eviction It looks like we were incorrectly comparing vma->node against itself instead of the target node, when evicting for a node on systems where we need guard pages between regions with different cache domains. As a consequence we can end up trying to needlessly evict neighbouring nodes, even if they have the same cache domain, and if they were pinned we would fail the eviction. Fixes: 625d988acc28 ("drm/i915: Extract reserving space in the GTT to a helper") Signed-off-by: Matthew Auld Cc: Chris Wilson Cc: Joonas Lahtinen Reviewed-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/20170306235414.23407-3-matthew.auld@intel.com Signed-off-by: Chris Wilson (cherry picked from commit fe65cbdbc97929e4a522716ed279a36783656142) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem_evict.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c index c181b1bb3d2c..3be2503aa042 100644 --- a/drivers/gpu/drm/i915/i915_gem_evict.c +++ b/drivers/gpu/drm/i915/i915_gem_evict.c @@ -293,12 +293,12 @@ int i915_gem_evict_for_node(struct i915_address_space *vm, * those as well to make room for our guard pages. 
*/ if (check_color) { - if (vma->node.start + vma->node.size == node->start) { - if (vma->node.color == node->color) + if (node->start + node->size == target->start) { + if (node->color == target->color) continue; } - if (vma->node.start == node->start + node->size) { - if (vma->node.color == node->color) + if (node->start == target->start + target->size) { + if (node->color == target->color) continue; } } From 3a0d137de035cc8c70194d9988ded61825b5ff8a Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 8 Mar 2017 13:00:07 +0100 Subject: [PATCH 068/297] drm/i915: Nuke skl_update_plane debug message from the pipe update critical section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Printks are slow, so we should not be doing them from the vblank evade critical section. These could explain why we sometimes seem to blow past our 100 usec deadline. The problem has been there ever since commit c331879ce8ea ("drm/i915: skylake sprite plane scaling using shared scalers.") but it may not have been readily visible until commit e1edbd44e23b ("drm/i915: Complain if we take too long under vblank evasion.") increased our chances of noticing it. Signed-off-by: Maarten Lankhorst Cc: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1488974407-25175-1-git-send-email-maarten.lankhorst@linux.intel.com Fixes: c331879ce8ea ("drm/i915: skylake sprite plane scaling using shared scalers") Cc: # v4.2+ Reviewed-by: Ville Syrjälä [mlankhorst: Add missing tags, point to the correct offending commit] (cherry picked from commit d38146b9ee16264ff9a88bf3391ab9f2f5af3646) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_sprite.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index 9ef54688872a..9481ca9a3ae7 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -254,9 +254,6 @@ skl_update_plane(struct drm_plane *drm_plane, int scaler_id = plane_state->scaler_id; const struct intel_scaler *scaler; - DRM_DEBUG_KMS("plane = %d PS_PLANE_SEL(plane) = 0x%x\n", - plane_id, PS_PLANE_SEL(plane_id)); - scaler = &crtc_state->scaler_state.scalers[scaler_id]; I915_WRITE(SKL_PS_CTRL(pipe, scaler_id), From 6aef660370a9c246956ba6d01eebd8063c4214cb Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 10 Mar 2017 09:32:49 +0000 Subject: [PATCH 069/297] drm/i915: Fix forcewake active domain tracking In commit 003342a50021 ("drm/i915: Keep track of active forcewake domains in a bitmask") I forgot to adjust the newly introduced fw_domains_active state across reset. This caused assert_forcewakes_inactive() to trigger during suspend and resume if there were user-held forcewakes. v2: Bitmask checks are required since vfuncs are not always present. v3: Move bitmask tracking to get/put vfunc for simplicity.
(Chris Wilson) Signed-off-by: Tvrtko Ursulin Fixes: 003342a50021 ("drm/i915: Keep track of active forcewake domains in a bitmask") Testcase: igt/drv_suspend/forcewake Cc: Tvrtko Ursulin Cc: "Paneri, Praveen" Cc: Chris Wilson Cc: Daniel Vetter Cc: Jani Nikula Cc: intel-gfx@lists.freedesktop.org Cc: v4.10+ Reviewed-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/20170310093249.4484-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit b8473050805f35add97f3ff57570d55a01808df5) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_uncore.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index abe08885a5ba..b7ff592b14f5 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -119,6 +119,8 @@ fw_domains_get(struct drm_i915_private *dev_priv, enum forcewake_domains fw_doma for_each_fw_domain_masked(d, fw_domains, dev_priv) fw_domain_wait_ack(d); + + dev_priv->uncore.fw_domains_active |= fw_domains; } static void @@ -130,6 +132,8 @@ fw_domains_put(struct drm_i915_private *dev_priv, enum forcewake_domains fw_doma fw_domain_put(d); fw_domain_posting_read(d); } + + dev_priv->uncore.fw_domains_active &= ~fw_domains; } static void @@ -240,10 +244,8 @@ intel_uncore_fw_release_timer(struct hrtimer *timer) if (WARN_ON(domain->wake_count == 0)) domain->wake_count++; - if (--domain->wake_count == 0) { + if (--domain->wake_count == 0) dev_priv->uncore.funcs.force_wake_put(dev_priv, domain->mask); - dev_priv->uncore.fw_domains_active &= ~domain->mask; - } spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); @@ -454,10 +456,8 @@ static void __intel_uncore_forcewake_get(struct drm_i915_private *dev_priv, fw_domains &= ~domain->mask; } - if (fw_domains) { + if (fw_domains) dev_priv->uncore.funcs.force_wake_get(dev_priv, fw_domains); - dev_priv->uncore.fw_domains_active |= fw_domains; - } } /** @@ -968,7 +968,6 @@ static noinline void ___force_wake_auto(struct drm_i915_private *dev_priv, fw_domain_arm_timer(domain); dev_priv->uncore.funcs.force_wake_get(dev_priv, fw_domains); - dev_priv->uncore.fw_domains_active |= fw_domains; } static inline void __force_wake_auto(struct drm_i915_private *dev_priv, From 04166f48d9593af4513ae06c0f966c0cee300a20 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Mar 2017 13:24:03 +0100 Subject: [PATCH 070/297] Revert "netfilter: nf_tables: add flush field to struct nft_set_iter" This reverts commit 1f48ff6c5393aa7fe290faf5d633164f105b0aa7. This patch is not required anymore now that we keep a dummy list of set elements in the bitmap set implementation, so revert this before we forget this code has no clients. 
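For context on why the flush flag could go: the nf_tables set walk is callback-driven, so once the bitmap backend keeps its elements on a real list, per-element deletion can live entirely inside the callback. A rough editor-added model of that walk in plain C (names loosely modeled on nft_set_iter, not the actual kernel types):

struct elem_model { struct elem_model *next; };

struct set_iter_model {
	unsigned int count, skip;
	int err;
	int (*fn)(struct set_iter_model *iter, struct elem_model *e);
};

/* Generic walk: all policy (dump, bind check, flush) lives in iter->fn,
 * so the walker itself needs no per-mode flags. */
static void set_walk_model(struct elem_model *head, struct set_iter_model *iter)
{
	struct elem_model *e;

	for (e = head; e; e = e->next) {
		if (iter->count < iter->skip) {
			iter->count++;
			continue;
		}
		iter->err = iter->fn(iter, e);
		if (iter->err < 0)
			return;
		iter->count++;
	}
}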
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - net/netfilter/nf_tables_api.c | 4 ---- 2 files changed, 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 70c5ca0c60b1..0136028652bd 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -232,7 +232,6 @@ struct nft_set_elem { struct nft_set; struct nft_set_iter { u8 genmask; - bool flush; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5e0ccfd5bb37..434c739dfeca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3145,7 +3145,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; - iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) @@ -3399,7 +3398,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; - args.iter.flush = false; set->ops->walk(&ctx, set, &args.iter); nla_nest_end(skb, nest); @@ -3963,7 +3961,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct nft_set_iter iter = { .genmask = genmask, .fn = nft_flush_set, - .flush = true, }; set->ops->walk(&ctx, set, &iter); @@ -5114,7 +5111,6 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, iter.count = 0; iter.err = 0; iter.fn = nf_tables_loop_check_setelem; - iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) From c5f7c5a9a0f84c511a8a336491f9b8a3060b6517 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 6 Mar 2017 16:21:16 +0200 Subject: [PATCH 071/297] drivers, xen: convert grant_map.users from atomic_t to refcount_t The refcount_t type and its corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This makes it possible to avoid accidental refcounter overflows that might lead to use-after-free situations.
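A hedged sketch of the conversion pattern applied in the diff below (kernel-style, generic names; editor-added, not part of the patch itself):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t users;
	/* ... payload ... */
};

static struct obj *obj_alloc(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (o)
		refcount_set(&o->users, 1);	/* was: atomic_set(..., 1) */
	return o;
}

static void obj_get(struct obj *o)
{
	refcount_inc(&o->users);	/* saturates instead of wrapping */
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->users))	/* was: atomic_dec_and_test() */
		kfree(o);
}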
Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Boris Ostrovsky --- drivers/xen/gntdev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index c77a0751a311..f3bf8f4e2d6c 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -86,7 +87,7 @@ struct grant_map { int index; int count; int flags; - atomic_t users; + refcount_t users; struct unmap_notify notify; struct ioctl_gntdev_grant_ref *grants; struct gnttab_map_grant_ref *map_ops; @@ -166,7 +167,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) add->index = 0; add->count = count; - atomic_set(&add->users, 1); + refcount_set(&add->users, 1); return add; @@ -212,7 +213,7 @@ static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) if (!map) return; - if (!atomic_dec_and_test(&map->users)) + if (!refcount_dec_and_test(&map->users)) return; atomic_sub(map->count, &pages_mapped); @@ -400,7 +401,7 @@ static void gntdev_vma_open(struct vm_area_struct *vma) struct grant_map *map = vma->vm_private_data; pr_debug("gntdev_vma_open %p\n", vma); - atomic_inc(&map->users); + refcount_inc(&map->users); } static void gntdev_vma_close(struct vm_area_struct *vma) @@ -1004,7 +1005,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) goto unlock_out; } - atomic_inc(&map->users); + refcount_inc(&map->users); vma->vm_ops = &gntdev_vmops; From 44fee88cea43d3c2cac962e0439cb10a3cabff6d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Mar 2017 15:57:12 +0100 Subject: [PATCH 072/297] x86/tsc: Fix ART for TSC_KNOWN_FREQ Subhransu reported that convert_art_to_tsc() isn't working for him. The ART to TSC relation is only set up for systems which use the refined TSC calibration. Systems with known TSC frequency (available via CPUID 15) are not using the refined calibration and therefore the ART to TSC relation is never established. Add the setup to the known frequency init path which skips ART calibration. The init code needs to be duplicated because, for systems which use refined calibration, the ART setup must be delayed until calibration has been done. The problem has been there since the ART support was introduced, but only detected now because Subhransu tested for the first time on hardware which has TSC frequency enumerated via CPUID 15. Note for stable: The conditional has changed from TSC_RELIABLE to TSC_KNOWN_FREQUENCY. [ tglx: Rewrote changelog and identified the proper 'Fixes' commit ] Fixes: f9677e0f8308 ("x86/tsc: Always Running Timer (ART) correlated clocksource") Reported-by: "Prusty, Subhransu S" Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Cc: christopher.s.hall@intel.com Cc: kevin.b.stanton@intel.com Cc: john.stultz@linaro.org Cc: akataria@vmware.com Link: http://lkml.kernel.org/r/20170313145712.GI3312@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 4f7a9833d8e5..c73a7f9e881a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1333,6 +1333,8 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource.
*/ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } From 0d443b70cc92d741cbc1dcbf1079897b3d8bc3cc Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 7 Mar 2017 15:08:42 -0600 Subject: [PATCH 073/297] x86/platform: Remove warning message for duplicate NMI handlers Remove the WARNING message associated with multiple NMI handlers as there are at least two that are legitimate. These are the KGDB and the UV handlers and both want to be called if the NMI has not been claimed by any other NMI handler. Use of the UNKNOWN NMI call chain dramatically lowers the NMI call rate when high frequency NMI tools are in use, notably the perf tools. It is required on systems that cannot sustain a high NMI call rate without adversely affecting the system operation. Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Cc: Don Zickus Cc: Peter Zijlstra Cc: Russ Anderson Cc: Frank Ramsay Cc: Tony Ernst Link: http://lkml.kernel.org/r/20170307210841.730959611@asylum.americas.sgi.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f088ea4c66e7..a723ae9440ab 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -166,11 +166,9 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) spin_lock_irqsave(&desc->lock, flags); /* - * most handlers of type NMI_UNKNOWN never return because - * they just assume the NMI is theirs. Just a sanity check - * to manage expectations + * Indicate if there are multiple registrations on the + * internal NMI handler call chains (SERR and IO_CHECK). */ - WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); From c836e5cf0d0880d6668966476c4ffe727f29f69a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 8 Mar 2017 13:24:21 +0200 Subject: [PATCH 074/297] x86/platform/intel-mid: Use common power off sequence Intel Medfield may use the power-off sequence common to Intel MID devices. Remove the unneeded custom power-off stub. While here, remove the function forward declaration.
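The forward-declaration removal is purely a reordering; a minimal editor-added before/after sketch with hypothetical names:

struct example_ops { void (*arch_setup)(void); };

/* Before: the ops struct preceded the function, forcing a declaration. */
static void example_arch_setup(void);
static struct example_ops before_ops = { .arch_setup = example_arch_setup };
static void example_arch_setup(void) { /* ... */ }

/* After: define the function first; no forward declaration needed. */
static void example_arch_setup2(void) { /* ... */ }
static struct example_ops after_ops = { .arch_setup = example_arch_setup2 };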
Signed-off-by: Andy Shevchenko Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170308112422.67533-1-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/intel-mid/mfld.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index e793fe509971..e42978d4deaf 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -17,16 +17,6 @@ #include "intel_mid_weak_decls.h" -static void penwell_arch_setup(void); -/* penwell arch ops */ -static struct intel_mid_ops penwell_ops = { - .arch_setup = penwell_arch_setup, -}; - -static void mfld_power_off(void) -{ -} - static unsigned long __init mfld_calibrate_tsc(void) { unsigned long fast_calibrate; @@ -63,9 +53,12 @@ static unsigned long __init mfld_calibrate_tsc(void) static void __init penwell_arch_setup(void) { x86_platform.calibrate_tsc = mfld_calibrate_tsc; - pm_power_off = mfld_power_off; } +static struct intel_mid_ops penwell_ops = { + .arch_setup = penwell_arch_setup, +}; + void *get_penwell_ops(void) { return &penwell_ops; From 859bb6d59066b96341020e0991f191631cabe59d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 8 Mar 2017 13:24:22 +0200 Subject: [PATCH 075/297] x86/platform/intel-mid: Add power button support for Merrifield The Intel Merrifield platform has a Basin Cove PMIC that handles, among other things, power button events. Add the necessary bits to enable it. Signed-off-by: Andy Shevchenko Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170308112422.67533-2-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- .../platform/intel-mid/device_libs/Makefile | 1 + .../device_libs/platform_mrfld_power_btn.c | 82 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index a7dbec4dce27..3dbde04febdc 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -26,5 +26,6 @@ obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o +obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_mrfld_power_btn.o obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c new file mode 100644 index 000000000000..a6c3705a28ad --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c @@ -0,0 +1,82 @@ +/* + * Intel Merrifield power button support + * + * (C) Copyright 2017 Intel Corporation + * + * Author: Andy Shevchenko + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License.
+ */ + +#include +#include +#include +#include + +#include +#include + +static struct resource mrfld_power_btn_resources[] = { + { + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mrfld_power_btn_dev = { + .name = "msic_power_btn", + .id = PLATFORM_DEVID_NONE, + .num_resources = ARRAY_SIZE(mrfld_power_btn_resources), + .resource = mrfld_power_btn_resources, +}; + +static int mrfld_power_btn_scu_status_change(struct notifier_block *nb, + unsigned long code, void *data) +{ + if (code == SCU_DOWN) { + platform_device_unregister(&mrfld_power_btn_dev); + return 0; + } + + return platform_device_register(&mrfld_power_btn_dev); +} + +static struct notifier_block mrfld_power_btn_scu_notifier = { + .notifier_call = mrfld_power_btn_scu_status_change, +}; + +static int __init register_mrfld_power_btn(void) +{ + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return -ENODEV; + + /* + * We need to be sure that the SCU IPC is ready before + * PMIC power button device can be registered: + */ + intel_scu_notifier_add(&mrfld_power_btn_scu_notifier); + + return 0; +} +arch_initcall(register_mrfld_power_btn); + +static void __init *mrfld_power_btn_platform_data(void *info) +{ + struct resource *res = mrfld_power_btn_resources; + struct sfi_device_table_entry *pentry = info; + + res->start = res->end = pentry->irq; + return NULL; +} + +static const struct devs_id mrfld_power_btn_dev_id __initconst = { + .name = "bcove_power_btn", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .msic = 1, + .get_platform_data = &mrfld_power_btn_platform_data, +}; + +sfi_device(mrfld_power_btn_dev_id); From 6e7408acd04d06c04981c0c0fb5a2462b16fae4f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 12 Mar 2017 18:12:56 +0100 Subject: [PATCH 076/297] cpufreq: intel_pstate: Update pid_params.sample_rate_ns in pid_param_set() Fix the debugfs interface for PID tuning to actually update pid_params.sample_rate_ns on PID parameter updates, as changing pid_params.sample_rate_ms via debugfs has no effect now. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3d37219a0dd7..f9fe910f3b83 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -989,6 +989,7 @@ static void intel_pstate_update_policies(void) static int pid_param_set(void *data, u64 val) { *(u32 *)data = val; + pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC; intel_pstate_reset_all_pid(); return 0; } From be3606ff739d1c1be36389f8737c577ad87e1f57 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 13 Mar 2017 19:33:37 +0300 Subject: [PATCH 077/297] x86/kasan: Fix boot with KASAN=y and PROFILE_ANNOTATED_BRANCHES=y The kernel doesn't boot with both PROFILE_ANNOTATED_BRANCHES=y and KASAN=y options selected. With branch profiling enabled we end up calling ftrace_likely_update() before kasan_early_init(). ftrace_likely_update() is built with KASAN instrumentation, so calling it before KASAN has been initialized leads to a crash. Use the DISABLE_BRANCH_PROFILING define to make sure that we don't call ftrace_likely_update() from early code before kasan_early_init().
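The ordering matters because likely()/unlikely() are macros whose expansion is chosen at preprocessing time; with branch profiling they expand to calls into ftrace_likely_update(), which is itself KASAN-instrumented. A much-simplified editor-added model of the mechanism (not the real kernel macros):

/* Must come before any kernel headers are included. */
#define DISABLE_BRANCH_PROFILING

#ifdef DISABLE_BRANCH_PROFILING
# define likely(x)	__builtin_expect(!!(x), 1)
#else
/* Instrumented form (sketch): records the prediction outcome, which
 * would execute KASAN-instrumented code far too early during boot. */
# define likely(x)	profile_branch_and_expect(!!(x))
#endif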
Fixes: ef7f0d6a6ca8 ("x86_64: add KASan support") Reported-by: Fengguang Wu Signed-off-by: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Alexander Potapenko Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: lkp@01.org Cc: Dmitry Vyukov Link: http://lkml.kernel.org/r/20170313163337.1704-1-aryabinin@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head64.c | 1 + arch/x86/mm/kasan_init_64.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 54a2372f5dbb..b5785c197e53 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -4,6 +4,7 @@ * Copyright (C) 2000 Andrea Arcangeli SuSE */ +#define DISABLE_BRANCH_PROFILING #include #include #include diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8d63d7a104c3..4c90cfdc128b 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +#define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt #include #include From 3f8ed54aee491bbb83656592c2d0ad7b78d045ca Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 13 Mar 2017 18:30:12 -0700 Subject: [PATCH 078/297] cpufreq: intel_pstate: Correct frequency setting in the HWP mode In the function intel_pstate_hwp_set(), the min/max range from the HWP capability MSR, along with max_perf_pct and min_perf_pct, is used to set the HWP request MSR. In some cases this doesn't result in the correct HWP max/min in the HWP request. For example, in the following case: HWP capabilities from MSR 0x771: 0x70a1220. Here the cpufreq min/max frequencies from the above MSR dump are 700MHz and 3.2GHz respectively, which results in hwp_min = 0x07 and hwp_max = 0x20. To limit the max frequency to 2GHz: perf_limits->max_perf_pct = 63 (2GHz as a percent of 3.2GHz, rounded up). With the current calculation: adj_range = max_perf_pct * range / 100 = 63 * (32 - 7) / 100 = 15; max = hw_min + adj_range = 7 + 15 = 22. This results in an HWP request of 0x160f, which caps the frequency at 2.2GHz, not 2GHz. The problem with the above calculation is that hwp_min of 7 is treated as 0% of the range. But max_perf_pct is calculated with respect to a minimum of 0 and a maximum of 3.2GHz (hwp_max), so adding hwp_min to it will result in more than the desired value. Since min_perf_pct and max_perf_pct are already percentages of the max frequency (hwp_max), the min/max HWP request values can be calculated by directly applying these percentages to hwp_max. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/intel_pstate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f9fe910f3b83..ee12641ee010 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -845,7 +845,7 @@ static struct freq_attr *hwp_cpufreq_attrs[] = { static void intel_pstate_hwp_set(struct cpufreq_policy *policy) { - int min, hw_min, max, hw_max, cpu, range, adj_range; + int min, hw_min, max, hw_max, cpu; struct perf_limits *perf_limits = limits; u64 value, cap; @@ -863,20 +863,17 @@ static void intel_pstate_hwp_set(struct cpufreq_policy *policy) hw_max = HWP_GUARANTEED_PERF(cap); else hw_max = HWP_HIGHEST_PERF(cap); - range = hw_max - hw_min; max_perf_pct = perf_limits->max_perf_pct; min_perf_pct = perf_limits->min_perf_pct; + min = hw_max * min_perf_pct / 100; rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); - adj_range = min_perf_pct * range / 100; - min = hw_min + adj_range; + value &= ~HWP_MIN_PERF(~0L); value |= HWP_MIN_PERF(min); - adj_range = max_perf_pct * range / 100; - max = hw_min + adj_range; - + max = hw_max * max_perf_pct / 100; value &= ~HWP_MAX_PERF(~0L); value |= HWP_MAX_PERF(max); From 64ff64b90e62c860772fd0be50b7cfcef1d8a9b2 Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Fri, 10 Mar 2017 03:22:12 -0800 Subject: [PATCH 079/297] scsi: megaraid_sas: enable intx only if msix request fails Without this fix, the driver will enable the INTx interrupt pin even though MSI-x vectors are enabled. See the lspci output below; DisINTx is unset for the MSI-x setup. lspci -s 85:00.0 -vvv |grep INT |grep Control Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx- After applying this fix, the driver will enable the INTx interrupt pin only if the legacy interrupt method is required. See the lspci output below; DisINTx is set for the MSI-x setup. lspci -s 85:00.0 -vvv |grep INT |grep Control Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx+ Signed-off-by: Kashyap Desai Reviewed-by: Tomas Henzl Signed-off-by: Martin K.
Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 7ac9a9ee9bd4..016ffcebaf66 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5034,10 +5034,12 @@ megasas_setup_irqs_msix(struct megasas_instance *instance, u8 is_probe) &instance->irq_context[j]); /* Retry irq register for IO_APIC*/ instance->msix_vectors = 0; - if (is_probe) + if (is_probe) { + pci_free_irq_vectors(instance->pdev); return megasas_setup_irqs_ioapic(instance); - else + } else { return -1; + } } } return 0; @@ -5277,9 +5279,11 @@ static int megasas_init_fw(struct megasas_instance *instance) MPI2_REPLY_POST_HOST_INDEX_OFFSET); } - i = pci_alloc_irq_vectors(instance->pdev, 1, 1, PCI_IRQ_LEGACY); - if (i < 0) - goto fail_setup_irqs; + if (!instance->msix_vectors) { + i = pci_alloc_irq_vectors(instance->pdev, 1, 1, PCI_IRQ_LEGACY); + if (i < 0) + goto fail_setup_irqs; + } dev_info(&instance->pdev->dev, "firmware supports msix\t: (%d)", fw_msix_count); From 49524b3c6e12375627ddd870613fcc6b24909898 Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Fri, 10 Mar 2017 03:22:13 -0800 Subject: [PATCH 080/297] scsi: megaraid_sas: add correct return type check for ldio hint logic for raid1 A return value check of atomic_dec_if_positive() is required, as it returns the old value minus one. Without this fix, the driver will send small IOs down the firmware path, which is a performance issue. Not critical, but it is good to have r1_ldio_hint at its default value in the sdev private data. Signed-off-by: Kashyap Desai Signed-off-by: Shivasharan S Reviewed-by: Tomas Henzl Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 3 +++ drivers/scsi/megaraid/megaraid_sas_fusion.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 016ffcebaf66..0016f12cc563 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -1963,6 +1963,9 @@ scan_target: if (!mr_device_priv_data) return -ENOMEM; sdev->hostdata = mr_device_priv_data; + + atomic_set(&mr_device_priv_data->r1_ldio_hint, + instance->r1_ldio_hint_default); return 0; } diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index 29650ba669da..ebd746e2d97c 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -2338,7 +2338,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, fp_possible = false; atomic_dec(&instance->fw_outstanding); } else if ((scsi_buff_len > MR_LARGE_IO_MIN_SIZE) || - atomic_dec_if_positive(&mrdev_priv->r1_ldio_hint)) { + (atomic_dec_if_positive(&mrdev_priv->r1_ldio_hint) > 0)) { fp_possible = false; atomic_dec(&instance->fw_outstanding); if (scsi_buff_len > MR_LARGE_IO_MIN_SIZE) From 874d025da667d19b728141437ccbefe9dbaf9e7b Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Fri, 10 Mar 2017 03:22:14 -0800 Subject: [PATCH 081/297] scsi: megaraid_sas: raid6 also require cpuSel check same as raid5 Without this fix, raid6 performance will not be optimal. Signed-off-by: Kashyap Desai Signed-off-by: Shivasharan S Reviewed-by: Tomas Henzl Signed-off-by: Martin K.
Petersen --- drivers/scsi/megaraid/megaraid_sas_fusion.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index ebd746e2d97c..f990ab4d45e1 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -2159,7 +2159,7 @@ megasas_set_raidflag_cpu_affinity(union RAID_CONTEXT_UNION *praid_context, cpu_sel = MR_RAID_CTX_CPUSEL_1; if (is_stream_detected(rctx_g35) && - (raid->level == 5) && + ((raid->level == 5) || (raid->level == 6)) && (raid->writeMode == MR_RL_WRITE_THROUGH_MODE) && (cpu_sel == MR_RAID_CTX_CPUSEL_FCFS)) cpu_sel = MR_RAID_CTX_CPUSEL_0; From 22487b66594f18f2a7c211f3ab8bb02dec74d37b Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Fri, 10 Mar 2017 03:22:15 -0800 Subject: [PATCH 082/297] scsi: megaraid_sas: Driver version upgrade Signed-off-by: Kashyap Desai Signed-off-by: Shivasharan S Reviewed-by: Tomas Henzl Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h index e7e5974e1a2c..2b209bbb4c91 100644 --- a/drivers/scsi/megaraid/megaraid_sas.h +++ b/drivers/scsi/megaraid/megaraid_sas.h @@ -35,8 +35,8 @@ /* * MegaRAID SAS Driver meta data */ -#define MEGASAS_VERSION "07.701.16.00-rc1" -#define MEGASAS_RELDATE "February 2, 2017" +#define MEGASAS_VERSION "07.701.17.00-rc1" +#define MEGASAS_RELDATE "March 2, 2017" /* * Device IDs From 0043c1dfbec7b6e2427409059b05347d6f51aa9f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 8 Feb 2017 09:24:25 +0000 Subject: [PATCH 083/297] serial: st-asc: Use new GPIOD API to obtain RTS pin The commits mentioned below adapt the GPIO API to allow more information to be passed directly through devm_get_gpiod_from_child() in the first instance. This facilitates the removal of subsequent calls, such as gpiod_direction_output(). This patch firstly moves to utilise the new API and secondly removes the now superfluous call to set the direction.
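The shape of the new call, as used in the diff below: one helper now covers lookup, direction, and initial level (a usage sketch in kernel style, variable names taken from the driver):

struct gpio_desc *gpiod;

/* One call requests the GPIO, configures it as an output and drives it
 * low, replacing the old devm_get_gpiod_from_child() +
 * gpiod_direction_output() pair. */
gpiod = devm_fwnode_get_gpiod_from_child(port->dev, "rts", &np->fwnode,
					 GPIOD_OUT_LOW, np->name);
if (!IS_ERR(gpiod))
	ascport->rts = gpiod;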
Reported-by: Stephen Rothwell Suggested-by: Boris Brezillon Signed-off-by: Lee Jones [Also drop the header file dummies that only this driver was using] Acked-by: Greg Kroah-Hartman Signed-off-by: Linus Walleij --- drivers/tty/serial/st-asc.c | 11 ++++++----- include/linux/gpio/consumer.h | 16 ---------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/drivers/tty/serial/st-asc.c b/drivers/tty/serial/st-asc.c index bcf1d33e6ffe..c334bcc59c64 100644 --- a/drivers/tty/serial/st-asc.c +++ b/drivers/tty/serial/st-asc.c @@ -575,12 +575,13 @@ static void asc_set_termios(struct uart_port *port, struct ktermios *termios, pinctrl_select_state(ascport->pinctrl, ascport->states[NO_HW_FLOWCTRL]); - gpiod = devm_get_gpiod_from_child(port->dev, "rts", - &np->fwnode); - if (!IS_ERR(gpiod)) { - gpiod_direction_output(gpiod, 0); + gpiod = devm_fwnode_get_gpiod_from_child(port->dev, + "rts", + &np->fwnode, + GPIOD_OUT_LOW, + np->name); + if (!IS_ERR(gpiod)) ascport->rts = gpiod; - } } } diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 2484b2fcc6eb..933d93656605 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -143,15 +143,6 @@ struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev, struct fwnode_handle *child, enum gpiod_flags flags, const char *label); -/* FIXME: delete this helper when users are switched over */ -static inline struct gpio_desc *devm_get_gpiod_from_child(struct device *dev, - const char *con_id, struct fwnode_handle *child) -{ - return devm_fwnode_get_index_gpiod_from_child(dev, con_id, - 0, child, - GPIOD_ASIS, - "?"); -} #else /* CONFIG_GPIOLIB */ @@ -444,13 +435,6 @@ struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev, return ERR_PTR(-ENOSYS); } -/* FIXME: delete this when all users are switched over */ -static inline struct gpio_desc *devm_get_gpiod_from_child(struct device *dev, - const char *con_id, struct fwnode_handle *child) -{ - return ERR_PTR(-ENOSYS); -} - #endif /* CONFIG_GPIOLIB */ static inline From abf8315f71dc5a2ee56fb60830dcb2861982dc91 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Tue, 31 Jan 2017 16:18:42 +0200 Subject: [PATCH 084/297] drm/tilcdc: Fix hardcoded fail-return value in tilcdc_crtc_create() Fix the badly hardcoded return value under the fail label. All goto branches to the label set the "ret" variable accordingly. Signed-off-by: Jyri Sarha Reviewed-by: Gabriel Krisman Bertazi --- drivers/gpu/drm/tilcdc/tilcdc_crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tilcdc/tilcdc_crtc.c b/drivers/gpu/drm/tilcdc/tilcdc_crtc.c index f80bf9385e41..abcbcd9f5851 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_crtc.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_crtc.c @@ -1036,5 +1036,5 @@ int tilcdc_crtc_create(struct drm_device *dev) fail: tilcdc_crtc_destroy(crtc); - return -ENOMEM; + return ret; } From 11abbc9f39e002a2b25657e00abac8056cb39e93 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Wed, 1 Mar 2017 10:30:28 +0200 Subject: [PATCH 085/297] drm/tilcdc: Set framebuffer DMA address to HW only if CRTC is enabled Touching HW while clocks are off is a serious error and for instance breaks suspend functionality. After this patch tilcdc_crtc_update_fb() always updates the primary plane's framebuffer pointer, increases the fb's reference count, and stores the vblank event. tilcdc_crtc_update_fb() only writes the fb's DMA address to HW if the CRTC is enabled, as tilcdc_crtc_enable() takes care of writing the address on enable.
This patch also refactors tilcdc_crtc_update_fb() a bit. A number of subsequent small changes had made it almost unreadable. There should be no functional changes other than checking the CRTC's enable state. However, the locking goes a bit differently and some of the redundant checks have been removed in this new version. The enable_lock should be enough to protect the access to tilcdc_crtc->enabled. The irq_lock protects the access to last_vblank and next_fb. The check for vrefresh and last_vblank being valid is redundant, as the vrefresh should always be valid if the CRTC is enabled, and now last_vblank should be too, because it is initialized to the current time when the CRTC raster is enabled. If for some reason the values are not correctly initialized, the division-by-zero warning is quite appropriate. Signed-off-by: Jyri Sarha Reviewed-by: Tomi Valkeinen --- drivers/gpu/drm/tilcdc/tilcdc_crtc.c | 35 ++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/tilcdc/tilcdc_crtc.c b/drivers/gpu/drm/tilcdc/tilcdc_crtc.c index abcbcd9f5851..d745e8b50fb8 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_crtc.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_crtc.c @@ -464,6 +464,7 @@ static void tilcdc_crtc_enable(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; struct tilcdc_crtc *tilcdc_crtc = to_tilcdc_crtc(crtc); + unsigned long flags; WARN_ON(!drm_modeset_is_locked(&crtc->mutex)); mutex_lock(&tilcdc_crtc->enable_lock); @@ -484,7 +485,17 @@ static void tilcdc_crtc_enable(struct drm_crtc *crtc) tilcdc_write_mask(dev, LCDC_RASTER_CTRL_REG, LCDC_PALETTE_LOAD_MODE(DATA_ONLY), LCDC_PALETTE_LOAD_MODE_MASK); + + /* There is no real chance for a race here as the time stamp + * is taken before the raster DMA is started. The spin-lock is + * taken to have a memory barrier after taking the time-stamp + * and to avoid a context switch between taking the stamp and + * enabling the raster.
+ */ + spin_lock_irqsave(&tilcdc_crtc->irq_lock, flags); + tilcdc_crtc->last_vblank = ktime_get(); tilcdc_set(dev, LCDC_RASTER_CTRL_REG, LCDC_RASTER_ENABLE); + spin_unlock_irqrestore(&tilcdc_crtc->irq_lock, flags); drm_crtc_vblank_on(crtc); @@ -539,7 +550,6 @@ static void tilcdc_crtc_off(struct drm_crtc *crtc, bool shutdown) } drm_flip_work_commit(&tilcdc_crtc->unref_work, priv->wq); - tilcdc_crtc->last_vblank = 0; tilcdc_crtc->enabled = false; mutex_unlock(&tilcdc_crtc->enable_lock); @@ -602,7 +612,6 @@ int tilcdc_crtc_update_fb(struct drm_crtc *crtc, { struct tilcdc_crtc *tilcdc_crtc = to_tilcdc_crtc(crtc); struct drm_device *dev = crtc->dev; - unsigned long flags; WARN_ON(!drm_modeset_is_locked(&crtc->mutex)); @@ -614,28 +623,30 @@ int tilcdc_crtc_update_fb(struct drm_crtc *crtc, drm_framebuffer_reference(fb); crtc->primary->fb = fb; + tilcdc_crtc->event = event; - spin_lock_irqsave(&tilcdc_crtc->irq_lock, flags); + mutex_lock(&tilcdc_crtc->enable_lock); - if (crtc->hwmode.vrefresh && ktime_to_ns(tilcdc_crtc->last_vblank)) { + if (tilcdc_crtc->enabled) { + unsigned long flags; ktime_t next_vblank; s64 tdiff; - next_vblank = ktime_add_us(tilcdc_crtc->last_vblank, - 1000000 / crtc->hwmode.vrefresh); + spin_lock_irqsave(&tilcdc_crtc->irq_lock, flags); + next_vblank = ktime_add_us(tilcdc_crtc->last_vblank, + 1000000 / crtc->hwmode.vrefresh); tdiff = ktime_to_us(ktime_sub(next_vblank, ktime_get())); if (tdiff < TILCDC_VBLANK_SAFETY_THRESHOLD_US) tilcdc_crtc->next_fb = fb; + else + set_scanout(crtc, fb); + + spin_unlock_irqrestore(&tilcdc_crtc->irq_lock, flags); } - if (tilcdc_crtc->next_fb != fb) - set_scanout(crtc, fb); - - tilcdc_crtc->event = event; - - spin_unlock_irqrestore(&tilcdc_crtc->irq_lock, flags); + mutex_unlock(&tilcdc_crtc->enable_lock); return 0; } From 0977762f6d15f13caccc20d71a5dec47d098907d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 13 Mar 2017 13:44:35 -0700 Subject: [PATCH 086/297] md/r5cache: fix set_syndrome_sources() for data in cache Before this patch, devices marked InJournal will be included in prexor (SYNDROME_SRC_WANT_DRAIN) but not in reconstruct (SYNDROME_SRC_WRITTEN), which will break the parity calculation. With srctype == SYNDROME_SRC_WRITTEN, we need to include both devices with non-null ->written and devices with R5_InJournal. This fixes the logic in commit 1e6d690 ("md/r5cache: caching phase of r5cache"). Cc: stable@vger.kernel.org (v4.10+) Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6bfedfcf41c1..ed5cd705b985 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1401,7 +1401,8 @@ static int set_syndrome_sources(struct page **srcs, (test_bit(R5_Wantdrain, &dev->flags) || test_bit(R5_InJournal, &dev->flags))) || (srctype == SYNDROME_SRC_WRITTEN && - dev->written)) { + (dev->written || + test_bit(R5_InJournal, &dev->flags)))) { if (test_bit(R5_InJournal, &dev->flags)) srcs[slot] = sh->dev[i].orig_page; else From 9c62110454b088b4914ffe375c2dbc19643eac34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2017 11:51:59 -0600 Subject: [PATCH 087/297] blk-mq-sched: don't run the queue async from blk_mq_try_issue_directly() If we have scheduling enabled, we jump directly to insert-and-run. That's fine, but we run the queue async and we don't pass in information on whether we can block from this context or not. Fix up both of these cases.
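A simplified editor-added model of the fix's shape in plain C (hypothetical names): the fallback path now carries the caller's blocking context instead of forcing an async run.

#include <stdbool.h>

struct request_model { int tag; };

static bool dispatch_model(struct request_model *rq)
{
	return rq->tag >= 0;	/* pretend: issued directly to hardware */
}

static void insert_model(struct request_model *rq, bool run_async,
			 bool may_sleep)
{
	/* A caller that may sleep can run the queue synchronously here;
	 * a non-blocking caller must defer to a worker instead. */
	(void)rq; (void)run_async; (void)may_sleep;
}

/* Fallback passes the context through rather than hardcoding async. */
static void try_issue_directly_model(struct request_model *rq, bool may_sleep)
{
	if (!dispatch_model(rq))
		insert_model(rq, false, may_sleep);
}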
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 159187a28d66..a4546f060e80 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1434,7 +1434,8 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) +static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, + bool may_sleep) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { @@ -1475,7 +1476,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) } insert: - blk_mq_sched_insert_request(rq, false, true, true, false); + blk_mq_sched_insert_request(rq, false, true, false, may_sleep); } /* @@ -1569,11 +1570,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_try_issue_directly(old_rq, &cookie); + blk_mq_try_issue_directly(old_rq, &cookie, false); rcu_read_unlock(); } else { srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); - blk_mq_try_issue_directly(old_rq, &cookie); + blk_mq_try_issue_directly(old_rq, &cookie, true); srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); } goto done; From 8c53ad2139137dd4bf506a2c2b888de3816e8f75 Mon Sep 17 00:00:00 2001 From: Rex Zhu Date: Mon, 13 Mar 2017 15:14:08 +0800 Subject: [PATCH 088/297] drm/amd/powerplay: fix copy error in smu7_clockpoweragting.c Signed-off-by: Rex Zhu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/smu7_clockpowergating.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_clockpowergating.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_clockpowergating.c index 8cf71f3c6d0e..261b828ad590 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_clockpowergating.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_clockpowergating.c @@ -178,7 +178,7 @@ int smu7_powergate_vce(struct pp_hwmgr *hwmgr, bool bgate) if (bgate) { cgs_set_powergating_state(hwmgr->device, AMD_IP_BLOCK_TYPE_VCE, - AMD_PG_STATE_UNGATE); + AMD_PG_STATE_GATE); cgs_set_clockgating_state(hwmgr->device, AMD_IP_BLOCK_TYPE_VCE, AMD_CG_STATE_GATE); From 11353b9d10392e79e32603d2178e75feb25eaf0d Mon Sep 17 00:00:00 2001 From: Zhilong Liu Date: Tue, 14 Mar 2017 15:52:26 +0800 Subject: [PATCH 089/297] md/raid1: fix a trivial typo in comments raid1.c: fix a trivial typo in comments of freeze_array(). Cc: Jack Wang Cc: Guoqing Jiang Cc: John Stoffel Acked-by: Coly Li Signed-off-by: Zhilong Liu Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c33e96e33b8e..a34f58772022 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,7 +1027,7 @@ static int get_unqueued_pending(struct r1conf *conf) static void freeze_array(struct r1conf *conf, int extra) { /* Stop sync I/O and normal I/O and wait for everything to - * go quite. + * go quiet. * This is called in two situations: * 1) management command handlers (reshape, remove disk, quiesce). * 2) one normal I/O request failed. 
From dc434e056fe1dada20df7ba07f32739d3a701adf Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 14 Mar 2017 16:06:45 +0100 Subject: [PATCH 090/297] cpu/hotplug: Serialize callback invocations proper The setup/remove_state/instance() functions in the hotplug core code are serialized against concurrent CPU hotplug, but unfortunately not serialized against themselves. As a consequence a concurrent invocation of these functions results in corruption of the callback machinery because two instances try to invoke callbacks on remote cpus at the same time. This results in missing callback invocations and initiator threads waiting forever on the completion. The obvious solution to replace get_online_cpus() with cpu_hotplug_begin() is not possible because at least one callsite calls into these functions from a get_online_cpus() locked region. Extend the protection scope of the cpuhp_state_mutex from solely protecting the state arrays to cover the callback invocation machinery as well. Fixes: 5b7aa87e0482 ("cpu/hotplug: Implement setup/removal interface") Reported-and-tested-by: Bart Van Assche Signed-off-by: Sebastian Andrzej Siewior Cc: hpa@zytor.com Cc: mingo@kernel.org Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/20170314150645.g4tdyoszlcbajmna@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index f7c063239fa5..37b223e4fc05 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1335,26 +1335,21 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, struct cpuhp_step *sp; int ret = 0; - mutex_lock(&cpuhp_state_mutex); - if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) { ret = cpuhp_reserve_state(state); if (ret < 0) - goto out; + return ret; state = ret; } sp = cpuhp_get_step(state); - if (name && sp->name) { - ret = -EBUSY; - goto out; - } + if (name && sp->name) + return -EBUSY; + sp->startup.single = startup; sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); -out: - mutex_unlock(&cpuhp_state_mutex); return ret; } @@ -1428,6 +1423,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) goto add_node; @@ -1447,16 +1443,14 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, if (ret) { if (sp->teardown.multi) cpuhp_rollback_install(cpu, state, node); - goto err; + goto unlock; } } add_node: ret = 0; - mutex_lock(&cpuhp_state_mutex); hlist_add_head(node, &sp->list); +unlock: mutex_unlock(&cpuhp_state_mutex); - -err: put_online_cpus(); return ret; } @@ -1491,6 +1485,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); @@ -1524,6 +1519,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, } } out: + mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the @@ -1547,6 +1543,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); + if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* @@ -1563,7 +1561,6 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, }
remove: - mutex_lock(&cpuhp_state_mutex); hlist_del(node); mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); @@ -1571,6 +1568,7 @@ remove: return 0; } EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); + /** * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state * @state: The state to remove @@ -1589,6 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { WARN(!hlist_empty(&sp->list), "Error: Removing state %d which has instances left.\n", @@ -1613,6 +1612,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) } remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); + mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); } EXPORT_SYMBOL(__cpuhp_remove_state); From d434936e4cbb10181463622962f30b989d3e9e19 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Mar 2017 13:12:53 +0100 Subject: [PATCH 091/297] mm, x86: Fix native_pud_clear build error We still get a build error in random configurations, after this has been modified a few times: In file included from include/linux/mm.h:68:0, from include/linux/suspend.h:8, from arch/x86/kernel/asm-offsets.c:12: arch/x86/include/asm/pgtable.h:66:26: error: redefinition of 'native_pud_clear' #define pud_clear(pud) native_pud_clear(pud) My interpretation is that the build error comes from a typo in __PAGETABLE_PUD_FOLDED, so fix that typo now, and remove the incorrect #ifdef around the native_pud_clear definition. Fixes: 3e761a42e19c ("mm, x86: fix HIGHMEM64 && PARAVIRT build config for native_pud_clear()") Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages") Signed-off-by: Arnd Bergmann Acked-by: Dave Jiang Cc: Kees Cook Cc: Dave Hansen Cc: Hugh Dickins Cc: Andrew Morton Cc: Borislav Petkov Cc: Thomas Garnier Link: http://lkml.kernel.org/r/20170314121330.182155-1-arnd@arndb.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable-3level.h | 3 --- arch/x86/include/asm/pgtable.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 72277b1028a5..50d35e3185f5 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -121,12 +121,9 @@ static inline void native_pmd_clear(pmd_t *pmd) *(tmp + 1) = 0; } -#if !defined(CONFIG_SMP) || (defined(CONFIG_HIGHMEM64G) && \ - defined(CONFIG_PARAVIRT)) static inline void native_pud_clear(pud_t *pudp) { } -#endif static inline void pud_clear(pud_t *pudp) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cfb36b8c024..585ee0d42d18 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -62,7 +62,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif -#ifndef __PAGETABLE_PMD_FOLDED +#ifndef __PAGETABLE_PUD_FOLDED #define pud_clear(pud) native_pud_clear(pud) #endif From c236c8e95a3d395b0494e7108f0d41cf36ec107c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Mar 2017 10:27:18 +0100 Subject: [PATCH 092/297] futex: Fix potential use-after-free in FUTEX_REQUEUE_PI While working on the futex code, I stumbled over this potential use-after-free scenario. Dmitry triggered it later with syzkaller. pi_mutex is a pointer into pi_state, which we drop the reference on in unqueue_me_pi(). So any access to that pointer after that is bad. 
Since other sites already do rt_mutex_unlock() with hb->lock held, see for example futex_lock_pi(), simply move the unlock before unqueue_me_pi(). Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Darren Hart Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: dvhart@infradead.org Cc: bristot@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170304093558.801744246@infradead.org Signed-off-by: Thomas Gleixner --- kernel/futex.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 229a744b1781..3a4775fd7468 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2815,7 +2815,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; - struct rt_mutex *pi_mutex = NULL; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; @@ -2907,6 +2906,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, spin_unlock(q.lock_ptr); } } else { + struct rt_mutex *pi_mutex; + /* * We have been woken up by futex_unlock_pi(), a timeout, or a * signal. futex_unlock_pi() will not destroy the lock_ptr nor @@ -2930,18 +2931,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; + /* + * If fixup_pi_state_owner() faulted and was unable to handle + * the fault, unlock the rt_mutex and return the fault to + * userspace. + */ + if (ret && rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } - /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock the rt_mutex and return the fault to userspace. - */ - if (ret == -EFAULT) { - if (pi_mutex && rt_mutex_owner(pi_mutex) == current) - rt_mutex_unlock(pi_mutex); - } else if (ret == -EINTR) { + if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling * futex_lock_pi() directly. We could restart this syscall, but From 9bbb25afeb182502ca4f2c4f3f88af0681b34cae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Mar 2017 10:27:19 +0100 Subject: [PATCH 093/297] futex: Add missing error handling to FUTEX_REQUEUE_PI Thomas spotted that fixup_pi_state_owner() can return errors and we fail to unlock the rt_mutex in that case. 
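The underlying rule, releasing a lock before dropping the reference that pins the object embedding it, is easy to see in a kernel-style sketch (editor-added; it mirrors the hunk below rather than being part of it):

/* pi_mutex points *into* q->pi_state, so the unlock must happen while
 * our reference still pins pi_state; unqueue_me_pi() drops that
 * reference and may free the memory pi_mutex points at. */
static void fixup_then_unqueue(struct futex_q *q, struct rt_mutex *pi_mutex,
			       int ret)
{
	if (ret && rt_mutex_owner(pi_mutex) == current)
		rt_mutex_unlock(pi_mutex);	/* first: unlock */

	unqueue_me_pi(q);			/* then: drop the reference */
}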
Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Darren Hart Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: dvhart@infradead.org Cc: bristot@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170304093558.867401760@infradead.org Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index 3a4775fd7468..45858ec73941 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2898,6 +2898,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) + rt_mutex_unlock(&q.pi_state->pi_mutex); /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. From 87a6b2975f0d340c75b7488d22d61d2f98fb8abf Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Mar 2017 23:27:47 -0500 Subject: [PATCH 094/297] x86/unwind: Fix last frame check for aligned function stacks Pavel Machek reported the following warning on x86-32: WARNING: kernel stack frame pointer at f50cdf98 in swapper/2:0 has bad value (null) The warning is caused by the unwinder not realizing that it reached the end of the stack, due to an unusual prologue which gcc sometimes generates for aligned stacks. The prologue is based on a gcc feature called the Dynamic Realign Argument Pointer (DRAP). It's almost always enabled for aligned stacks when -maccumulate-outgoing-args isn't set. This issue is similar to the one fixed by the following commit: 8023e0e2a48d ("x86/unwind: Adjust last frame check for aligned function stacks") ... but that fix was specific to x86-64. Make the fix more generic to cover x86-32 as well, and also ensure that the return address referred to by the frame pointer is a copy of the original return address. Fixes: acb4608ad186 ("x86/unwind: Create stack frames for saved syscall registers") Reported-by: Pavel Machek Signed-off-by: Josh Poimboeuf Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/50d4924db716c264b14f1633037385ec80bf89d2.1489465609.git.jpoimboe@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/unwind_frame.c | 36 ++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 478d15dbaee4..08339262b666 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -82,19 +82,43 @@ static size_t regs_size(struct pt_regs *regs) return sizeof(*regs); } +#ifdef CONFIG_X86_32 +#define GCC_REALIGN_WORDS 3 +#else +#define GCC_REALIGN_WORDS 1 +#endif + static bool is_last_task_frame(struct unwind_state *state) { - unsigned long bp = (unsigned long)state->bp; - unsigned long regs = (unsigned long)task_pt_regs(state->task); + unsigned long *last_bp = (unsigned long *)task_pt_regs(state->task) - 2; + unsigned long *aligned_bp = last_bp - GCC_REALIGN_WORDS; /* * We have to check for the last task frame at two different locations * because gcc can occasionally decide to realign the stack pointer and - * change the offset of the stack frame by a word in the prologue of a - * function called by head/entry code. + * change the offset of the stack frame in the prologue of a function + * called by head/entry code. 
Examples: + * + * : + * push %edi + * lea 0x8(%esp),%edi + * and $0xfffffff8,%esp + * pushl -0x4(%edi) + * push %ebp + * mov %esp,%ebp + * + * : + * lea 0x8(%rsp),%r10 + * and $0xfffffffffffffff0,%rsp + * pushq -0x8(%r10) + * push %rbp + * mov %rsp,%rbp + * + * Note that after aligning the stack, it pushes a duplicate copy of + * the return address before pushing the frame pointer. */ - return bp == regs - FRAME_HEADER_SIZE || - bp == regs - FRAME_HEADER_SIZE - sizeof(long); + return (state->bp == last_bp || + (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1))); } /* From 49ec8f5b6ae3ab60385492cad900ffc8a523c895 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 14 Mar 2017 15:20:53 +0100 Subject: [PATCH 095/297] x86/intel_rdt: Put group node in rdtgroup_kn_unlock The rdtgroup_kn_unlock waits for the last user to release and put its node. But it's calling kernfs_put on the node which calls the rdtgroup_kn_unlock, which might not be the group's directory node, but another group's file node. This race could be easily reproduced by running 2 instances of following script: mount -t resctrl resctrl /sys/fs/resctrl/ pushd /sys/fs/resctrl/ mkdir krava echo "krava" > krava/schemata rmdir krava popd umount /sys/fs/resctrl It triggers the slub debug error message with following command line config: slub_debug=,kernfs_node_cache. Call kernfs_put on the group's node to fix it. Fixes: 60cf5e101fd4 ("x86/intel_rdt: Add mkdir to resctrl file system") Signed-off-by: Jiri Olsa Cc: Fenghua Yu Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Shaohua Li Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1489501253-20248-1-git-send-email-jolsa@kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index c05509d38b1f..9ac2a5cdd9c2 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -727,7 +727,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { kernfs_unbreak_active_protection(kn); - kernfs_put(kn); + kernfs_put(rdtgrp->kn); kfree(rdtgrp); } else { kernfs_unbreak_active_protection(kn); From 655d9ca9ac075da1ef2a45012ba48a39f6eb1f58 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Mar 2017 22:27:11 +0100 Subject: [PATCH 096/297] drm: amd: remove broken include path The AMD ACP driver adds "-I../acp -I../acp/include" to the gcc command line, which makes no sense, since these are evaluated relative to the build directory. When we build with "make W=1", they instead cause a warning: cc1: error: ../acp/: No such file or directory [-Werror=missing-include-dirs] cc1: error: ../acp/include: No such file or directory [-Werror=missing-include-dirs] cc1: all warnings being treated as errors ../scripts/Makefile.build:289: recipe for target 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.o' failed ../scripts/Makefile.build:289: recipe for target 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.o' failed ../scripts/Makefile.build:289: recipe for target 'drivers/gpu/drm/amd/amdgpu/amdgpu_kms.o' failed This removes the subdir-ccflags variable that evidently did not serve any purpose here. 
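If a subdirectory really did need an extra include path, the usual
kbuild idiom anchors it on the source tree rather than the build
directory, e.g. (hypothetical, not part of this patch):

	subdir-ccflags-y += -I$(srctree)/drivers/gpu/drm/amd/acp/include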
Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/acp/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/acp/Makefile b/drivers/gpu/drm/amd/acp/Makefile index 8363cb57915b..8a08e81ee90d 100644 --- a/drivers/gpu/drm/amd/acp/Makefile +++ b/drivers/gpu/drm/amd/acp/Makefile @@ -3,6 +3,4 @@ # of AMDSOC/AMDGPU drm driver. # It provides the HW control for ACP related functionalities. -subdir-ccflags-y += -I$(AMDACPPATH)/ -I$(AMDACPPATH)/include - AMD_ACP_FILES := $(AMDACPPATH)/acp_hw.o From 630a04e79dd41ff746b545d4fc052e0abb836120 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 15 Mar 2017 00:24:25 -0700 Subject: [PATCH 097/297] xfs: verify inline directory data forks When we're reading or writing the data fork of an inline directory, check the contents to make sure we're not overflowing buffers or eating garbage data. xfs/348 corrupts an inline symlink into an inline directory, triggering a buffer overflow bug. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- v2: add more checks consistent with _dir2_sf_check and make the verifier usable from anywhere. --- fs/xfs/libxfs/xfs_dir2_priv.h | 2 + fs/xfs/libxfs/xfs_dir2_sf.c | 87 ++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_fork.c | 26 ++++++++-- fs/xfs/libxfs/xfs_inode_fork.h | 2 +- fs/xfs/xfs_dir2_readdir.c | 11 ----- fs/xfs/xfs_inode.c | 12 +++-- 6 files changed, 122 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index d04547fcf274..eb00bc133bca 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -125,6 +125,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp, + int size); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index c6809ff41197..96b45cd6c63f 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -629,6 +629,93 @@ xfs_dir2_sf_check( } #endif /* DEBUG */ +/* Verify the consistency of an inline directory. */ +int +xfs_dir2_sf_verify( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int size) +{ + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next_sfep; + char *endp; + const struct xfs_dir_ops *dops; + xfs_ino_t ino; + int i; + int i8count; + int offset; + __uint8_t filetype; + + dops = xfs_dir_get_ops(mp, NULL); + + /* + * Give up if the directory is way too short. + */ + XFS_WANT_CORRUPTED_RETURN(mp, size > + offsetof(struct xfs_dir2_sf_hdr, parent)); + XFS_WANT_CORRUPTED_RETURN(mp, size >= + xfs_dir2_sf_hdr_size(sfp->i8count)); + + endp = (char *)sfp + size; + + /* Check .. entry */ + ino = dops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + offset = dops->data_first_offset; + + /* Check all reported entries */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + /* + * struct xfs_dir2_sf_entry has a variable length. + * Check the fixed-offset parts of the structure are + * within the data buffer. 
+ */ + XFS_WANT_CORRUPTED_RETURN(mp, + ((char *)sfep + sizeof(*sfep)) < endp); + + /* Don't allow names with known bad length. */ + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0); + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN); + + /* + * Check that the variable-length part of the structure is + * within the data buffer. The next entry starts after the + * name component, so nextentry is an acceptable test. + */ + next_sfep = dops->sf_nextentry(sfp, sfep); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep); + + /* Check that the offsets always increase. */ + XFS_WANT_CORRUPTED_RETURN(mp, + xfs_dir2_sf_get_offset(sfep) >= offset); + + /* Check the inode number. */ + ino = dops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + + /* Check the file type. */ + filetype = dops->sf_get_ftype(sfep); + XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX); + + offset = xfs_dir2_sf_get_offset(sfep) + + dops->data_entsize(sfep->namelen); + + sfep = next_sfep; + } + XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count); + XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp); + + /* Make sure this whole thing ought to be in local format. */ + XFS_WANT_CORRUPTED_RETURN(mp, offset + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize); + + return 0; +} + /* * Create a new (shortform) directory. */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 25c1e078aef6..9653e964eda4 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -33,6 +33,8 @@ #include "xfs_trace.h" #include "xfs_attr_sf.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" kmem_zone_t *xfs_ifork_zone; @@ -320,6 +322,7 @@ xfs_iformat_local( int whichfork, int size) { + int error; /* * If the size is unreasonable, then something @@ -336,6 +339,14 @@ xfs_iformat_local( return -EFSCORRUPTED; } + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(ip->i_mount, + (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip), + size); + if (error) + return error; + } + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -856,7 +867,7 @@ xfs_iextents_copy( * In these cases, the format always takes precedence, because the * format indicates the current state of the fork. 
*/ -void +int xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, @@ -866,6 +877,7 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; + int error; static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -874,7 +886,7 @@ xfs_iflush_fork( { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; if (!iip) - return; + return 0; ifp = XFS_IFORK_PTR(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, @@ -882,12 +894,19 @@ xfs_iflush_fork( */ if (!ifp) { ASSERT(whichfork == XFS_ATTR_FORK); - return; + return 0; } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_LOCAL: + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(mp, + (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data, + ifp->if_bytes); + if (error) + return error; + } if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); @@ -940,6 +959,7 @@ xfs_iflush_fork( ASSERT(0); break; } + return 0; } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365326d1..132dc59fdde6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -140,7 +140,7 @@ typedef struct xfs_ifork { struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); -void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, +int xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); void xfs_idata_realloc(struct xfs_inode *, int, int); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 003a99b83bd8..ad9396e516f6 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -71,22 +71,11 @@ xfs_dir2_sf_getdents( struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) - return -EFSCORRUPTED; - /* * If the block number in the offset is out of range, we're done. 
*/ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7eaf1ef74e3c..c7fe2c2123ab 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3475,6 +3475,7 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); @@ -3557,9 +3558,14 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (error) + return error; + if (XFS_IFORK_Q(ip)) { + error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + if (error) + return error; + } xfs_inobp_check(mp, bp); /* From 8af42949d1681379c1a97d230de9242c1f4f326a Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Mon, 13 Mar 2017 07:41:55 +0900 Subject: [PATCH 098/297] openrisc: xchg: fix `computed is not used` warning When building allmodconfig this warning shows. fs/ocfs2/file.c: In function 'ocfs2_file_write_iter': ./arch/openrisc/include/asm/cmpxchg.h:81:3: warning: value computed is not used [-Wunused-value] ((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), sizeof(*(ptr)))) ^ Applying the same patch logic that was done to the cmpxchg macro. Signed-off-by: Stafford Horne --- arch/openrisc/include/asm/cmpxchg.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/openrisc/include/asm/cmpxchg.h b/arch/openrisc/include/asm/cmpxchg.h index 5fcb9ac72693..f0a5d8b844d6 100644 --- a/arch/openrisc/include/asm/cmpxchg.h +++ b/arch/openrisc/include/asm/cmpxchg.h @@ -77,7 +77,11 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr, return val; } -#define xchg(ptr, with) \ - ((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), sizeof(*(ptr)))) +#define xchg(ptr, with) \ + ({ \ + (__typeof__(*(ptr))) __xchg((unsigned long)(with), \ + (ptr), \ + sizeof(*(ptr))); \ + }) #endif /* __ASM_OPENRISC_CMPXCHG_H */ From 154e67cd8e8f964809d0e75e44bb121b169c75b3 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Mon, 13 Mar 2017 07:44:45 +0900 Subject: [PATCH 099/297] openrisc: fix issue handling 8 byte get_user calls Was getting the following error with allmodconfig: ERROR: "__get_user_bad" [lib/test_user_copy.ko] undefined! This was simply a missing break statement, causing an unwanted fall through. Signed-off-by: Stafford Horne --- arch/openrisc/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index 140faa16685a..1311e6b13991 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -211,7 +211,7 @@ do { \ case 1: __get_user_asm(x, ptr, retval, "l.lbz"); break; \ case 2: __get_user_asm(x, ptr, retval, "l.lhz"); break; \ case 4: __get_user_asm(x, ptr, retval, "l.lwz"); break; \ - case 8: __get_user_asm2(x, ptr, retval); \ + case 8: __get_user_asm2(x, ptr, retval); break; \ default: (x) = __get_user_bad(); \ } \ } while (0) From 363dad58e4a0f72dce0bf12d361d617239a80317 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 14 Mar 2017 22:52:49 +0900 Subject: [PATCH 100/297] openrisc: Export symbols needed by modules This was detected by allmodconfig, errors reported: ERROR: "empty_zero_page" [net/ceph/libceph.ko] undefined! ERROR: "__ucmpdi2" [lib/842/842_decompress.ko] undefined! 
ERROR: "empty_zero_page" [fs/nfs/objlayout/objlayoutdriver.ko] undefined! ERROR: "empty_zero_page" [fs/exofs/exofs.ko] undefined! ERROR: "empty_zero_page" [fs/crypto/fscrypto.ko] undefined! ERROR: "__ucmpdi2" [fs/btrfs/btrfs.ko] undefined! ERROR: "pm_power_off" [drivers/regulator/act8865-regulator.ko] undefined! ERROR: "__ucmpdi2" [drivers/media/i2c/adv7842.ko] undefined! ERROR: "__clear_user" [drivers/md/dm-mod.ko] undefined! ERROR: "__clear_user" [net/netfilter/x_tables.ko] undefined! Signed-off-by: Stafford Horne --- arch/openrisc/kernel/or32_ksyms.c | 4 ++++ arch/openrisc/kernel/process.c | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/openrisc/kernel/or32_ksyms.c b/arch/openrisc/kernel/or32_ksyms.c index 5c4695d13542..ee3e604959e1 100644 --- a/arch/openrisc/kernel/or32_ksyms.c +++ b/arch/openrisc/kernel/or32_ksyms.c @@ -30,6 +30,7 @@ #include #include #include +#include #define DECLARE_EXPORT(name) extern void name(void); EXPORT_SYMBOL(name) @@ -42,6 +43,9 @@ DECLARE_EXPORT(__muldi3); DECLARE_EXPORT(__ashrdi3); DECLARE_EXPORT(__ashldi3); DECLARE_EXPORT(__lshrdi3); +DECLARE_EXPORT(__ucmpdi2); +EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(__copy_tofrom_user); +EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(memset); diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 828a29110459..f8da545854f9 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -90,6 +90,7 @@ void arch_cpu_idle(void) } void (*pm_power_off) (void) = machine_power_off; +EXPORT_SYMBOL(pm_power_off); /* * When a process does an "exec", machine state like FPU and debug From e4c204ced0ac25e02e58679f07096c5bac0b0d96 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 14 Mar 2017 16:18:34 +0100 Subject: [PATCH 101/297] cpufreq: intel_pstate: Avoid percentages in limits-related computations Currently, intel_pstate_update_perf_limits() first converts the policy minimum and maximum limits into percentages of the maximum turbo frequency (rounding up to an integer) and then converts these percentages to fractions (by using fixed-point arithmetic to divide them by 100). That introduces a rounding error unnecessarily, because the fractions can be obtained by carrying out fixed-point divisions directly on the input numbers. Rework the computations in intel_pstate_hwp_set() to use fractions instead of percentages (and drop redundant local variables from there) and modify intel_pstate_update_perf_limits() to compute the fractions directly and percentages out of them. While at it, introduce percent_ext_fp() for converting percentages to fractions (with extended number of fraction bits) and use it in the computations. Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 56 +++++++++++++++++----------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index ee12641ee010..08e134ffba68 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -84,6 +84,11 @@ static inline u64 div_ext_fp(u64 x, u64 y) return div64_u64(x << EXT_FRAC_BITS, y); } +static inline int32_t percent_ext_fp(int percent) +{ + return div_ext_fp(percent, 100); +} + /** * struct sample - Store performance sample * @core_avg_perf: Ratio of APERF/MPERF which is the actual average @@ -850,7 +855,6 @@ static void intel_pstate_hwp_set(struct cpufreq_policy *policy) u64 value, cap; for_each_cpu(cpu, policy->cpus) { - int max_perf_pct, min_perf_pct; struct cpudata *cpu_data = all_cpu_data[cpu]; s16 epp; @@ -864,16 +868,14 @@ static void intel_pstate_hwp_set(struct cpufreq_policy *policy) else hw_max = HWP_HIGHEST_PERF(cap); - max_perf_pct = perf_limits->max_perf_pct; - min_perf_pct = perf_limits->min_perf_pct; - min = hw_max * min_perf_pct / 100; + min = fp_ext_toint(hw_max * perf_limits->min_perf); rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); value &= ~HWP_MIN_PERF(~0L); value |= HWP_MIN_PERF(min); - max = hw_max * max_perf_pct / 100; + max = fp_ext_toint(hw_max * perf_limits->max_perf); value &= ~HWP_MAX_PERF(~0L); value |= HWP_MAX_PERF(max); @@ -1223,7 +1225,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, limits->max_perf_pct); limits->max_perf_pct = max(limits->min_perf_pct, limits->max_perf_pct); - limits->max_perf = div_ext_fp(limits->max_perf_pct, 100); + limits->max_perf = percent_ext_fp(limits->max_perf_pct); intel_pstate_update_policies(); @@ -1260,7 +1262,7 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, limits->min_perf_pct); limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); - limits->min_perf = div_ext_fp(limits->min_perf_pct, 100); + limits->min_perf = percent_ext_fp(limits->min_perf_pct); intel_pstate_update_policies(); @@ -2078,36 +2080,34 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu) static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy, struct perf_limits *limits) { + int32_t max_policy_perf, min_policy_perf; - limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, - policy->cpuinfo.max_freq); - limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100); + max_policy_perf = div_ext_fp(policy->max, policy->cpuinfo.max_freq); + max_policy_perf = clamp_t(int32_t, max_policy_perf, 0, int_ext_tofp(1)); if (policy->max == policy->min) { - limits->min_policy_pct = limits->max_policy_pct; + min_policy_perf = max_policy_perf; } else { - limits->min_policy_pct = DIV_ROUND_UP(policy->min * 100, - policy->cpuinfo.max_freq); - limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, - 0, 100); + min_policy_perf = div_ext_fp(policy->min, + policy->cpuinfo.max_freq); + min_policy_perf = clamp_t(int32_t, min_policy_perf, + 0, max_policy_perf); } - /* Normalize user input to [min_policy_pct, max_policy_pct] */ - limits->min_perf_pct = max(limits->min_policy_pct, - limits->min_sysfs_pct); - limits->min_perf_pct = min(limits->max_policy_pct, - limits->min_perf_pct); - limits->max_perf_pct = min(limits->max_policy_pct, - limits->max_sysfs_pct); - limits->max_perf_pct = max(limits->min_policy_pct, - limits->max_perf_pct); + /* Normalize user input to [min_perf, max_perf] */ + limits->min_perf = 
max(min_policy_perf, + percent_ext_fp(limits->min_sysfs_pct)); + limits->min_perf = min(limits->min_perf, max_policy_perf); + limits->max_perf = min(max_policy_perf, + percent_ext_fp(limits->max_sysfs_pct)); + limits->max_perf = max(min_policy_perf, limits->max_perf); - /* Make sure min_perf_pct <= max_perf_pct */ - limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); + /* Make sure min_perf <= max_perf */ + limits->min_perf = min(limits->min_perf, limits->max_perf); - limits->min_perf = div_ext_fp(limits->min_perf_pct, 100); - limits->max_perf = div_ext_fp(limits->max_perf_pct, 100); limits->max_perf = round_up(limits->max_perf, EXT_FRAC_BITS); limits->min_perf = round_up(limits->min_perf, EXT_FRAC_BITS); + limits->max_perf_pct = fp_ext_toint(limits->max_perf * 100); + limits->min_perf_pct = fp_ext_toint(limits->min_perf * 100); pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu, limits->max_perf_pct, limits->min_perf_pct); From 4494dbc6dec37817f2cc2aa7604039a9e87ada18 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 15 Mar 2017 22:22:08 +0800 Subject: [PATCH 102/297] netfilter: nft_ct: do cleanup work when NFTA_CT_DIRECTION is invalid We should jump to invoke __nft_ct_set_destroy() instead of just return error. Fixes: edee4f1e9245 ("netfilter: nft_ct: add zone id set support") Signed-off-by: Liping Zhang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 91585b5e5307..0264258c46fe 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -544,7 +544,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, case IP_CT_DIR_REPLY: break; default: - return -EINVAL; + err = -EINVAL; + goto err1; } } From 85b29008d8af6d94a0723aaa8d93cfb6e041158b Mon Sep 17 00:00:00 2001 From: Don Brace Date: Fri, 10 Mar 2017 14:35:11 -0600 Subject: [PATCH 103/297] scsi: hpsa: update check for logical volume status - Add in a new case for volume offline. Resolves internal testing bug for multilun array management. - Return correct status for failed TURs. Reviewed-by: Scott Benesh Reviewed-by: Scott Teel Signed-off-by: Don Brace Signed-off-by: Martin K. Petersen --- drivers/scsi/hpsa.c | 35 ++++++++++++++++------------------- drivers/scsi/hpsa_cmd.h | 2 ++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 524a0c755ed7..90b76c4c6d36 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -3714,7 +3714,7 @@ exit_failed: * # (integer code indicating one of several NOT READY states * describing why a volume is to be kept offline) */ -static int hpsa_volume_offline(struct ctlr_info *h, +static unsigned char hpsa_volume_offline(struct ctlr_info *h, unsigned char scsi3addr[]) { struct CommandList *c; @@ -3735,7 +3735,7 @@ static int hpsa_volume_offline(struct ctlr_info *h, DEFAULT_TIMEOUT); if (rc) { cmd_free(h, c); - return 0; + return HPSA_VPD_LV_STATUS_UNSUPPORTED; } sense = c->err_info->SenseInfo; if (c->err_info->SenseLen > sizeof(c->err_info->SenseInfo)) @@ -3746,19 +3746,13 @@ static int hpsa_volume_offline(struct ctlr_info *h, cmd_status = c->err_info->CommandStatus; scsi_status = c->err_info->ScsiStatus; cmd_free(h, c); - /* Is the volume 'not ready'? 
*/ - if (cmd_status != CMD_TARGET_STATUS || - scsi_status != SAM_STAT_CHECK_CONDITION || - sense_key != NOT_READY || - asc != ASC_LUN_NOT_READY) { - return 0; - } /* Determine the reason for not ready state */ ldstat = hpsa_get_volume_status(h, scsi3addr); /* Keep volume offline in certain cases: */ switch (ldstat) { + case HPSA_LV_FAILED: case HPSA_LV_UNDERGOING_ERASE: case HPSA_LV_NOT_AVAILABLE: case HPSA_LV_UNDERGOING_RPI: @@ -3780,7 +3774,7 @@ static int hpsa_volume_offline(struct ctlr_info *h, default: break; } - return 0; + return HPSA_LV_OK; } /* @@ -3853,10 +3847,10 @@ static int hpsa_update_device_info(struct ctlr_info *h, /* Do an inquiry to the device to see what it is. */ if (hpsa_scsi_do_inquiry(h, scsi3addr, 0, inq_buff, (unsigned char) OBDR_TAPE_INQ_SIZE) != 0) { - /* Inquiry failed (msg printed already) */ dev_err(&h->pdev->dev, - "hpsa_update_device_info: inquiry failed\n"); - rc = -EIO; + "%s: inquiry failed, device will be skipped.\n", + __func__); + rc = HPSA_INQUIRY_FAILED; goto bail_out; } @@ -3885,15 +3879,19 @@ static int hpsa_update_device_info(struct ctlr_info *h, if ((this_device->devtype == TYPE_DISK || this_device->devtype == TYPE_ZBC) && is_logical_dev_addr_mode(scsi3addr)) { - int volume_offline; + unsigned char volume_offline; hpsa_get_raid_level(h, scsi3addr, &this_device->raid_level); if (h->fw_support & MISC_FW_RAID_OFFLOAD_BASIC) hpsa_get_ioaccel_status(h, scsi3addr, this_device); volume_offline = hpsa_volume_offline(h, scsi3addr); - if (volume_offline < 0 || volume_offline > 0xff) - volume_offline = HPSA_VPD_LV_STATUS_UNSUPPORTED; - this_device->volume_offline = volume_offline & 0xff; + if (volume_offline == HPSA_LV_FAILED) { + rc = HPSA_LV_FAILED; + dev_err(&h->pdev->dev, + "%s: LV failed, device will be skipped.\n", + __func__); + goto bail_out; + } } else { this_device->raid_level = RAID_UNKNOWN; this_device->offload_config = 0; @@ -4379,8 +4377,7 @@ static void hpsa_update_scsi_devices(struct ctlr_info *h) goto out; } if (rc) { - dev_warn(&h->pdev->dev, - "Inquiry failed, skipping device.\n"); + h->drv_req_rescan = 1; continue; } diff --git a/drivers/scsi/hpsa_cmd.h b/drivers/scsi/hpsa_cmd.h index a584cdf07058..5961705eef76 100644 --- a/drivers/scsi/hpsa_cmd.h +++ b/drivers/scsi/hpsa_cmd.h @@ -156,6 +156,7 @@ #define CFGTBL_BusType_Fibre2G 0x00000200l /* VPD Inquiry types */ +#define HPSA_INQUIRY_FAILED 0x02 #define HPSA_VPD_SUPPORTED_PAGES 0x00 #define HPSA_VPD_LV_DEVICE_ID 0x83 #define HPSA_VPD_LV_DEVICE_GEOMETRY 0xC1 @@ -166,6 +167,7 @@ /* Logical volume states */ #define HPSA_VPD_LV_STATUS_UNSUPPORTED 0xff #define HPSA_LV_OK 0x0 +#define HPSA_LV_FAILED 0x01 #define HPSA_LV_NOT_AVAILABLE 0x0b #define HPSA_LV_UNDERGOING_ERASE 0x0F #define HPSA_LV_UNDERGOING_RPI 0x12 From 87b9e6aa87d9411f1059aa245c0c79976bc557ac Mon Sep 17 00:00:00 2001 From: Don Brace Date: Fri, 10 Mar 2017 14:35:17 -0600 Subject: [PATCH 104/297] scsi: hpsa: limit outstanding rescans Avoid rescan storms. No need to queue another if one is pending. Reviewed-by: Scott Benesh Reviewed-by: Scott Teel Reviewed-by: Tomas Henzl Signed-off-by: Don Brace Signed-off-by: Martin K. 
Petersen --- drivers/scsi/hpsa.c | 16 +++++++++++++++- drivers/scsi/hpsa.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 90b76c4c6d36..0a8ac68f4ca1 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -5555,7 +5555,7 @@ static void hpsa_scan_complete(struct ctlr_info *h) spin_lock_irqsave(&h->scan_lock, flags); h->scan_finished = 1; - wake_up_all(&h->scan_wait_queue); + wake_up(&h->scan_wait_queue); spin_unlock_irqrestore(&h->scan_lock, flags); } @@ -5573,11 +5573,23 @@ static void hpsa_scan_start(struct Scsi_Host *sh) if (unlikely(lockup_detected(h))) return hpsa_scan_complete(h); + /* + * If a scan is already waiting to run, no need to add another + */ + spin_lock_irqsave(&h->scan_lock, flags); + if (h->scan_waiting) { + spin_unlock_irqrestore(&h->scan_lock, flags); + return; + } + + spin_unlock_irqrestore(&h->scan_lock, flags); + /* wait until any scan already in progress is finished. */ while (1) { spin_lock_irqsave(&h->scan_lock, flags); if (h->scan_finished) break; + h->scan_waiting = 1; spin_unlock_irqrestore(&h->scan_lock, flags); wait_event(h->scan_wait_queue, h->scan_finished); /* Note: We don't need to worry about a race between this @@ -5587,6 +5599,7 @@ static void hpsa_scan_start(struct Scsi_Host *sh) */ } h->scan_finished = 0; /* mark scan as in progress */ + h->scan_waiting = 0; spin_unlock_irqrestore(&h->scan_lock, flags); if (unlikely(lockup_detected(h))) @@ -8789,6 +8802,7 @@ reinit_after_soft_reset: init_waitqueue_head(&h->event_sync_wait_queue); mutex_init(&h->reset_mutex); h->scan_finished = 1; /* no scan currently in progress */ + h->scan_waiting = 0; pci_set_drvdata(pdev, h); h->ndevices = 0; diff --git a/drivers/scsi/hpsa.h b/drivers/scsi/hpsa.h index bf6cdc106654..6f04f2ad4125 100644 --- a/drivers/scsi/hpsa.h +++ b/drivers/scsi/hpsa.h @@ -201,6 +201,7 @@ struct ctlr_info { dma_addr_t errinfo_pool_dhandle; unsigned long *cmd_pool_bits; int scan_finished; + u8 scan_waiting : 1; spinlock_t scan_lock; wait_queue_head_t scan_wait_queue; From 2ef2884980873081a4edae92f9d88dd580c85f6e Mon Sep 17 00:00:00 2001 From: Don Brace Date: Fri, 10 Mar 2017 14:35:23 -0600 Subject: [PATCH 105/297] scsi: hpsa: do not timeout reset operations Resets can take longer than DEFAULT_TIMEOUT. Reviewed-by: Scott Benesh Reviewed-by: Scott Teel Reviewed-by: Tomas Henzl Signed-off-by: Don Brace Signed-off-by: Martin K. Petersen --- drivers/scsi/hpsa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 0a8ac68f4ca1..0d0be7754a65 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -2956,7 +2956,7 @@ static int hpsa_send_reset(struct ctlr_info *h, unsigned char *scsi3addr, /* fill_cmd can't fail here, no data buffer to map. 
*/ (void) fill_cmd(c, reset_type, h, NULL, 0, 0, scsi3addr, TYPE_MSG); - rc = hpsa_scsi_do_simple_cmd(h, c, reply_queue, DEFAULT_TIMEOUT); + rc = hpsa_scsi_do_simple_cmd(h, c, reply_queue, NO_TIMEOUT); if (rc) { dev_warn(&h->pdev->dev, "Failed to send reset command\n"); goto out; From 949d7fa158b2b1af533bdb1af0dda8ab103ac58d Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Sun, 12 Mar 2017 12:22:02 +0200 Subject: [PATCH 106/297] scsi: ufs: don't check unsigned type for a negative value Fix compilation warning: drivers/scsi/ufs/ufshcd.c:7645:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if ((value < UFS_PM_LVL_0) || (value >= UFS_PM_LVL_MAX)) Signed-off-by: Tomas Winkler Reviewed-by: Subhash Jadavani Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 1359913bf840..e8c26e6e6237 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7642,7 +7642,7 @@ static inline ssize_t ufshcd_pm_lvl_store(struct device *dev, if (kstrtoul(buf, 0, &value)) return -EINVAL; - if ((value < UFS_PM_LVL_0) || (value >= UFS_PM_LVL_MAX)) + if (value >= UFS_PM_LVL_MAX) return -EINVAL; spin_lock_irqsave(hba->host->host_lock, flags); From 7d7080335f8d93a51e8238b6e85be8af4ba452b6 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 8 Mar 2017 14:36:01 -0800 Subject: [PATCH 107/297] scsi: lpfc: Finalize Kconfig options for nvme Reviewing the result of what was just added for Kconfig, we made a poor choice. It worked well for full kernel builds, but not so much for how it would be deployed on a distro. Here's the final result: - lpfc will compile in NVME initiator and/or NVME target support based on whether the kernel has the corresponding subsystem support. Kconfig is not used to drive this specifically for lpfc. - There is a module parameter, lpfc_enable_fc4_type, that indicates whether the ports will do FCP-only or FCP & NVME (NVME-only not yet possible due to dependency on fc transport). As FCP & NVME divvys up exchange resources, and given NVME will not be often initially, the default is changed to FCP only. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/Kconfig | 14 -------------- drivers/scsi/lpfc/lpfc_attr.c | 4 ++-- drivers/scsi/lpfc/lpfc_init.c | 7 +++++++ drivers/scsi/lpfc/lpfc_nvme.c | 8 ++++---- drivers/scsi/lpfc/lpfc_nvmet.c | 8 ++++---- 5 files changed, 17 insertions(+), 24 deletions(-) diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 4bf55b5d78be..3c52867dfe28 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1253,20 +1253,6 @@ config SCSI_LPFC_DEBUG_FS This makes debugging information from the lpfc driver available via the debugfs filesystem. -config LPFC_NVME_INITIATOR - bool "Emulex LightPulse Fibre Channel NVME Initiator Support" - depends on SCSI_LPFC && NVME_FC - ---help--- - This enables NVME Initiator support in the Emulex lpfc driver. - -config LPFC_NVME_TARGET - bool "Emulex LightPulse Fibre Channel NVME Initiator Support" - depends on SCSI_LPFC && NVME_TARGET_FC - ---help--- - This enables NVME Target support in the Emulex lpfc driver. - Target enablement must still be enabled on a per adapter - basis by module parameters. 
- config SCSI_SIM710 tristate "Simple 53c710 SCSI support (Compaq, NCR machines)" depends on (EISA || MCA) && SCSI diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index fbd3a563be53..84aa62f1a4de 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -3315,9 +3315,9 @@ LPFC_ATTR_R(nvmet_mrq_post, LPFC_DEF_MRQ_POST, * lpfc_enable_fc4_type: Defines what FC4 types are supported. * Supported Values: 1 - register just FCP * 3 - register both FCP and NVME - * Supported values are [1,3]. Default value is 3 + * Supported values are [1,3]. Default value is 1 */ -LPFC_ATTR_R(enable_fc4_type, LPFC_ENABLE_BOTH, +LPFC_ATTR_R(enable_fc4_type, LPFC_ENABLE_FCP, LPFC_ENABLE_FCP, LPFC_ENABLE_BOTH, "Define fc4 type to register with fabric."); diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 2697d49da4d7..6cc561b04211 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -5891,10 +5891,17 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) /* Check to see if it matches any module parameter */ for (i = 0; i < lpfc_enable_nvmet_cnt; i++) { if (wwn == lpfc_enable_nvmet[i]) { +#if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "6017 NVME Target %016llx\n", wwn); phba->nvmet_support = 1; /* a match */ +#else + lpfc_printf_log(phba, KERN_ERR, LOG_INIT, + "6021 Can't enable NVME Target." + " NVME_TARGET_FC infrastructure" + " is not in kernel\n"); +#endif } } } diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 0a4c19081409..0024de1c6c1f 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -2149,7 +2149,7 @@ lpfc_nvme_create_localport(struct lpfc_vport *vport) /* localport is allocated from the stack, but the registration * call allocates heap memory as well as the private area. 
*/ -#ifdef CONFIG_LPFC_NVME_INITIATOR +#if (IS_ENABLED(CONFIG_NVME_FC)) ret = nvme_fc_register_localport(&nfcp_info, &lpfc_nvme_template, &vport->phba->pcidev->dev, &localport); #else @@ -2190,7 +2190,7 @@ lpfc_nvme_create_localport(struct lpfc_vport *vport) void lpfc_nvme_destroy_localport(struct lpfc_vport *vport) { -#ifdef CONFIG_LPFC_NVME_INITIATOR +#if (IS_ENABLED(CONFIG_NVME_FC)) struct nvme_fc_local_port *localport; struct lpfc_nvme_lport *lport; struct lpfc_nvme_rport *rport = NULL, *rport_next = NULL; @@ -2274,7 +2274,7 @@ lpfc_nvme_update_localport(struct lpfc_vport *vport) int lpfc_nvme_register_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) { -#ifdef CONFIG_LPFC_NVME_INITIATOR +#if (IS_ENABLED(CONFIG_NVME_FC)) int ret = 0; struct nvme_fc_local_port *localport; struct lpfc_nvme_lport *lport; @@ -2403,7 +2403,7 @@ lpfc_nvme_register_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) void lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) { -#ifdef CONFIG_LPFC_NVME_INITIATOR +#if (IS_ENABLED(CONFIG_NVME_FC)) int ret; struct nvme_fc_local_port *localport; struct lpfc_nvme_lport *lport; diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index b7739a554fe0..7ca868f394da 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -671,7 +671,7 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) lpfc_tgttemplate.target_features = NVMET_FCTGTFEAT_READDATA_RSP | NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED; -#ifdef CONFIG_LPFC_NVME_TARGET +#if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) error = nvmet_fc_register_targetport(&pinfo, &lpfc_tgttemplate, &phba->pcidev->dev, &phba->targetport); @@ -756,7 +756,7 @@ lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, void lpfc_nvmet_destroy_targetport(struct lpfc_hba *phba) { -#ifdef CONFIG_LPFC_NVME_TARGET +#if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) struct lpfc_nvmet_tgtport *tgtp; if (phba->nvmet_support == 0) @@ -788,7 +788,7 @@ static void lpfc_nvmet_unsol_ls_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, struct hbq_dmabuf *nvmebuf) { -#ifdef CONFIG_LPFC_NVME_TARGET +#if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) struct lpfc_nvmet_tgtport *tgtp; struct fc_frame_header *fc_hdr; struct lpfc_nvmet_rcv_ctx *ctxp; @@ -891,7 +891,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, struct rqb_dmabuf *nvmebuf, uint64_t isr_timestamp) { -#ifdef CONFIG_LPFC_NVME_TARGET +#if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) struct lpfc_nvmet_rcv_ctx *ctxp; struct lpfc_nvmet_tgtport *tgtp; struct fc_frame_header *fc_hdr; From fe8daf5fa715f7214952f06a387e4b7de818c5be Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Wed, 15 Mar 2017 13:47:50 +0900 Subject: [PATCH 108/297] fjes: Fix wrong netdevice feature flags This patch fixes netdev->features for Extended Socket network device. Currently Extended Socket network device's netdev->feature claims NETIF_F_HW_CSUM, however this is completely wrong. There's no feature of checksum offloading. That causes invalid TCP/UDP checksum and packet rejection when IP forwarding from Extended Socket network device to other network device. NETIF_F_HW_CSUM should be omitted. Signed-off-by: Taku Izumi Signed-off-by: David S. 
Miller --- drivers/net/fjes/fjes_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index b75d9cdcfb0c..c4b3c4b77a9c 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -1316,7 +1316,7 @@ static void fjes_netdev_setup(struct net_device *netdev) netdev->min_mtu = fjes_support_mtu[0]; netdev->max_mtu = fjes_support_mtu[3]; netdev->flags |= IFF_BROADCAST; - netdev->features |= NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_FILTER; + netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } static void fjes_irq_watch_task(struct work_struct *work) From 5f655322b1ba4bd46e26e307d04098f9c84df764 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 14 Mar 2017 11:47:29 -0400 Subject: [PATCH 109/297] parisc: support R_PARISC_SECREL32 relocation in modules The parisc kernel doesn't work with CONFIG_MODVERSIONS since the commit 71810db27c1c853b335675bee335d893bc3d324b. It can't load modules with the error: "module unix: Unknown relocation: 41". The commit changes __kcrctab from 64-bit valus to 32-bit values. The assembler generates R_PARISC_SECREL32 secrel relocation for them and the module loader doesn't support this relocation. This patch adds the R_PARISC_SECREL32 relocation to the module loader. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Helge Deller --- arch/parisc/kernel/module.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index a0ecdb4abcc8..c66c943d9322 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -620,6 +620,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, */ *loc = fsel(val, addend); break; + case R_PARISC_SECREL32: + /* 32-bit section relative address. */ + *loc = fsel(val, addend); + break; case R_PARISC_DPREL21L: /* left 21 bit of relative address */ val = lrsel(val - dp, addend); @@ -807,6 +811,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, */ *loc = fsel(val, addend); break; + case R_PARISC_SECREL32: + /* 32-bit section relative address. */ + *loc = fsel(val, addend); + break; case R_PARISC_FPTR64: /* 64-bit function address */ if(in_local(me, (void *)(val + addend))) { From 316ec0624f951166daedbe446988ef92ae72b59f Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sat, 11 Mar 2017 18:03:34 -0500 Subject: [PATCH 110/297] parisc: Optimize flush_kernel_vmap_range and invalidate_kernel_vmap_range The previously submitted patch did not resolve the random segmentation faults observed on the phantom buildd system. There are still unresolved problems with the Debian 4.8 and 4.9 kernels on C8000. The attached patch removes the flush of the offset map pages and does a whole data cache flush for large ranges. No other arch flushes the offset map in these routines as far as I can tell. I have not observed any random segmentation faults on rp3440 in two weeks of testing with 4.10.0 and 4.10.1. 
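For context, the usual caller pattern for these two interfaces looks
roughly like this (a sketch, not code from this patch):

	void *vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);

	/* CPU wrote through the vmap alias; push it out before the
	 * physical pages are read (e.g. by a device): */
	flush_kernel_vmap_range(vaddr, len);

	/* Physical pages were written behind our back; drop any stale
	 * cachelines before reading through the vmap alias: */
	invalidate_kernel_vmap_range(vaddr, len);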
Signed-off-by: John David Anglin Cc: stable@vger.kernel.org # v4.8+ Signed-off-by: Helge Deller --- arch/parisc/include/asm/cacheflush.h | 23 ++--------------------- arch/parisc/kernel/cache.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 19c9c3c5f267..c7e15cc5c668 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -43,28 +43,9 @@ static inline void flush_kernel_dcache_page(struct page *page) #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); -/* vmap range flushes and invalidates. Architecturally, we don't need - * the invalidate, because the CPU should refuse to speculate once an - * area has been flushed, so invalidate is left empty */ -static inline void flush_kernel_vmap_range(void *vaddr, int size) -{ - unsigned long start = (unsigned long)vaddr; - flush_kernel_dcache_range_asm(start, start + size); -} -static inline void invalidate_kernel_vmap_range(void *vaddr, int size) -{ - unsigned long start = (unsigned long)vaddr; - void *cursor = vaddr; - - for ( ; cursor < vaddr + size; cursor += PAGE_SIZE) { - struct page *page = vmalloc_to_page(cursor); - - if (test_and_clear_bit(PG_dcache_dirty, &page->flags)) - flush_kernel_dcache_page(page); - } - flush_kernel_dcache_range_asm(start, start + size); -} +void flush_kernel_vmap_range(void *vaddr, int size); +void invalidate_kernel_vmap_range(void *vaddr, int size); #define flush_cache_vmap(start, end) flush_cache_all() #define flush_cache_vunmap(start, end) flush_cache_all() diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 0dc72d5de861..c32a09095216 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -616,3 +616,25 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); } } + +void flush_kernel_vmap_range(void *vaddr, int size) +{ + unsigned long start = (unsigned long)vaddr; + + if ((unsigned long)size > parisc_cache_flush_threshold) + flush_data_cache(); + else + flush_kernel_dcache_range_asm(start, start + size); +} +EXPORT_SYMBOL(flush_kernel_vmap_range); + +void invalidate_kernel_vmap_range(void *vaddr, int size) +{ + unsigned long start = (unsigned long)vaddr; + + if ((unsigned long)size > parisc_cache_flush_threshold) + flush_data_cache(); + else + flush_kernel_dcache_range_asm(start, start + size); +} +EXPORT_SYMBOL(invalidate_kernel_vmap_range); From 63d32d1e09cb2fc65b084b261976c06b40d19115 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 15 Mar 2017 21:10:17 +0100 Subject: [PATCH 111/297] parisc: Wire up statx system call Signed-off-by: Helge Deller --- arch/parisc/include/uapi/asm/unistd.h | 3 ++- arch/parisc/kernel/syscall_table.S | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/parisc/include/uapi/asm/unistd.h b/arch/parisc/include/uapi/asm/unistd.h index 6b0741e7a7ed..667c99421003 100644 --- a/arch/parisc/include/uapi/asm/unistd.h +++ b/arch/parisc/include/uapi/asm/unistd.h @@ -362,8 +362,9 @@ #define __NR_copy_file_range (__NR_Linux + 346) #define __NR_preadv2 (__NR_Linux + 347) #define __NR_pwritev2 (__NR_Linux + 348) +#define __NR_statx (__NR_Linux + 349) -#define __NR_Linux_syscalls (__NR_pwritev2 + 1) +#define __NR_Linux_syscalls (__NR_statx + 1) #define __IGNORE_select /* newselect */ diff --git a/arch/parisc/kernel/syscall_table.S 
b/arch/parisc/kernel/syscall_table.S index 3cfef1de8061..44aeaa9c039f 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -444,6 +444,7 @@ ENTRY_SAME(copy_file_range) ENTRY_COMP(preadv2) ENTRY_COMP(pwritev2) + ENTRY_SAME(statx) .ifne (. - 90b) - (__NR_Linux_syscalls * (91b - 90b)) From 0f424de1fd9bc4ab24bd1fe5430ab5618e803e31 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 14 Mar 2017 14:42:03 -0400 Subject: [PATCH 112/297] drm/radeon/si: add dpm quirk for Oland MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OLAND 0x1002:0x6604 0x1028:0x066F 0x00 seems to have problems with higher sclks. Acked-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/si_dpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index d12b8978142f..72e1588580a1 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -2984,6 +2984,12 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, (rdev->pdev->device == 0x6667)) { max_sclk = 75000; } + } else if (rdev->family == CHIP_OLAND) { + if ((rdev->pdev->device == 0x6604) && + (rdev->pdev->subsystem_vendor == 0x1028) && + (rdev->pdev->subsystem_device == 0x066F)) { + max_sclk = 75000; + } } if (rps->vce_active) { From 18a8de1bc37e97dff1c96ee6cf49adbd02a0f775 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 14 Mar 2017 19:24:19 -0400 Subject: [PATCH 113/297] drm/amdgpu/si: add dpm quirk for Oland MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OLAND 0x1002:0x6604 0x1028:0x066F 0x00 seems to have problems with higher sclks. Acked-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/si_dpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/si_dpm.c b/drivers/gpu/drm/amd/amdgpu/si_dpm.c index f55e45b52fbc..33b504bafb88 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dpm.c @@ -3464,6 +3464,12 @@ static void si_apply_state_adjust_rules(struct amdgpu_device *adev, (adev->pdev->device == 0x6667)) { max_sclk = 75000; } + } else if (adev->asic_type == CHIP_OLAND) { + if ((adev->pdev->device == 0x6604) && + (adev->pdev->subsystem_vendor == 0x1028) && + (adev->pdev->subsystem_device == 0x066F)) { + max_sclk = 75000; + } } if (rps->vce_active) { From 801a6aa9a63c90724e8899982ad8c7f16be1e2cd Mon Sep 17 00:00:00 2001 From: Tom St Denis Date: Wed, 15 Mar 2017 05:34:25 -0400 Subject: [PATCH 114/297] drm/amd/amdgpu: Fix debugfs reg read/write address width MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MMIO space is wider now so we mask the lower 22 bits instead of 18. 
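Spelled out, the old and new masks are:

	(1UL << 18) - 1 == 0x3FFFF	/* old: 256 KiB of MMIO offsets */
	(1UL << 22) - 1 == 0x3FFFFF	/* new:   4 MiB of MMIO offsets */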
Signed-off-by: Tom St Denis Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4120b351a8e5..a3a105ec99e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2590,7 +2590,7 @@ static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user *buf, use_bank = 0; } - *pos &= 0x3FFFF; + *pos &= (1UL << 22) - 1; if (use_bank) { if ((sh_bank != 0xFFFFFFFF && sh_bank >= adev->gfx.config.max_sh_per_se) || @@ -2666,7 +2666,7 @@ static ssize_t amdgpu_debugfs_regs_write(struct file *f, const char __user *buf, use_bank = 0; } - *pos &= 0x3FFFF; + *pos &= (1UL << 22) - 1; if (use_bank) { if ((sh_bank != 0xFFFFFFFF && sh_bank >= adev->gfx.config.max_sh_per_se) || From 186ecf14e58befba434f0774eea89e35f64d3c6a Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 15 Mar 2017 21:48:42 +0100 Subject: [PATCH 115/297] parisc: Avoid compiler warnings with access_ok() Commit 09b871ffd4d8 (parisc: Define access_ok() as macro) missed to mark uaddr as used, which then gives compiler warnings about unused variables. Fix it by comparing uaddr to uaddr which then gets optimized away by the compiler. Signed-off-by: Helge Deller Fixes: 09b871ffd4d8 ("parisc: Define access_ok() as macro") --- arch/parisc/include/asm/uaccess.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index fb4382c28259..edfbf9d6a6dd 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -32,7 +32,8 @@ * that put_user is the same as __put_user, etc. */ -#define access_ok(type, uaddr, size) (1) +#define access_ok(type, uaddr, size) \ + ( (uaddr) == (uaddr) ) #define put_user __put_user #define get_user __get_user From 3d20f1f7bd575d147ffa75621fa560eea0aec690 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 15 Mar 2017 18:10:47 +0200 Subject: [PATCH 116/297] net/openvswitch: Set the ipv6 source tunnel key address attribute correctly When dealing with ipv6 source tunnel key address attribute (OVS_TUNNEL_KEY_ATTR_IPV6_SRC) we are wrongly setting the tunnel dst ip, fix that. Fixes: 6b26ba3a7d95 ('openvswitch: netlink attributes for IPv6 tunneling') Signed-off-by: Or Gerlitz Reported-by: Paul Blakey Acked-by: Jiri Benc Acked-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 6f5fa50f716d..a08ff834676b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -604,7 +604,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, ipv4 = true; break; case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: - SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src, nla_get_in6_addr(a), is_mask); ipv6 = true; break; From 88d339e2d3be955848a034970970931a7ed33956 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 15 Mar 2017 18:39:46 +0100 Subject: [PATCH 117/297] MAINTAINERS: remove MACVLAN and VLAN entries macvlan.c file seems to be both in VLAN and MACVLAN DRIVER, so remove the MACVLAN DRIVER since this is redundant. I propose with this patch to remove the VLAN (802.1Q) entry so this just falls into the NETWORKING [GENERAL]. 
Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- MAINTAINERS | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c776906f67a9..33875e07482b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7774,13 +7774,6 @@ F: include/net/mac80211.h F: net/mac80211/ F: drivers/net/wireless/mac80211_hwsim.[ch] -MACVLAN DRIVER -M: Patrick McHardy -L: netdev@vger.kernel.org -S: Maintained -F: drivers/net/macvlan.c -F: include/linux/if_macvlan.h - MAILBOX API M: Jassi Brar L: linux-kernel@vger.kernel.org @@ -13383,14 +13376,6 @@ W: https://linuxtv.org S: Maintained F: drivers/media/platform/vivid/* -VLAN (802.1Q) -M: Patrick McHardy -L: netdev@vger.kernel.org -S: Maintained -F: drivers/net/macvlan.c -F: include/linux/if_*vlan.h -F: net/8021q/ - VLYNQ BUS M: Florian Fainelli L: openwrt-devel@lists.openwrt.org (subscribers-only) From 5371bbf4b295eea334ed453efa286afa2c3ccff3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 15 Mar 2017 12:57:21 -0700 Subject: [PATCH 118/297] net: bcmgenet: Do not suspend PHY if Wake-on-LAN is enabled Suspending the PHY would be putting it in a low power state where it may no longer allow us to do Wake-on-LAN. Fixes: cc013fb48898 ("net: bcmgenet: correctly suspend and resume PHY device") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 69015fa50f20..365895ed3c3e 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3481,7 +3481,8 @@ static int bcmgenet_suspend(struct device *d) bcmgenet_netif_stop(dev); - phy_suspend(priv->phydev); + if (!device_may_wakeup(d)) + phy_suspend(priv->phydev); netif_device_detach(dev); @@ -3578,7 +3579,8 @@ static int bcmgenet_resume(struct device *d) netif_device_attach(dev); - phy_resume(priv->phydev); + if (!device_may_wakeup(d)) + phy_resume(priv->phydev); if (priv->eee.eee_enabled) bcmgenet_eee_enable_set(dev, true); From 622c36f143fc9566ba49d7cec994c2da1182d9e2 Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Wed, 15 Mar 2017 15:11:23 -0500 Subject: [PATCH 119/297] amd-xgbe: Fix jumbo MTU processing on newer hardware Newer hardware does not provide a cumulative payload length when multiple descriptors are needed to handle the data. Once the MTU increases beyond the size that can be handled by a single descriptor, the SKB does not get built properly by the driver. The driver will now calculate the size of the data buffers used by the hardware. The first buffer of the first descriptor is for packet headers or packet headers and data when the headers can't be split. Subsequent descriptors in a multi-descriptor chain will not use the first buffer. The second buffer is used by all the descriptors in the chain for payload data. Based on whether the driver is processing the first, intermediate, or last descriptor it can calculate the buffer usage and build the SKB properly. Tested and verified on both old and new hardware. Signed-off-by: Tom Lendacky Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/xgbe/xgbe-common.h | 6 +- drivers/net/ethernet/amd/xgbe/xgbe-dev.c | 20 ++-- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 102 ++++++++++++-------- 3 files changed, 78 insertions(+), 50 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index 8a280e7d66bd..86f1626816ff 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -1148,8 +1148,8 @@ #define RX_PACKET_ATTRIBUTES_CSUM_DONE_WIDTH 1 #define RX_PACKET_ATTRIBUTES_VLAN_CTAG_INDEX 1 #define RX_PACKET_ATTRIBUTES_VLAN_CTAG_WIDTH 1 -#define RX_PACKET_ATTRIBUTES_INCOMPLETE_INDEX 2 -#define RX_PACKET_ATTRIBUTES_INCOMPLETE_WIDTH 1 +#define RX_PACKET_ATTRIBUTES_LAST_INDEX 2 +#define RX_PACKET_ATTRIBUTES_LAST_WIDTH 1 #define RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_INDEX 3 #define RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_WIDTH 1 #define RX_PACKET_ATTRIBUTES_CONTEXT_INDEX 4 @@ -1158,6 +1158,8 @@ #define RX_PACKET_ATTRIBUTES_RX_TSTAMP_WIDTH 1 #define RX_PACKET_ATTRIBUTES_RSS_HASH_INDEX 6 #define RX_PACKET_ATTRIBUTES_RSS_HASH_WIDTH 1 +#define RX_PACKET_ATTRIBUTES_FIRST_INDEX 7 +#define RX_PACKET_ATTRIBUTES_FIRST_WIDTH 1 #define RX_NORMAL_DESC0_OVT_INDEX 0 #define RX_NORMAL_DESC0_OVT_WIDTH 16 diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c index 937f37a5dcb2..24a687ce4388 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c @@ -1896,10 +1896,15 @@ static int xgbe_dev_read(struct xgbe_channel *channel) /* Get the header length */ if (XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, FD)) { + XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, + FIRST, 1); rdata->rx.hdr_len = XGMAC_GET_BITS_LE(rdesc->desc2, RX_NORMAL_DESC2, HL); if (rdata->rx.hdr_len) pdata->ext_stats.rx_split_header_packets++; + } else { + XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, + FIRST, 0); } /* Get the RSS hash */ @@ -1922,19 +1927,16 @@ static int xgbe_dev_read(struct xgbe_channel *channel) } } - /* Get the packet length */ - rdata->rx.len = XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, PL); - - if (!XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, LD)) { - /* Not all the data has been transferred for this packet */ - XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, - INCOMPLETE, 1); + /* Not all the data has been transferred for this packet */ + if (!XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, LD)) return 0; - } /* This is the last of the data for this packet */ XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, - INCOMPLETE, 0); + LAST, 1); + + /* Get the packet length */ + rdata->rx.len = XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, PL); /* Set checksum done indicator as appropriate */ if (netdev->features & NETIF_F_RXCSUM) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index ffea9859f5a7..a713abd9d03e 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1971,13 +1971,12 @@ static struct sk_buff *xgbe_create_skb(struct xgbe_prv_data *pdata, { struct sk_buff *skb; u8 *packet; - unsigned int copy_len; skb = napi_alloc_skb(napi, rdata->rx.hdr.dma_len); if (!skb) return NULL; - /* Start with the header buffer which may contain just the header + /* Pull in the header buffer which may contain just the header * or the header plus data */ dma_sync_single_range_for_cpu(pdata->dev, rdata->rx.hdr.dma_base, @@ 
-1986,30 +1985,49 @@ static struct sk_buff *xgbe_create_skb(struct xgbe_prv_data *pdata, packet = page_address(rdata->rx.hdr.pa.pages) + rdata->rx.hdr.pa.pages_offset; - copy_len = (rdata->rx.hdr_len) ? rdata->rx.hdr_len : len; - copy_len = min(rdata->rx.hdr.dma_len, copy_len); - skb_copy_to_linear_data(skb, packet, copy_len); - skb_put(skb, copy_len); - - len -= copy_len; - if (len) { - /* Add the remaining data as a frag */ - dma_sync_single_range_for_cpu(pdata->dev, - rdata->rx.buf.dma_base, - rdata->rx.buf.dma_off, - rdata->rx.buf.dma_len, - DMA_FROM_DEVICE); - - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, - rdata->rx.buf.pa.pages, - rdata->rx.buf.pa.pages_offset, - len, rdata->rx.buf.dma_len); - rdata->rx.buf.pa.pages = NULL; - } + skb_copy_to_linear_data(skb, packet, len); + skb_put(skb, len); return skb; } +static unsigned int xgbe_rx_buf1_len(struct xgbe_ring_data *rdata, + struct xgbe_packet_data *packet) +{ + /* Always zero if not the first descriptor */ + if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, FIRST)) + return 0; + + /* First descriptor with split header, return header length */ + if (rdata->rx.hdr_len) + return rdata->rx.hdr_len; + + /* First descriptor but not the last descriptor and no split header, + * so the full buffer was used + */ + if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, LAST)) + return rdata->rx.hdr.dma_len; + + /* First descriptor and last descriptor and no split header, so + * calculate how much of the buffer was used + */ + return min_t(unsigned int, rdata->rx.hdr.dma_len, rdata->rx.len); +} + +static unsigned int xgbe_rx_buf2_len(struct xgbe_ring_data *rdata, + struct xgbe_packet_data *packet, + unsigned int len) +{ + /* Always the full buffer if not the last descriptor */ + if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, LAST)) + return rdata->rx.buf.dma_len; + + /* Last descriptor so calculate how much of the buffer was used + * for the last bit of data + */ + return rdata->rx.len - len; +} + static int xgbe_tx_poll(struct xgbe_channel *channel) { struct xgbe_prv_data *pdata = channel->pdata; @@ -2092,8 +2110,8 @@ static int xgbe_rx_poll(struct xgbe_channel *channel, int budget) struct napi_struct *napi; struct sk_buff *skb; struct skb_shared_hwtstamps *hwtstamps; - unsigned int incomplete, error, context_next, context; - unsigned int len, rdesc_len, max_len; + unsigned int last, error, context_next, context; + unsigned int len, buf1_len, buf2_len, max_len; unsigned int received = 0; int packet_count = 0; @@ -2103,7 +2121,7 @@ static int xgbe_rx_poll(struct xgbe_channel *channel, int budget) if (!ring) return 0; - incomplete = 0; + last = 0; context_next = 0; napi = (pdata->per_channel_irq) ? 
&channel->napi : &pdata->napi; @@ -2137,9 +2155,8 @@ read_again: received++; ring->cur++; - incomplete = XGMAC_GET_BITS(packet->attributes, - RX_PACKET_ATTRIBUTES, - INCOMPLETE); + last = XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, + LAST); context_next = XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, CONTEXT_NEXT); @@ -2148,7 +2165,7 @@ read_again: CONTEXT); /* Earlier error, just drain the remaining data */ - if ((incomplete || context_next) && error) + if ((!last || context_next) && error) goto read_again; if (error || packet->errors) { @@ -2160,16 +2177,22 @@ read_again: } if (!context) { - /* Length is cumulative, get this descriptor's length */ - rdesc_len = rdata->rx.len - len; - len += rdesc_len; + /* Get the data length in the descriptor buffers */ + buf1_len = xgbe_rx_buf1_len(rdata, packet); + len += buf1_len; + buf2_len = xgbe_rx_buf2_len(rdata, packet, len); + len += buf2_len; - if (rdesc_len && !skb) { + if (!skb) { skb = xgbe_create_skb(pdata, napi, rdata, - rdesc_len); - if (!skb) + buf1_len); + if (!skb) { error = 1; - } else if (rdesc_len) { + goto skip_data; + } + } + + if (buf2_len) { dma_sync_single_range_for_cpu(pdata->dev, rdata->rx.buf.dma_base, rdata->rx.buf.dma_off, @@ -2179,13 +2202,14 @@ read_again: skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rdata->rx.buf.pa.pages, rdata->rx.buf.pa.pages_offset, - rdesc_len, + buf2_len, rdata->rx.buf.dma_len); rdata->rx.buf.pa.pages = NULL; } } - if (incomplete || context_next) +skip_data: + if (!last || context_next) goto read_again; if (!skb) @@ -2243,7 +2267,7 @@ next_packet: } /* Check if we need to save state before leaving */ - if (received && (incomplete || context_next)) { + if (received && (!last || context_next)) { rdata = XGBE_GET_DESC_DATA(ring, ring->cur); rdata->state_saved = 1; rdata->state.skb = skb; From 22a0e18eac7a9e986fec76c60fa4a2926d1291e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Mar 2017 13:21:28 -0700 Subject: [PATCH 120/297] net: properly release sk_frag.page I mistakenly added the code to release sk->sk_frag in sk_common_release() instead of sk_destruct() TCP sockets using sk->sk_allocation == GFP_ATOMIC do no call sk_common_release() at close time, thus leaking one (order-3) page. iSCSI is using such sockets. Fixes: 5640f7685831 ("net: use a per task frag allocator") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index a96d5f7a5734..acb0d4137499 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1442,6 +1442,11 @@ static void __sk_destruct(struct rcu_head *head) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + } + if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); @@ -2787,11 +2792,6 @@ void sk_common_release(struct sock *sk) sk_refcnt_debug_release(sk); - if (sk->sk_frag.page) { - put_page(sk->sk_frag.page); - sk->sk_frag.page = NULL; - } - sock_put(sk); } EXPORT_SYMBOL(sk_common_release); From 9b4f603e7a9f4282aec451063ffbbb8bb410dcd9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Mar 2017 00:12:16 +0100 Subject: [PATCH 121/297] cpufreq: Fix and clean up show_cpuinfo_cur_freq() There is a missing newline in show_cpuinfo_cur_freq(), so add it, but while at it clean that function up somewhat too. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar Cc: All applicable --- drivers/cpufreq/cpufreq.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 38b9fdf854a4..b8ff617d449d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -680,9 +680,11 @@ static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy, char *buf) { unsigned int cur_freq = __cpufreq_get(policy); - if (!cur_freq) - return sprintf(buf, ""); - return sprintf(buf, "%u\n", cur_freq); + + if (cur_freq) + return sprintf(buf, "%u\n", cur_freq); + + return sprintf(buf, "\n"); } /** From 9a3fcf912ef7f5c6e18f9af6875dd13f7311f7aa Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 14 Mar 2017 09:50:35 +0200 Subject: [PATCH 122/297] iwlwifi: mvm: cleanup pending frames in DQA mode When a station is asleep, the fw will set it as "asleep". All queues that are used only by one station will be stopped by the fw. In pre-DQA mode this was relevant for aggregation queues. However, in DQA mode a queue is owned by one station only, so all queues will be stopped. As a result, we don't expect to get filtered frames back to mac80211 and don't have to maintain the entire pending_frames state logic, the same way as we do in aggregations. The correct behavior is to align DQA behavior with the aggregation queue behaviour pre-DQA: - Don't count pending frames. - Let mac80211 know we have frames in these queues so that it can properly handle trigger frames. When a trigger frame is received, mac80211 tells the driver to send frames from the queues using release_buffered_frames. The driver will tell the fw to let frames out even if the station is asleep. This is done by iwl_mvm_sta_modify_sleep_tx_count. Reported-and-tested-by: Jens Axboe Reported-by: Linus Torvalds Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- .../net/wireless/intel/iwlwifi/mvm/mac80211.c | 5 ++- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 11 ++--- drivers/net/wireless/intel/iwlwifi/mvm/sta.h | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 41 ++++++++----------- 4 files changed, 28 insertions(+), 31 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index d37b1695c64e..6927caecd48e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -2319,7 +2319,7 @@ iwl_mvm_mac_release_buffered_frames(struct ieee80211_hw *hw, { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); - /* Called when we need to transmit (a) frame(s) from agg queue */ + /* Called when we need to transmit (a) frame(s) from agg or dqa queue */ iwl_mvm_sta_modify_sleep_tx_count(mvm, sta, reason, num_frames, tids, more_data, true); @@ -2338,7 +2338,8 @@ static void __iwl_mvm_mac_sta_notify(struct ieee80211_hw *hw, for (tid = 0; tid < IWL_MAX_TID_COUNT; tid++) { struct iwl_mvm_tid_data *tid_data = &mvmsta->tid_data[tid]; - if (tid_data->state != IWL_AGG_ON && + if (!iwl_mvm_is_dqa_supported(mvm) && + tid_data->state != IWL_AGG_ON && tid_data->state != IWL_EMPTYING_HW_QUEUE_DELBA) continue; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index bd1dcc863d8f..b51a2853cc80 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -3135,7 +3135,7 @@ void iwl_mvm_sta_modify_sleep_tx_count(struct iwl_mvm *mvm, struct ieee80211_sta *sta, enum 
ieee80211_frame_release_type reason, u16 cnt, u16 tids, bool more_data, - bool agg) + bool single_sta_queue) { struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta); struct iwl_mvm_add_sta_cmd cmd = { @@ -3155,14 +3155,14 @@ void iwl_mvm_sta_modify_sleep_tx_count(struct iwl_mvm *mvm, for_each_set_bit(tid, &_tids, IWL_MAX_TID_COUNT) cmd.awake_acs |= BIT(tid_to_ucode_ac[tid]); - /* If we're releasing frames from aggregation queues then check if the - * all queues combined that we're releasing frames from have + /* If we're releasing frames from aggregation or dqa queues then check + * if all the queues that we're releasing frames from, combined, have: * - more frames than the service period, in which case more_data * needs to be set * - fewer than 'cnt' frames, in which case we need to adjust the * firmware command (but do that unconditionally) */ - if (agg) { + if (single_sta_queue) { int remaining = cnt; int sleep_tx_count; @@ -3172,7 +3172,8 @@ void iwl_mvm_sta_modify_sleep_tx_count(struct iwl_mvm *mvm, u16 n_queued; tid_data = &mvmsta->tid_data[tid]; - if (WARN(tid_data->state != IWL_AGG_ON && + if (WARN(!iwl_mvm_is_dqa_supported(mvm) && + tid_data->state != IWL_AGG_ON && tid_data->state != IWL_EMPTYING_HW_QUEUE_DELBA, "TID %d state is %d\n", tid, tid_data->state)) { diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h index 4be34f902278..1927ce607798 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h @@ -547,7 +547,7 @@ void iwl_mvm_sta_modify_sleep_tx_count(struct iwl_mvm *mvm, struct ieee80211_sta *sta, enum ieee80211_frame_release_type reason, u16 cnt, u16 tids, bool more_data, - bool agg); + bool single_sta_queue); int iwl_mvm_drain_sta(struct iwl_mvm *mvm, struct iwl_mvm_sta *mvmsta, bool drain); void iwl_mvm_sta_modify_disable_tx(struct iwl_mvm *mvm, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index dd2b4a300819..3f37075f4cde 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -7,7 +7,7 @@ * * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH - * Copyright(c) 2016 Intel Deutschland GmbH + * Copyright(c) 2016 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -34,6 +34,7 @@ * * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH + * Copyright(c) 2016 - 2017 Intel Deutschland GmbH * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -628,8 +629,10 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) * values. 
* Note that we don't need to make sure it isn't agg'd, since we're * TXing non-sta + * For DQA mode - we shouldn't increase it though */ - atomic_inc(&mvm->pending_frames[sta_id]); + if (!iwl_mvm_is_dqa_supported(mvm)) + atomic_inc(&mvm->pending_frames[sta_id]); return 0; } @@ -1005,11 +1008,8 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb, spin_unlock(&mvmsta->lock); - /* Increase pending frames count if this isn't AMPDU */ - if ((iwl_mvm_is_dqa_supported(mvm) && - mvmsta->tid_data[tx_cmd->tid_tspec].state != IWL_AGG_ON && - mvmsta->tid_data[tx_cmd->tid_tspec].state != IWL_AGG_STARTING) || - (!iwl_mvm_is_dqa_supported(mvm) && !is_ampdu)) + /* Increase pending frames count if this isn't AMPDU or DQA queue */ + if (!iwl_mvm_is_dqa_supported(mvm) && !is_ampdu) atomic_inc(&mvm->pending_frames[mvmsta->sta_id]); return 0; @@ -1079,12 +1079,13 @@ static void iwl_mvm_check_ratid_empty(struct iwl_mvm *mvm, lockdep_assert_held(&mvmsta->lock); if ((tid_data->state == IWL_AGG_ON || - tid_data->state == IWL_EMPTYING_HW_QUEUE_DELBA) && + tid_data->state == IWL_EMPTYING_HW_QUEUE_DELBA || + iwl_mvm_is_dqa_supported(mvm)) && iwl_mvm_tid_queued(tid_data) == 0) { /* - * Now that this aggregation queue is empty tell mac80211 so it - * knows we no longer have frames buffered for the station on - * this TID (for the TIM bitmap calculation.) + * Now that this aggregation or DQA queue is empty tell + * mac80211 so it knows we no longer have frames buffered for + * the station on this TID (for the TIM bitmap calculation.) */ ieee80211_sta_set_buffered(sta, tid, false); } @@ -1257,7 +1258,6 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, u8 skb_freed = 0; u16 next_reclaimed, seq_ctl; bool is_ndp = false; - bool txq_agg = false; /* Is this TXQ aggregated */ __skb_queue_head_init(&skbs); @@ -1283,6 +1283,10 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, info->flags |= IEEE80211_TX_STAT_ACK; break; case TX_STATUS_FAIL_DEST_PS: + /* In DQA, the FW should have stopped the queue and not + * return this status + */ + WARN_ON(iwl_mvm_is_dqa_supported(mvm)); info->flags |= IEEE80211_TX_STAT_TX_FILTERED; break; default: @@ -1387,15 +1391,6 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, bool send_eosp_ndp = false; spin_lock_bh(&mvmsta->lock); - if (iwl_mvm_is_dqa_supported(mvm)) { - enum iwl_mvm_agg_state state; - - state = mvmsta->tid_data[tid].state; - txq_agg = (state == IWL_AGG_ON || - state == IWL_EMPTYING_HW_QUEUE_DELBA); - } else { - txq_agg = txq_id >= mvm->first_agg_queue; - } if (!is_ndp) { tid_data->next_reclaimed = next_reclaimed; @@ -1452,11 +1447,11 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, * If the txq is not an AMPDU queue, there is no chance we freed * several skbs. Check that out... */ - if (txq_agg) + if (iwl_mvm_is_dqa_supported(mvm) || txq_id >= mvm->first_agg_queue) goto out; /* We can't free more than one frame at once on a shared queue */ - WARN_ON(!iwl_mvm_is_dqa_supported(mvm) && (skb_freed > 1)); + WARN_ON(skb_freed > 1); /* If we have still frames for this STA nothing to do here */ if (!atomic_sub_and_test(skb_freed, &mvm->pending_frames[sta_id])) From 4e841d3eb9294ce4137fdb5d0a88f1bceab9c212 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 10 Mar 2017 17:39:21 -0800 Subject: [PATCH 123/297] mwifiex: pcie: don't leak DMA buffers when removing When PCIe FLR support was added, much of the remove/release code for PCIe was migrated to ->down_dev(), but ->down_dev() is never called for device removal. 
Let's refactor the cleanup to be done in both cases. Also, drop the comments above mwifiex_cleanup_pcie(), because they were clearly wrong, and it's better to have clear and obvious code than to detail the code steps in comments anyway. Fixes: 4c5dae59d2e9 ("mwifiex: add PCIe function level reset support") Cc: Signed-off-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/pcie.c | 38 ++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/pcie.c b/drivers/net/wireless/marvell/mwifiex/pcie.c index a0d918094889..b8c990d10d6e 100644 --- a/drivers/net/wireless/marvell/mwifiex/pcie.c +++ b/drivers/net/wireless/marvell/mwifiex/pcie.c @@ -2739,6 +2739,21 @@ static void mwifiex_pcie_device_dump(struct mwifiex_adapter *adapter) schedule_work(&card->work); } +static void mwifiex_pcie_free_buffers(struct mwifiex_adapter *adapter) +{ + struct pcie_service_card *card = adapter->card; + const struct mwifiex_pcie_card_reg *reg = card->pcie.reg; + + if (reg->sleep_cookie) + mwifiex_pcie_delete_sleep_cookie_buf(adapter); + + mwifiex_pcie_delete_cmdrsp_buf(adapter); + mwifiex_pcie_delete_evtbd_ring(adapter); + mwifiex_pcie_delete_rxbd_ring(adapter); + mwifiex_pcie_delete_txbd_ring(adapter); + card->cmdrsp_buf = NULL; +} + /* * This function initializes the PCI-E host memory space, WCB rings, etc. * @@ -2850,13 +2865,6 @@ err_enable_dev: /* * This function cleans up the allocated card buffers. - * - * The following are freed by this function - - * - TXBD ring buffers - * - RXBD ring buffers - * - Event BD ring buffers - * - Command response ring buffer - * - Sleep cookie buffer */ static void mwifiex_cleanup_pcie(struct mwifiex_adapter *adapter) { @@ -2875,6 +2883,8 @@ static void mwifiex_cleanup_pcie(struct mwifiex_adapter *adapter) "Failed to write driver not-ready signature\n"); } + mwifiex_pcie_free_buffers(adapter); + if (pdev) { pci_iounmap(pdev, card->pci_mmap); pci_iounmap(pdev, card->pci_mmap1); @@ -3126,10 +3136,7 @@ err_cre_txbd: pci_iounmap(pdev, card->pci_mmap1); } -/* This function cleans up the PCI-E host memory space. - * Some code is extracted from mwifiex_unregister_dev() - * - */ +/* This function cleans up the PCI-E host memory space. */ static void mwifiex_pcie_down_dev(struct mwifiex_adapter *adapter) { struct pcie_service_card *card = adapter->card; @@ -3140,14 +3147,7 @@ static void mwifiex_pcie_down_dev(struct mwifiex_adapter *adapter) adapter->seq_num = 0; - if (reg->sleep_cookie) - mwifiex_pcie_delete_sleep_cookie_buf(adapter); - - mwifiex_pcie_delete_cmdrsp_buf(adapter); - mwifiex_pcie_delete_evtbd_ring(adapter); - mwifiex_pcie_delete_rxbd_ring(adapter); - mwifiex_pcie_delete_txbd_ring(adapter); - card->cmdrsp_buf = NULL; + mwifiex_pcie_free_buffers(adapter); } static struct mwifiex_if_ops pcie_ops = { From ba1c7e45ec224cc8d2df33ecaee1946d48e79231 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 10 Mar 2017 17:39:22 -0800 Subject: [PATCH 124/297] mwifiex: set adapter->dev before starting to use mwifiex_dbg() The mwifiex_dbg() log handler utilizes the struct device in adapter->dev. Without it, it decides not to print anything. As of commit 2e02b5814217 ("mwifiex: Allow mwifiex early access to device structure"), we started assigning that pointer only after we finished mwifiex_register() -- this effectively neuters any mwifiex_dbg() logging done before this point. Let's move the device assignment into mwifiex_register(). 
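The failure mode fixed here is worth a sketch. Below is a minimal userspace model (illustrative only; the names are borrowed from the driver but this is not mwifiex code) of a debug macro that drops messages while adapter->dev is still NULL, which is exactly why assigning the device pointer only after mwifiex_register() returned neutered all early logging:

--------------- %< ---------------
#include <stdio.h>
#include <stddef.h>

struct adapter { void *dev; };

/* Like mwifiex_dbg(): silently drops messages until ->dev is set. */
#define adapter_dbg(a, fmt, ...)			\
	do {						\
		if ((a)->dev)				\
			printf(fmt, ##__VA_ARGS__);	\
	} while (0)

static struct adapter *register_adapter(void *dev)
{
	static struct adapter a;

	a.dev = dev;	/* assign first, as the patch now does */
	adapter_dbg(&a, "early init message survives\n");
	return &a;
}

int main(void)
{
	struct adapter late = { .dev = NULL };
	int fake_dev;

	adapter_dbg(&late, "this early message is silently dropped\n");
	register_adapter(&fake_dev);
	return 0;
}
--------------- >% ---------------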
Fixes: 2e02b5814217 ("mwifiex: Allow mwifiex early access to device structure") Cc: Rajat Jain Signed-off-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 5ebca1d0cfc7..43d040e02e4d 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -57,8 +57,8 @@ MODULE_PARM_DESC(mfg_mode, "manufacturing mode enable:1, disable:0"); * In case of any errors during inittialization, this function also ensures * proper cleanup before exiting. */ -static int mwifiex_register(void *card, struct mwifiex_if_ops *if_ops, - void **padapter) +static int mwifiex_register(void *card, struct device *dev, + struct mwifiex_if_ops *if_ops, void **padapter) { struct mwifiex_adapter *adapter; int i; @@ -68,6 +68,7 @@ static int mwifiex_register(void *card, struct mwifiex_if_ops *if_ops, return -ENOMEM; *padapter = adapter; + adapter->dev = dev; adapter->card = card; /* Save interface specific operations in adapter */ @@ -1568,12 +1569,11 @@ mwifiex_add_card(void *card, struct completion *fw_done, { struct mwifiex_adapter *adapter; - if (mwifiex_register(card, if_ops, (void **)&adapter)) { + if (mwifiex_register(card, dev, if_ops, (void **)&adapter)) { pr_err("%s: software init failed\n", __func__); goto err_init_sw; } - adapter->dev = dev; mwifiex_probe_of(adapter); adapter->iface_type = iface_type; From 36908c4e5b1063eff3e11336fab544a76c625b69 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 10 Mar 2017 17:39:23 -0800 Subject: [PATCH 125/297] mwifiex: uninit wakeup info when removing device We manually init wakeup info, but we don't detach it on device removal. This means that if we (for example) rmmod + modprobe the driver, the device framework might return -EEXIST the second time, and we'll complain in the logs: [ 839.311881] mwifiex_pcie 0000:01:00.0: fail to init wakeup for mwifiex AFAICT, there's no other negative effect. But we can fix this by disabling wakeup on remove, similar to what a few other drivers do (e.g., the power supply framework). This code (and bug) has existed on SDIO for a while, but it got moved around and enabled for PCIe with commit 853402a00823 ("mwifiex: Enable WoWLAN for both sdio and pcie"). Signed-off-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 43d040e02e4d..b62e03d11c2e 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1718,6 +1718,9 @@ int mwifiex_remove_card(struct mwifiex_adapter *adapter) wiphy_unregister(adapter->wiphy); wiphy_free(adapter->wiphy); + if (adapter->irq_wakeup >= 0) + device_init_wakeup(adapter->dev, false); + /* Unregister device */ mwifiex_dbg(adapter, INFO, "info: unregister device\n"); From cf8c44d42c4f4f38468a53e9ce2a0314e7ebeaa1 Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Tue, 28 Feb 2017 18:54:31 +0530 Subject: [PATCH 126/297] MAINTAINERS: update for mwifiex driver maintainers Ganapathi & Xinming are starting to take a more active role in the mwifiex driver maintainership here onwards on account of organizational changes. 
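Returning to the wakeup fix in patch 125 above: the probe/remove symmetry it restores is a general pattern, sketched below in standalone C (a toy model, not the driver framework API; -EEXIST stands in for the error quoted in that commit's log excerpt). Every "enable" at probe time needs a matching "disable" at remove time, or the next probe inherits stale state:

--------------- %< ---------------
#include <stdio.h>

struct device { int wakeup_enabled; };

static int device_init_wakeup(struct device *dev, int enable)
{
	if (enable && dev->wakeup_enabled)
		return -17;	/* -EEXIST: leftovers from the last probe */
	dev->wakeup_enabled = enable;
	return 0;
}

int main(void)
{
	struct device dev = { 0 };

	device_init_wakeup(&dev, 1);	/* probe #1 */
	/* driver removed without device_init_wakeup(&dev, 0)... */
	printf("re-probe, no cleanup:   %d\n", device_init_wakeup(&dev, 1));

	device_init_wakeup(&dev, 0);	/* remove path, as patched */
	printf("re-probe after cleanup: %d\n", device_init_wakeup(&dev, 1));
	return 0;
}
--------------- >% ---------------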
CC: Xinming Hu CC: Ganapathi Bhat Signed-off-by: Amitkumar Karwar Signed-off-by: Nishant Sarmukadam Signed-off-by: Cathy Luo Signed-off-by: Kalle Valo --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 33875e07482b..078c38217daa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7846,6 +7846,8 @@ F: drivers/net/ethernet/marvell/mvneta.* MARVELL MWIFIEX WIRELESS DRIVER M: Amitkumar Karwar M: Nishant Sarmukadam +M: Ganapathi Bhat +M: Xinming Hu L: linux-wireless@vger.kernel.org S: Maintained F: drivers/net/wireless/marvell/mwifiex/ From 6bce725a78de1b171928ce66dec2bae4b569e5d1 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 8 Mar 2017 14:30:34 +0100 Subject: [PATCH 127/297] x86/mpx: Make unnecessarily global function static Make the function get_user_bd_entry() static as it is not used outside of arch/x86/mm/mpx.c. This fixes a sparse warning. Signed-off-by: Tobias Klauser Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 5126dfd52b18..cd44ae727df7 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -590,7 +590,7 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, * we might run off the end of the bounds table if we are on * a 64-bit kernel and try to get 8 bytes. */ -int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, +static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, long __user *bd_entry_ptr) { u32 bd_entry_32; From dcc3b5ffe1b32771c9a22e2c916fb94c4fcf5b79 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 6 Mar 2017 21:51:28 -0800 Subject: [PATCH 128/297] sched/deadline: Add missing update_rq_clock() in dl_task_timer() The following warning can be triggered by hot-unplugging the CPU on which an active SCHED_DEADLINE task is running: ------------[ cut here ]------------ WARNING: CPU: 7 PID: 0 at kernel/sched/sched.h:833 replenish_dl_entity+0x71e/0xc40 rq->clock_update_flags < RQCF_ACT_SKIP CPU: 7 PID: 0 Comm: swapper/7 Tainted: G B 4.11.0-rc1+ #24 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0x85/0xc4 __warn+0x172/0x1b0 warn_slowpath_fmt+0xb4/0xf0 ? __warn+0x1b0/0x1b0 ? debug_check_no_locks_freed+0x2c0/0x2c0 ? cpudl_set+0x3d/0x2b0 replenish_dl_entity+0x71e/0xc40 enqueue_task_dl+0x2ea/0x12e0 ? dl_task_timer+0x777/0x990 ? __hrtimer_run_queues+0x270/0xa50 dl_task_timer+0x316/0x990 ? enqueue_task_dl+0x12e0/0x12e0 ? enqueue_task_dl+0x12e0/0x12e0 __hrtimer_run_queues+0x270/0xa50 ? hrtimer_cancel+0x20/0x20 ? hrtimer_interrupt+0x119/0x600 hrtimer_interrupt+0x19c/0x600 ? trace_hardirqs_off+0xd/0x10 local_apic_timer_interrupt+0x74/0xe0 smp_apic_timer_interrupt+0x76/0xa0 apic_timer_interrupt+0x93/0xa0 The DL task will be migrated to a suitable later deadline rq once the DL timer fires and the current rq is offline. The rq clock of the new rq should be updated. This patch fixes it by updating the rq clock after holding the new rq's rq lock.
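The rule this warning enforces generalizes to any per-object cached clock: after taking a different object's lock, refresh its cache before computing deltas against it. A rough userspace model follows (pthread mutexes stand in for rq locks; this is a sketch of the pattern, not the scheduler code itself):

--------------- %< ---------------
#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Each "runqueue" caches its own clock, valid only after an update. */
struct rq {
	pthread_mutex_t lock;
	long long clock_ns;
};

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void update_rq_clock(struct rq *rq)
{
	rq->clock_ns = now_ns();	/* caller must hold rq->lock */
}

static void migrate_and_replenish(struct rq *src, struct rq *dst)
{
	pthread_mutex_unlock(&src->lock);
	pthread_mutex_lock(&dst->lock);
	update_rq_clock(dst);		/* the line the patch adds */
	printf("replenish against fresh clock %lld\n", dst->clock_ns);
	pthread_mutex_unlock(&dst->lock);
}

int main(void)
{
	struct rq a = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct rq b = { PTHREAD_MUTEX_INITIALIZER, 0 };

	pthread_mutex_lock(&a.lock);
	update_rq_clock(&a);
	/* without the update, b's cached clock would be stale here */
	migrate_and_replenish(&a, &b);
	return 0;
}
--------------- >% ---------------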
Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1488865888-15894-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 99b2c33a9fbc..c6db3fd727fe 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -638,6 +638,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); rf.cookie = lockdep_pin_lock(&rq->lock); + update_rq_clock(rq); /* * Now that the task has been migrated to the new RQ and we From 6e5f32f7a43f45ee55c401c0b9585eb01f9629a8 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 17 Feb 2017 12:07:30 +0000 Subject: [PATCH 129/297] sched/loadavg: Avoid loadavg spikes caused by delayed NO_HZ accounting If we crossed a sample window while in NO_HZ we will add LOAD_FREQ to the pending sample window time on exit, setting the next update not one window into the future, but two. This situation on exiting NO_HZ is described by: this_rq->calc_load_update < jiffies < calc_load_update In this scenario, what we should be doing is: this_rq->calc_load_update = calc_load_update [ next window ] But what we actually do is: this_rq->calc_load_update = calc_load_update + LOAD_FREQ [ next+1 window ] This has the effect of delaying load average updates for potentially up to ~9seconds. This can result in huge spikes in the load average values due to per-cpu uninterruptible task counts being out of sync when accumulated across all CPUs. It's safe to update the per-cpu active count if we wake between sample windows because any load that we left in 'calc_load_idle' will have been zero'd when the idle load was folded in calc_global_load(). This issue is easy to reproduce before, commit 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking") just by forking short-lived process pipelines built from ps(1) and grep(1) in a loop. I'm unable to reproduce the spikes after that commit, but the bug still seems to be present from code review. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Fixes: commit 5167e8d ("sched/nohz: Rewrite and fix load-avg computation -- again") Link: http://lkml.kernel.org/r/20170217120731.11868-2-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/loadavg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 7296b7308eca..3a55f3f9ffe4 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -202,8 +202,9 @@ void calc_load_exit_idle(void) struct rq *this_rq = this_rq(); /* - * If we're still before the sample window, we're done. + * If we're still before the pending sample window, we're done. */ + this_rq->calc_load_update = calc_load_update; if (time_before(jiffies, this_rq->calc_load_update)) return; @@ -212,7 +213,6 @@ void calc_load_exit_idle(void) * accounted through the nohz accounting, so skip the entire deal and * sync up for the next window. 
*/ - this_rq->calc_load_update = calc_load_update; if (time_before(jiffies, this_rq->calc_load_update + 10)) this_rq->calc_load_update += LOAD_FREQ; } From caeb5882979bc6f3c8766fcf59c6269b38f521bc Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 17 Feb 2017 12:07:31 +0000 Subject: [PATCH 130/297] sched/loadavg: Use {READ,WRITE}_ONCE() for sample window 'calc_load_update' is accessed without any kind of locking and there's a clear assumption in the code that only a single value is read or written. Make this explicit by using READ_ONCE() and WRITE_ONCE(), and avoid unintentionally seeing multiple values, or having the load/stores split. Technically the loads in calc_global_*() don't require this since those are the only functions that update 'calc_load_update', but I've added the READ_ONCE() for consistency. Suggested-by: Peter Zijlstra Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20170217120731.11868-3-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/loadavg.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 3a55f3f9ffe4..f15fb2bdbc0d 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -169,7 +169,7 @@ static inline int calc_load_write_idx(void) * If the folding window started, make sure we start writing in the * next idle-delta. */ - if (!time_before(jiffies, calc_load_update)) + if (!time_before(jiffies, READ_ONCE(calc_load_update))) idx++; return idx & 1; @@ -204,7 +204,7 @@ void calc_load_exit_idle(void) /* * If we're still before the pending sample window, we're done. */ - this_rq->calc_load_update = calc_load_update; + this_rq->calc_load_update = READ_ONCE(calc_load_update); if (time_before(jiffies, this_rq->calc_load_update)) return; @@ -308,13 +308,15 @@ calc_load_n(unsigned long load, unsigned long exp, */ static void calc_global_nohz(void) { + unsigned long sample_window; long delta, active, n; - if (!time_before(jiffies, calc_load_update + 10)) { + sample_window = READ_ONCE(calc_load_update); + if (!time_before(jiffies, sample_window + 10)) { /* * Catch-up, fold however many we are behind still */ - delta = jiffies - calc_load_update - 10; + delta = jiffies - sample_window - 10; n = 1 + (delta / LOAD_FREQ); active = atomic_long_read(&calc_load_tasks); @@ -324,7 +326,7 @@ static void calc_global_nohz(void) avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - calc_load_update += n * LOAD_FREQ; + WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ); } /* @@ -352,9 +354,11 @@ static inline void calc_global_nohz(void) { } */ void calc_global_load(unsigned long ticks) { + unsigned long sample_window; long active, delta; - if (time_before(jiffies, calc_load_update + 10)) + sample_window = READ_ONCE(calc_load_update); + if (time_before(jiffies, sample_window + 10)) return; /* @@ -371,7 +375,7 @@ void calc_global_load(unsigned long ticks) avenrun[1] = calc_load(avenrun[1], EXP_5, active); avenrun[2] = calc_load(avenrun[2], EXP_15, active); - calc_load_update += LOAD_FREQ; + WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); /* * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 
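The window arithmetic in the two loadavg patches above is easy to check numerically. The toy program below (plain C with invented jiffies values; HZ and LOAD_FREQ mirror the kernel's definitions) reproduces the scenario from the first commit message, where a CPU wakes from NO_HZ between its own stale window and the pending global one:

--------------- %< ---------------
#include <stdio.h>

#define HZ		100
#define LOAD_FREQ	(5 * HZ + 1)	/* 5 sec + 1 tick, as in the kernel */

int main(void)
{
	unsigned long calc_load_update = 2000;	/* next global sample window */
	unsigned long rq_update = calc_load_update - LOAD_FREQ;	/* stale per-rq copy */
	unsigned long jiffies = 1800;		/* woke between the two windows */

	if (rq_update < jiffies && jiffies < calc_load_update) {
		/* old code: adopt the global window, then push one further */
		unsigned long old_next = calc_load_update + LOAD_FREQ;
		/* fixed code: simply adopt the pending global window */
		unsigned long new_next = calc_load_update;

		printf("woke at %lu\n", jiffies);
		printf("old code samples again at %lu (%lu ticks away)\n",
		       old_next, old_next - jiffies);
		printf("new code samples again at %lu (%lu ticks away)\n",
		       new_next, new_next - jiffies);
	}
	return 0;
}
--------------- >% ---------------

With these numbers the old code defers the next per-rq sample by 701 ticks instead of 200: the "next+1 window" delay the fix removes.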
From 17fcbd590d0c3e35bd9646e2215f86586378bc42 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 25 Feb 2017 01:17:53 +0100 Subject: [PATCH 131/297] locking/rwsem: Fix down_write_killable() for CONFIG_RWSEM_GENERIC_SPINLOCK=y We hang if SIGKILL has been sent, but the task is stuck in down_read() (after do_exit()), even though no task is doing down_write() on the rwsem in question: INFO: task libupnp:21868 blocked for more than 120 seconds. libupnp D 0 21868 1 0x08100008 ... Call Trace: __schedule() schedule() __down_read() do_exit() do_group_exit() __wake_up_parent() This bug has already been fixed for CONFIG_RWSEM_XCHGADD_ALGORITHM=y in the following commit: 04cafed7fc19 ("locking/rwsem: Fix down_write_killable()") ... however, this bug also exists for CONFIG_RWSEM_GENERIC_SPINLOCK=y. Signed-off-by: Niklas Cassel Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Andrew Morton Cc: Linus Torvalds Cc: Niklas Cassel Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: d47996082f52 ("locking/rwsem: Introduce basis for down_write_killable()") Link: http://lkml.kernel.org/r/1487981873-12649-1-git-send-email-niklass@axis.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-spinlock.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 7bc24d477805..c65f7989f850 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -213,10 +213,9 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) */ if (sem->count == 0) break; - if (signal_pending_state(state, current)) { - ret = -EINTR; - goto out; - } + if (signal_pending_state(state, current)) + goto out_nolock; + set_current_state(state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); @@ -224,12 +223,19 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) } /* got the lock */ sem->count = -1; -out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; + +out_nolock: + list_del(&waiter.list); + if (!list_empty(&sem->wait_list)) + __rwsem_do_wake(sem, 1); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return -EINTR; } void __sched __down_write(struct rw_semaphore *sem) From 5ac69d37784b237707a7b15d199cdb6c6fdb6780 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 2 Mar 2017 15:10:57 +0100 Subject: [PATCH 132/297] sched/deadline: Make sure the replenishment timer fires in the next period Currently, the replenishment timer is set to fire at the deadline of a task. Although that works for implicit deadline tasks because the deadline is equals to the begin of the next period, that is not correct for constrained deadline tasks (deadline < period). 
For instance: f.c: --------------- %< --------------- int main (void) { for(;;); } --------------- >% --------------- # gcc -o f f.c # trace-cmd record -e sched:sched_switch \ -e syscalls:sys_exit_sched_setattr \ chrt -d --sched-runtime 490000000 \ --sched-deadline 500000000 \ --sched-period 1000000000 0 ./f # trace-cmd report | grep "{pid of ./f}" After setting parameters, the task is replenished and continue running until being throttled: f-11295 [003] 13322.113776: sys_exit_sched_setattr: 0x0 The task is throttled after running 492318 ms, as expected: f-11295 [003] 13322.606094: sched_switch: f:11295 [-1] R ==> watchdog/3:32 [0] But then, the task is replenished 500719 ms after the first replenishment: -0 [003] 13322.614495: sched_switch: swapper/3:0 [120] R ==> f:11295 [-1] Running for 490277 ms: f-11295 [003] 13323.104772: sched_switch: f:11295 [-1] R ==> swapper/3:0 [120] Hence, in the first period, the task runs 2 * runtime, and that is a bug. During the first replenishment, the next deadline is set one period away. So the runtime / period starts to be respected. However, as the second replenishment took place in the wrong instant, the next replenishment will also be held in a wrong instant of time. Rather than occurring in the nth period away from the first activation, it is taking place in the (nth period - relative deadline). Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Romulo Silva de Oliveira Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/ac50d89887c25285b47465638354b63362f8adff.1488392936.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c6db3fd727fe..445e2787bf80 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, } } +static inline u64 dl_next_period(struct sched_dl_entity *dl_se) +{ + return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period; +} + /* * If the entity depleted all its runtime, and if we want it to sleep * while waiting for some new execution time to become available, we - * set the bandwidth enforcement timer to the replenishment instant + * set the bandwidth replenishment timer to the replenishment instant * and try to activate it. * * Notice that it is important for the caller to know if the timer @@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p) * that it is actually coming from rq->clock and not from * hrtimer's time base reading. */ - act = ns_to_ktime(dl_se->deadline); + act = ns_to_ktime(dl_next_period(dl_se)); now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); From df8eac8cafce7d086be3bd5cf5a838fa37594dfb Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 2 Mar 2017 15:10:58 +0100 Subject: [PATCH 133/297] sched/deadline: Throttle a constrained deadline task activated after the deadline During the activation, CBS checks if it can reuse the current task's runtime and period. If the deadline of the task is in the past, CBS cannot use the runtime, and so it replenishes the task. 
This rule works fine for implicit deadline tasks (deadline == period), and the CBS was designed for implicit deadline tasks. However, a task with constrained deadline (deadline < period) might be awakened after the deadline, but before the next period. In this case, replenishing the task would allow it to run for runtime / deadline. As in this case deadline < period, CBS enables a task to run for more than the runtime / period. In a very loaded system, this can cause a domino effect, making other tasks miss their deadlines. To avoid this problem, in the activation of a constrained deadline task after the deadline but before the next period, throttle the task and set the replenishing timer to the beginning of the next period, unless it is boosted. Reproducer: --------------- %< --------------- int main (int argc, char **argv) { int ret; int flags = 0; unsigned long l = 0; struct timespec ts; struct sched_attr attr; memset(&attr, 0, sizeof(attr)); attr.size = sizeof(attr); attr.sched_policy = SCHED_DEADLINE; attr.sched_runtime = 2 * 1000 * 1000; /* 2 ms */ attr.sched_deadline = 2 * 1000 * 1000; /* 2 ms */ attr.sched_period = 2 * 1000 * 1000 * 1000; /* 2 s */ ts.tv_sec = 0; ts.tv_nsec = 2000 * 1000; /* 2 ms */ ret = sched_setattr(0, &attr, flags); if (ret < 0) { perror("sched_setattr"); exit(-1); } for(;;) { /* XXX: you may need to adjust the loop */ for (l = 0; l < 150000; l++); /* * The idea is to go to sleep right before the deadline * and then wake up before the next period to receive * a new replenishment. */ nanosleep(&ts, NULL); } exit(0); } --------------- >% --------------- On my box, this reproducer uses almost 50% of the CPU time, which is obviously wrong for a task with 2/2000 reservation. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Romulo Silva de Oliveira Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/edf58354e01db46bf42df8d2dd32418833f68c89.1488392936.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 45 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 445e2787bf80..736d8b9d9bab 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -695,6 +695,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) timer->function = dl_task_timer; } +/* + * During the activation, CBS checks if it can reuse the current task's + * runtime and period. If the deadline of the task is in the past, CBS + * cannot use the runtime, and so it replenishes the task. This rule + * works fine for implicit deadline tasks (deadline == period), and the + * CBS was designed for implicit deadline tasks. However, a task with + * constrained deadline (deadline < period) might be awakened after the + * deadline, but before the next period. In this case, replenishing the + * task would allow it to run for runtime / deadline. As in this case + * deadline < period, CBS enables a task to run for more than the + * runtime / period. In a very loaded system, this can cause a domino + * effect, making other tasks miss their deadlines. + * + * To avoid this problem, in the activation of a constrained deadline + * task after the deadline but before the next period, throttle the + * task and set the replenishing timer to the beginning of the next period, + * unless it is boosted. 
+ */ +static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) +{ + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + + if (dl_time_before(dl_se->deadline, rq_clock(rq)) && + dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { + if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + return; + dl_se->dl_throttled = 1; + } +} + static int dl_runtime_exceeded(struct sched_dl_entity *dl_se) { @@ -928,6 +959,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } +static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline < dl_se->dl_period; +} + static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -953,6 +989,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } + /* + * Check if a constrained deadline task was activated + * after the deadline but before the next period. + * If that is the case, the task will be throttled and + * the replenishment timer will be set to the next period. + */ + if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + dl_check_constrained_dl(&p->dl); + /* * If p is throttled, we do nothing. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on From 2317d5f1c34913bac5971d93d69fb6c31bb74670 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Mar 2017 15:10:59 +0100 Subject: [PATCH 134/297] sched/deadline: Use deadline instead of period when calculating overflow I was testing Daniel's changes with his test case, and tweaked it a little. Instead of having the runtime equal to the deadline, I increased the deadline ten fold. Daniel's test case had: attr.sched_runtime = 2 * 1000 * 1000; /* 2 ms */ attr.sched_deadline = 2 * 1000 * 1000; /* 2 ms */ attr.sched_period = 2 * 1000 * 1000 * 1000; /* 2 s */ To make it more interesting, I changed it to: attr.sched_runtime = 2 * 1000 * 1000; /* 2 ms */ attr.sched_deadline = 20 * 1000 * 1000; /* 20 ms */ attr.sched_period = 2 * 1000 * 1000 * 1000; /* 2 s */ The results were rather surprising. The behavior that Daniel's patch was fixing came back. The task started using much more than .1% of the CPU. More like 20%. Looking into this I found that it was due to the dl_entity_overflow() constantly returning true. That's because it uses the relative period against relative runtime vs the absolute deadline against absolute runtime. runtime / (deadline - t) > dl_runtime / dl_period There's even a comment mentioning this, and saying that when relative deadline equals relative period, that the equation is the same as using deadline instead of period. That comment is backwards! What we really want is: runtime / (deadline - t) > dl_runtime / dl_deadline We care about if the runtime can make its deadline, not its period. And then we can say "when the deadline equals the period, the equation is the same as using dl_period instead of dl_deadline". After correcting this, now when the task gets enqueued, it can throttle correctly, and Daniel's fix to the throttling of sleeping deadline tasks works even when the runtime and deadline are not the same. 
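The overflow comparison described above can be tried directly with the commit's numbers. Here is a standalone sketch in plain integer math (the kernel additionally shifts both sides by DL_SCALE to avoid multiplication overflow, which is omitted here for clarity; the wakeup values are invented for illustration):

--------------- %< ---------------
#include <stdio.h>

int main(void)
{
	/* parameters from the tweaked test case, in ns */
	unsigned long long dl_runtime  = 2ULL * 1000 * 1000;	/* 2 ms  */
	unsigned long long dl_deadline = 20ULL * 1000 * 1000;	/* 20 ms */
	unsigned long long dl_period   = 2000ULL * 1000 * 1000;	/* 2 s   */

	/* a wakeup with 1 ms of runtime left, 10 ms before the deadline */
	unsigned long long runtime = 1ULL * 1000 * 1000;
	unsigned long long until_deadline = 10ULL * 1000 * 1000;

	/* runtime / until_deadline > dl_runtime / dl_period, cross-multiplied */
	int old_check = dl_period * runtime > until_deadline * dl_runtime;
	/* runtime / until_deadline > dl_runtime / dl_deadline */
	int new_check = dl_deadline * runtime > until_deadline * dl_runtime;

	printf("period-based check says overflow:   %d (forces a replenish)\n",
	       old_check);
	printf("deadline-based check says overflow: %d (parameters recycled)\n",
	       new_check);
	return 0;
}
--------------- >% ---------------

The huge period makes the old test fire on nearly every wakeup, handing the task fresh runtime far more often than once per period, which is the inflated CPU usage observed above.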
Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Romulo Silva de Oliveira Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/02135a27f1ae3fe5fd032568a5a2f370e190e8d7.1488392936.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 736d8b9d9bab..a2ce59015642 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * * This function returns true if: * - * runtime / (deadline - t) > dl_runtime / dl_period , + * runtime / (deadline - t) > dl_runtime / dl_deadline , * * IOW we can't recycle current parameters. * - * Notice that the bandwidth check is done against the period. For + * Notice that the bandwidth check is done against the deadline. For * task with deadline equal to period this is the same of using - * dl_deadline instead of dl_period in the equation above. + * dl_period instead of dl_deadline in the equation above. */ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se, u64 t) @@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * (pi_se->dl_runtime >> DL_SCALE); From ea90e0dc8cecba6359b481e24d9c37160f6f524f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Mar 2017 14:26:04 +0100 Subject: [PATCH 135/297] nl80211: fix dumpit error path RTNL deadlocks Sowmini pointed out Dmitry's RTNL deadlock report to me, and it turns out to be perfectly accurate - there are various error paths that miss unlock of the RTNL. To fix those, change the locking a bit to not be conditional in all those nl80211_prepare_*_dump() functions, but make those require the RTNL to start with, and fix the buggy error paths. This also let me use sparse (by appropriately overriding the rtnl_lock/rtnl_unlock functions) to validate the changes. 
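The locking contract this converts to - the caller takes the RTNL, helpers called with it held return on error without unlocking, and every exit funnels through a single unlock - is a sketchable pattern. A minimal standalone model (a pthread mutex in place of the RTNL; this is not the nl80211 code itself):

--------------- %< ---------------
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rtnl = PTHREAD_MUTEX_INITIALIZER;

/* Called with 'rtnl' held; error returns leave the lock held. */
static int prepare_dump(int arg, int *out)
{
	if (arg < 0)
		return -22;	/* -EINVAL: caller still owns the lock */
	*out = arg * 2;
	return 0;
}

static int dump(int arg)
{
	int val, err;

	pthread_mutex_lock(&rtnl);
	err = prepare_dump(arg, &val);
	if (err)
		goto out_unlock;	/* single unlock point, no leaks */

	printf("dumped %d\n", val);
	err = val;
out_unlock:
	pthread_mutex_unlock(&rtnl);
	return err;
}

int main(void)
{
	dump(21);
	dump(-1);	/* error path releases the lock exactly once */
	return 0;
}
--------------- >% ---------------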
Cc: stable@vger.kernel.org Reported-by: Sowmini Varadhan Reported-by: Dmitry Vyukov Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 127 ++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 71 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d7f8be4e321a..2312dc2ffdb9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -545,22 +545,18 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, { int err; - rtnl_lock(); - if (!cb->args[0]) { err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, genl_family_attrbuf(&nl80211_fam), nl80211_fam.maxattr, nl80211_policy); if (err) - goto out_unlock; + return err; *wdev = __cfg80211_wdev_from_attrs( sock_net(skb->sk), genl_family_attrbuf(&nl80211_fam)); - if (IS_ERR(*wdev)) { - err = PTR_ERR(*wdev); - goto out_unlock; - } + if (IS_ERR(*wdev)) + return PTR_ERR(*wdev); *rdev = wiphy_to_rdev((*wdev)->wiphy); /* 0 is the first index - add 1 to parse only once */ cb->args[0] = (*rdev)->wiphy_idx + 1; @@ -570,10 +566,8 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); struct wireless_dev *tmp; - if (!wiphy) { - err = -ENODEV; - goto out_unlock; - } + if (!wiphy) + return -ENODEV; *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; @@ -584,21 +578,11 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, } } - if (!*wdev) { - err = -ENODEV; - goto out_unlock; - } + if (!*wdev) + return -ENODEV; } return 0; - out_unlock: - rtnl_unlock(); - return err; -} - -static void nl80211_finish_wdev_dump(struct cfg80211_registered_device *rdev) -{ - rtnl_unlock(); } /* IE validation */ @@ -2608,17 +2592,17 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * int filter_wiphy = -1; struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; + int ret; rtnl_lock(); if (!cb->args[2]) { struct nl80211_dump_wiphy_state state = { .filter_wiphy = -1, }; - int ret; ret = nl80211_dump_wiphy_parse(skb, cb, &state); if (ret) - return ret; + goto out_unlock; filter_wiphy = state.filter_wiphy; @@ -2663,12 +2647,14 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * wp_idx++; } out: - rtnl_unlock(); - cb->args[0] = wp_idx; cb->args[1] = if_idx; - return skb->len; + ret = skb->len; + out_unlock: + rtnl_unlock(); + + return ret; } static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) @@ -4452,9 +4438,10 @@ static int nl80211_dump_station(struct sk_buff *skb, int sta_idx = cb->args[2]; int err; + rtnl_lock(); err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (err) - return err; + goto out_err; if (!wdev->netdev) { err = -EINVAL; @@ -4489,7 +4476,7 @@ static int nl80211_dump_station(struct sk_buff *skb, cb->args[2] = sta_idx; err = skb->len; out_err: - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return err; } @@ -5275,9 +5262,10 @@ static int nl80211_dump_mpath(struct sk_buff *skb, int path_idx = cb->args[2]; int err; + rtnl_lock(); err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (err) - return err; + goto out_err; if (!rdev->ops->dump_mpath) { err = -EOPNOTSUPP; @@ -5310,7 +5298,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return err; } @@ -5470,9 +5458,10 @@ static int nl80211_dump_mpp(struct sk_buff *skb, int path_idx = cb->args[2]; int err; + rtnl_lock(); err = nl80211_prepare_wdev_dump(skb, cb, 
&rdev, &wdev); if (err) - return err; + goto out_err; if (!rdev->ops->dump_mpp) { err = -EOPNOTSUPP; @@ -5505,7 +5494,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return err; } @@ -7674,9 +7663,12 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) int start = cb->args[2], idx = 0; int err; + rtnl_lock(); err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); - if (err) + if (err) { + rtnl_unlock(); return err; + } wdev_lock(wdev); spin_lock_bh(&rdev->bss_lock); @@ -7699,7 +7691,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) wdev_unlock(wdev); cb->args[2] = idx; - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return skb->len; } @@ -7784,9 +7776,10 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) int res; bool radio_stats; + rtnl_lock(); res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (res) - return res; + goto out_err; /* prepare_wdev_dump parsed the attributes */ radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS]; @@ -7827,7 +7820,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) cb->args[2] = survey_idx; res = skb->len; out_err: - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return res; } @@ -11508,17 +11501,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, void *data = NULL; unsigned int data_len = 0; - rtnl_lock(); - if (cb->args[0]) { /* subtract the 1 again here */ struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); struct wireless_dev *tmp; - if (!wiphy) { - err = -ENODEV; - goto out_unlock; - } + if (!wiphy) + return -ENODEV; *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; @@ -11538,23 +11527,19 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, attrbuf, nl80211_fam.maxattr, nl80211_policy); if (err) - goto out_unlock; + return err; if (!attrbuf[NL80211_ATTR_VENDOR_ID] || - !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) { - err = -EINVAL; - goto out_unlock; - } + !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) + return -EINVAL; *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf); if (IS_ERR(*wdev)) *wdev = NULL; *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf); - if (IS_ERR(*rdev)) { - err = PTR_ERR(*rdev); - goto out_unlock; - } + if (IS_ERR(*rdev)) + return PTR_ERR(*rdev); vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]); subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]); @@ -11567,19 +11552,15 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd) continue; - if (!vcmd->dumpit) { - err = -EOPNOTSUPP; - goto out_unlock; - } + if (!vcmd->dumpit) + return -EOPNOTSUPP; vcmd_idx = i; break; } - if (vcmd_idx < 0) { - err = -EOPNOTSUPP; - goto out_unlock; - } + if (vcmd_idx < 0) + return -EOPNOTSUPP; if (attrbuf[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]); @@ -11596,9 +11577,6 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, /* keep rtnl locked in successful case */ return 0; - out_unlock: - rtnl_unlock(); - return err; } static int nl80211_vendor_cmd_dump(struct sk_buff *skb, @@ -11613,9 +11591,10 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb, int err; struct nlattr *vendor_data; + rtnl_lock(); err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev); if (err) - return err; + goto 
out; vcmd_idx = cb->args[2]; data = (void *)cb->args[3]; @@ -11624,15 +11603,21 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb, if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV | WIPHY_VENDOR_CMD_NEED_NETDEV)) { - if (!wdev) - return -EINVAL; + if (!wdev) { + err = -EINVAL; + goto out; + } if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV && - !wdev->netdev) - return -EINVAL; + !wdev->netdev) { + err = -EINVAL; + goto out; + } if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) { - if (!wdev_running(wdev)) - return -ENETDOWN; + if (!wdev_running(wdev)) { + err = -ENETDOWN; + goto out; + } } } From f717629c7f834ab2efa05c7dbf0826f1d7c32ade Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Thu, 16 Mar 2017 14:37:11 +0530 Subject: [PATCH 136/297] powerpc: Wire up statx() syscall Test runs on a ppc64 BE guest succeeded. linux/samples/statx/test-statx program was executed on the following file types, 1. Regular file 2. Directory 3. device file 4. symlink 5. Named pipe The test run also included invoking test-statx with the runtime options provided in the main() function of test-statx.c Signed-off-by: Chandan Rajendra Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/systbl.h | 1 + arch/powerpc/include/asm/unistd.h | 2 +- arch/powerpc/include/uapi/asm/unistd.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 4b369d83fe9c..1c9470881c4a 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -387,3 +387,4 @@ SYSCALL(copy_file_range) COMPAT_SYS_SPU(preadv2) COMPAT_SYS_SPU(pwritev2) SYSCALL(kexec_file_load) +SYSCALL(statx) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index eb1acee91a20..9ba11dbcaca9 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -12,7 +12,7 @@ #include -#define NR_syscalls 383 +#define NR_syscalls 384 #define __NR__exit __NR_exit diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h index 2f26335a3c42..b85f14228857 100644 --- a/arch/powerpc/include/uapi/asm/unistd.h +++ b/arch/powerpc/include/uapi/asm/unistd.h @@ -393,5 +393,6 @@ #define __NR_preadv2 380 #define __NR_pwritev2 381 #define __NR_kexec_file_load 382 +#define __NR_statx 383 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */ From e552a8389aa409e257b7dcba74f67f128f979ccc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Mar 2017 13:47:48 +0100 Subject: [PATCH 137/297] perf/core: Fix use-after-free in perf_release() Dmitry reported syzcaller tripped a use-after-free in perf_release(). After much puzzlement Oleg spotted the below scenario:

  Task1                             Task2

  fork()
    perf_event_init_task()
    /* ... */
    goto bad_fork_$foo;
    /* ... */
    perf_event_free_task()
      mutex_lock(ctx->lock)
      perf_free_event(B)

                                    perf_event_release_kernel(A)
                                      mutex_lock(A->child_mutex)
                                      list_for_each_entry(child, ...) {
                                        /* child == B */
                                        ctx = B->ctx;
                                        get_ctx(ctx);
                                        mutex_unlock(A->child_mutex);

        mutex_lock(A->child_mutex)
        list_del_init(B->child_list)
        mutex_unlock(A->child_mutex)

        /* ... */

      mutex_unlock(ctx->lock);
      put_ctx() /* >0 */
    free_task();

                                        mutex_lock(ctx->lock);
                                        mutex_lock(A->child_mutex);
                                        /* ... */
                                        mutex_unlock(A->child_mutex);
                                        mutex_unlock(ctx->lock)
                                        put_ctx() /* 0 */
                                        ctx->task && !TOMBSTONE
                                          put_task_struct() /* UAF */

This patch closes the hole by making perf_event_free_task() destroy the task <-> ctx relation such that perf_event_release_kernel() will no longer observe the now dead task.
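The fix depends on ordering: the context must be marked dead under ctx->lock before the task can be freed, so that a concurrent releaser observes the tombstone rather than a dangling task pointer. Below is a minimal userspace model of that handshake, not kernel code; the structures, the TASK_TOMBSTONE value and the refcounting are simplified stand-ins.

#include <pthread.h>
#include <stdio.h>

#define TASK_TOMBSTONE ((struct task *)-1L)

struct task { int refcount; };

struct ctx {
	pthread_mutex_t lock;
	struct task *task;
};

/* fork-failure path: sever the task <-> ctx relation under the lock,
 * then drop the context's task reference (it cannot be the last one). */
static void free_task_path(struct ctx *c, struct task *t)
{
	pthread_mutex_lock(&c->lock);
	c->task = TASK_TOMBSTONE;
	t->refcount--;			/* put_task_struct(); cannot be last */
	pthread_mutex_unlock(&c->lock);
}

/* release path: only dereference the task if the context is still live;
 * without the tombstone check this would be the use-after-free. */
static void release_path(struct ctx *c)
{
	pthread_mutex_lock(&c->lock);
	if (c->task && c->task != TASK_TOMBSTONE)
		c->task->refcount--;
	pthread_mutex_unlock(&c->lock);
}

int main(void)
{
	struct task t = { .refcount = 2 };
	struct ctx c = { .lock = PTHREAD_MUTEX_INITIALIZER, .task = &t };

	free_task_path(&c, &t);	/* fork() failed; the task is about to die */
	release_path(&c);	/* releaser sees the tombstone and backs off */
	printf("refcount=%d, dead task never touched\n", t.refcount);
	return 0;
}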
Spotted-by: Oleg Nesterov Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: fweisbec@gmail.com Cc: oleg@redhat.com Cc: stable@vger.kernel.org Fixes: c6e5b73242d2 ("perf: Synchronously clean up child events") Link: http://lkml.kernel.org/r/20170314155949.GE32474@worktop Link: http://lkml.kernel.org/r/20170316125823.140295131@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1031bdf9f012..4742909c56e6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10415,6 +10415,17 @@ void perf_event_free_task(struct task_struct *task) continue; mutex_lock(&ctx->mutex); + raw_spin_lock_irq(&ctx->lock); + /* + * Destroy the task <-> ctx relation and mark the context dead. + * + * This is important because even though the task hasn't been + * exposed yet the context has been (through child_list). + */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ + raw_spin_unlock_irq(&ctx->lock); again: list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) From e7cc4865f0f31698ef2f7aac01a50e78968985b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Mar 2017 13:47:49 +0100 Subject: [PATCH 138/297] perf/core: Fix event inheritance on fork() While hunting for clues to a use-after-free, Oleg spotted that perf_event_init_context() can lose an error value with the result that fork() can succeed even though we did not fully inherit the perf event context.
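The trap is that break only exits the current loop; a later loop reassigns ret and the failure is forgotten. A reduced sketch of the pattern, with a hypothetical process_one() standing in for inherit_task_group():

#include <stdio.h>

/* Hypothetical stand-in for inherit_task_group(): fails on entry 1 of
 * the first list, succeeds everywhere else. */
static int process_one(int list, int i)
{
	return (list == 0 && i == 1) ? -12 : 0;		/* -ENOMEM */
}

static int init_context_buggy(void)
{
	int ret = 0, i;

	for (i = 0; i < 2; i++) {		/* pinned groups */
		ret = process_one(0, i);
		if (ret)
			break;			/* error parked in ret... */
	}
	for (i = 0; i < 2; i++) {		/* flexible groups */
		ret = process_one(1, i);	/* ...and overwritten with 0 */
		if (ret)
			break;
	}
	return ret;				/* caller believes it worked */
}

static int init_context_fixed(void)
{
	int ret = 0, i;

	for (i = 0; i < 2; i++) {
		ret = process_one(0, i);
		if (ret)
			goto out_unlock;	/* skip the second loop */
	}
	for (i = 0; i < 2; i++) {
		ret = process_one(1, i);
		if (ret)
			goto out_unlock;
	}
out_unlock:
	return ret;
}

int main(void)
{
	printf("buggy: %d (error lost)\nfixed: %d\n",
	       init_context_buggy(), init_context_fixed());
	return 0;
}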
Spotted-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Dmitry Vyukov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: oleg@redhat.com Cc: stable@vger.kernel.org Fixes: 889ff0150661 ("perf/core: Split context's event group list into pinned and non-pinned lists") Link: http://lkml.kernel.org/r/20170316125823.190342547@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4742909c56e6..fc7c9a85944d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10679,7 +10679,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) - break; + goto out_unlock; } /* @@ -10695,7 +10695,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) - break; + goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); @@ -10723,6 +10723,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); +out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); From 15121c789e001168decac6483d192bdb7ea29e74 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Mar 2017 13:47:50 +0100 Subject: [PATCH 139/297] perf/core: Simplify perf_event_free_task() We have ctx->event_list that contains all events; no need to repeatedly iterate the group lists to find them all. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Dmitry Vyukov Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/20170316125823.239678244@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index fc7c9a85944d..5f21e5e09ba4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10426,21 +10426,11 @@ void perf_event_free_task(struct task_struct *task) WRITE_ONCE(ctx->task, TASK_TOMBSTONE); put_task_struct(task); /* cannot be last */ raw_spin_unlock_irq(&ctx->lock); -again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, - group_entry) - perf_free_event(event, ctx); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) perf_free_event(event, ctx); - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; - mutex_unlock(&ctx->mutex); - put_ctx(ctx); } } From d8a8cfc76919b6c830305266b23ba671623f37ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Mar 2017 13:47:51 +0100 Subject: [PATCH 140/297] perf/core: Better explain the inherit magic While going through the event inheritance code Oleg got confused. Add some comments to better explain the silent disappearance of orphaned events.
So what happens is that at perf_event_release_kernel() time, when an event loses its connection to userspace (and ceases to exist from the user's perspective) we can still have an arbitrary amount of inherited copies of the event. We want to synchronously find and remove all these child events. Since that requires a bit of lock juggling, there is the possibility that concurrent clone()s will create new child events. Therefore we first mark the parent event as DEAD, which marks all the extant child events as orphaned. We then avoid copying orphaned events; in order to avoid getting more of them. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Dmitry Vyukov Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/20170316125823.289567442@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5f21e5e09ba4..7298e149b732 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4254,7 +4254,7 @@ int perf_event_release_kernel(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); /* - * Mark this even as STATE_DEAD, there is no external reference to it + * Mark this event as STATE_DEAD, there is no external reference to it * anymore. * * Anybody acquiring event->child_mutex after the below loop _must_ @@ -10468,7 +10468,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) } /* - * inherit a event from parent task to child task: + * Inherit a event from parent task to child task. + * + * Returns: + * - valid pointer on success + * - NULL for orphaned events + * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, @@ -10562,6 +10567,16 @@ inherit_event(struct perf_event *parent_event, return child_event; } +/* + * Inherits an event group. + * + * This will quietly suppress orphaned events; !inherit_event() is not an error. + * This matches with perf_event_release_kernel() removing all child events. + * + * Returns: + * - 0 on success + * - <0 on error + */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, @@ -10576,6 +10591,11 @@ static int inherit_group(struct perf_event *parent_event, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); + /* + * @leader can be NULL here because of is_orphaned_event(). In this + * case inherit_event() will create individual events, similar to what + * perf_group_detach() would do anyway. + */ list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); @@ -10585,6 +10605,17 @@ static int inherit_group(struct perf_event *parent_event, return 0; } +/* + * Creates the child task context and tries to inherit the event-group. + * + * Clears @inherited_all on !attr.inherited or error. Note that we'll leave + * inherited_all set when we 'fail' to inherit an orphaned event; this is + * consistent with perf_event_release_kernel() removing all child events.
+ * + * Returns: + * - 0 on success + * - <0 on error + */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, @@ -10607,7 +10638,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); if (!child_ctx) return -ENOMEM; From 29c8bbbd6e21daa0997d1c3ee886b897ee7ad652 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:43 +0000 Subject: [PATCH 141/297] afs: Fix missing put_page() In afs_writepages_region(), inside the loop where we find dirty pages to deal with, one of the if-statements is missing a put_page(). Signed-off-by: David Howells --- fs/afs/write.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/afs/write.c b/fs/afs/write.c index c83c1a0e851f..e919e64cd4e0 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -513,6 +513,7 @@ static int afs_writepages_region(struct address_space *mapping, if (PageWriteback(page) || !PageDirty(page)) { unlock_page(page); + put_page(page); continue; } From 5611ef280d814042825ee17688f5751266fc538b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:43 +0000 Subject: [PATCH 142/297] afs: Fix page overput in afs_fill_page() afs_fill_page() loads the page it wants to fill into the afs_read request without incrementing its refcount - but then calls afs_put_read() to clean up afterwards, which then releases a ref on the page. Fix this by getting a ref on the page before calling afs_vnode_fetch_data(). This causes sync after a write to hang in afs_writepages_region() because find_get_pages_tag() gets confused and doesn't return. Fixes: 196ee9cd2d04 ("afs: Make afs_fs_fetch_data() take a list of pages") Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne --- fs/afs/write.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/afs/write.c b/fs/afs/write.c index e919e64cd4e0..3ac52f6a96ff 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -101,6 +101,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, req->pos = pos; req->nr_pages = 1; req->pages[0] = page; + get_page(page); i_size = i_size_read(&vnode->vfs_inode); if (pos + PAGE_SIZE > i_size) From 6186f0788b31f44affceeedc7b48eb10faea120d Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 16 Mar 2017 16:27:43 +0000 Subject: [PATCH 143/297] afs: Populate group ID from vnode status The group was hard coded to GLOBAL_ROOT_GID; use the group ID that was received from the server. Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1e4897a048d2..299dbaeb2e2a 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -70,7 +70,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; - inode->i_gid = GLOBAL_ROOT_GID; + inode->i_gid = vnode->status.group; inode->i_size = vnode->status.size; inode->i_ctime.tv_sec = vnode->status.mtime_server; inode->i_ctime.tv_nsec = 0; From 627f46943ff90bcc32ddeb675d881c043c6fa2ae Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: [PATCH 144/297] afs: Adjust mode bits processing Mode bits for an afs file should not be enforced in the usual way. 
For files, the absence of user bits can restrict file access with respect to what is granted by the server. These bits apply regardless of the owner or the current uid; the rest of the mode bits (group, other) are ignored. Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/security.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/afs/security.c b/fs/afs/security.c index 8d010422dc89..bfa9d3428383 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -340,17 +340,22 @@ int afs_permission(struct inode *inode, int mask) } else { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; + if ((mask & MAY_EXEC) && !(inode->i_mode & S_IXUSR)) + goto permission_denied; if (mask & (MAY_EXEC | MAY_READ)) { if (!(access & AFS_ACE_READ)) goto permission_denied; + if (!(inode->i_mode & S_IRUSR)) + goto permission_denied; } else if (mask & MAY_WRITE) { if (!(access & AFS_ACE_WRITE)) goto permission_denied; + if (!(inode->i_mode & S_IWUSR)) + goto permission_denied; } } key_put(key); - ret = generic_permission(inode, mask); _leave(" = %d", ret); return ret; From bcd89270d93b7edebb5de5e5e7dca1a77a33496e Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: [PATCH 145/297] afs: Deal with an empty callback array Servers may send a callback array that is the same size as the FID array, or an empty array. If the callback count is 0, the code would attempt to read (fid_count * 12) bytes of data, which would fail and result in an unmarshalling error. This would lead to stale data for remotely modified files or directories. Store the callback array size in the internal afs_call structure and use that to determine the amount of data to read. Signed-off-by: Marc Dionne --- fs/afs/cmservice.c | 11 +++++------ fs/afs/internal.h | 5 ++++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 2edbdcbf6432..3062cceb5c2a 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -187,7 +187,6 @@ static int afs_deliver_cb_callback(struct afs_call *call) struct afs_callback *cb; struct afs_server *server; __be32 *bp; - u32 tmp; int ret, loop; _enter("{%u}", call->unmarshall); @@ -249,9 +248,9 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (ret < 0) return ret; - tmp = ntohl(call->tmp); - _debug("CB count: %u", tmp); - if (tmp != call->count && tmp != 0) + call->count2 = ntohl(call->tmp); + _debug("CB count: %u", call->count2); + if (call->count2 != call->count && call->count2 != 0) return -EBADMSG; call->offset = 0; call->unmarshall++; @@ -259,14 +258,14 @@ static int afs_deliver_cb_callback(struct afs_call *call) case 4: _debug("extract CB array"); ret = afs_extract_data(call, call->buffer, - call->count * 3 * 4, false); + call->count2 * 3 * 4, false); if (ret < 0) return ret; _debug("unmarshall CB array"); cb = call->request; bp = call->buffer; - for (loop = call->count; loop > 0; loop--, cb++) { + for (loop = call->count2; loop > 0; loop--, cb++) { cb->version = ntohl(*bp++); cb->expiry = ntohl(*bp++); cb->type = ntohl(*bp++); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5dfa56903a2d..8499870147ef 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -90,7 +90,10 @@ struct afs_call { unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned first_offset; /* offset into mapping[first] */ - unsigned last_to; /* amount of mapping[last] */ + union { + unsigned last_to; /* amount of mapping[last] */ + 
unsigned count2; /* count used in unmarshalling */ + }; unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ From 6db3ac3c4bc552837d232ec794559a2fae2815a0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: [PATCH 146/297] afs: Handle better the server returning excess or short data When an AFS server is given an FS.FetchData{,64} request to read data from a file, it is permitted by the protocol to return more or less than was requested. kafs currently relies on the latter behaviour in readpage{,s} to handle a partial page at the end of the file (we just ask for a whole page and clear space beyond the short read). However, we don't handle all cases. Add: (1) Handle excess data by discarding it rather than aborting. Note that we use a common static buffer to discard into so that the decryption algorithm advances the PCBC state. (2) Handle a short read that affects more than just the last page. Note that if a read comes up unexpectedly short or long, it's possible that the server's copy of the file changed - in which case the data version number will have been incremented and the callback will have been broken - in which case all the pages currently attached to the inode will be zapped anyway at some point. Signed-off-by: David Howells --- fs/afs/file.c | 7 +++++-- fs/afs/fsclient.c | 49 +++++++++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index ba7b71fba34b..a38e1c30d110 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -184,10 +184,13 @@ int afs_page_filler(void *data, struct page *page) if (!req) goto enomem; + /* We request a full page. If the page is a partial one at the + * end of the file, the server will return a short read and the + * unmarshalling code will clear the unfilled space. + */ atomic_set(&req->usage, 1); req->pos = (loff_t)page->index << PAGE_SHIFT; - req->len = min_t(size_t, i_size_read(inode) - req->pos, - PAGE_SIZE); + req->len = PAGE_SIZE; req->nr_pages = 1; req->pages[0] = page; get_page(page); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index ac8e766978dc..bf8904a1a58f 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -16,6 +16,12 @@ #include "internal.h" #include "afs_fs.h" +/* + * We need somewhere to discard into in case the server helpfully returns more + * than we asked for in FS.FetchData{,64}. + */ +static u8 afs_discard_buffer[64]; + /* * decode an AFSFid block */ @@ -353,12 +359,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->actual_len |= ntohl(call->tmp); _debug("DATA length: %llu", req->actual_len); - /* Check that the server didn't want to send us extra. We - * might want to just discard instead, but that requires - * cooperation from AF_RXRPC.
- */ - if (req->actual_len > req->len) - return -EBADMSG; req->remain = req->actual_len; call->offset = req->pos & (PAGE_SIZE - 1); @@ -368,6 +368,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->unmarshall++; begin_page: + ASSERTCMP(req->index, <, req->nr_pages); if (req->remain > PAGE_SIZE - call->offset) size = PAGE_SIZE - call->offset; else @@ -390,18 +391,37 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (req->page_done) req->page_done(call, req); if (req->remain > 0) { - req->index++; call->offset = 0; + req->index++; + if (req->index >= req->nr_pages) + goto begin_discard; goto begin_page; } } + goto no_more_data; + + /* Discard any excess data the server gave us */ + begin_discard: + case 4: + size = min_t(size_t, sizeof(afs_discard_buffer), req->remain); + call->count = size; + _debug("extract discard %u/%llu %zu/%u", + req->remain, req->actual_len, call->offset, call->count); + + call->offset = 0; + ret = afs_extract_data(call, afs_discard_buffer, call->count, true); + req->remain -= call->offset; + if (ret < 0) + return ret; + if (req->remain > 0) + goto begin_discard; no_more_data: call->offset = 0; - call->unmarshall++; + call->unmarshall = 5; /* extract the metadata */ - case 4: + case 5: ret = afs_extract_data(call, call->buffer, (21 + 3 + 6) * 4, false); if (ret < 0) @@ -416,16 +436,17 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->offset = 0; call->unmarshall++; - case 5: + case 6: break; } - if (call->count < PAGE_SIZE) { - buffer = kmap(req->pages[req->index]); - memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap(req->pages[req->index]); + for (; req->index < req->nr_pages; req->index++) { + if (call->count < PAGE_SIZE) + zero_user_segment(req->pages[req->index], + call->count, PAGE_SIZE); if (req->page_done) req->page_done(call, req); + call->count = 0; } _leave(" = 0 [done]"); From 3448e6521755862446aed28e29abf12565d8844e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: [PATCH 147/297] afs: Kill struct afs_read::pg_offset Kill struct afs_read::pg_offset as nothing uses it. It's unnecessary as pos can be masked off. Signed-off-by: David Howells --- fs/afs/internal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 8499870147ef..7784a8bc375c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -135,7 +135,6 @@ struct afs_read { atomic_t usage; unsigned int remain; /* Amount remaining */ unsigned int index; /* Which page we're reading into */ - unsigned int pg_offset; /* Offset in page we're at */ unsigned int nr_pages; void (*page_done)(struct afs_call *, struct afs_read *); struct page *pages[]; From e8e581a88c5f5fc7cf1f636d122b77fbcfc8c2f6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: [PATCH 148/297] afs: Handle a short write to an AFS page Handle the situation where afs_write_begin() is told to expect that a full-page write will be made, but this doesn't happen (EFAULT, CTRL-C, etc.), and so afs_write_end() sees that a partial write took place. Currently, no attempt is made to deal with the discrepancy. Fix this by loading the gap from the server.
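The repair only needs to read the bytes the copy missed before the page can be marked uptodate. A userspace sketch of that backfill, with a hypothetical fetch_from_server() standing in for afs_fill_page():

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Hypothetical stand-in for afs_fill_page(): read [pos, pos+len) of the
 * server's copy into the page buffer at the matching offset. */
static int fetch_from_server(const char *server, char *page,
			     size_t pos, size_t len)
{
	memcpy(page + pos, server + pos, len);
	return 0;
}

/* Sketch of the afs_write_end() logic: the caller promised to write
 * 'len' bytes at 'pos' but only 'copied' arrived (EFAULT, ^C, ...). */
static int write_end(const char *server, char *page, int *uptodate,
		     size_t pos, size_t len, size_t copied)
{
	if (!*uptodate) {
		if (copied < len) {
			/* load the gap the short copy left behind */
			int ret = fetch_from_server(server, page,
						    pos + copied,
						    len - copied);
			if (ret < 0)
				return ret;
		}
		*uptodate = 1;		/* SetPageUptodate() */
	}
	return 0;
}

int main(void)
{
	char server[PAGE_SIZE] = "old contents from the server";
	char page[PAGE_SIZE] = { 0 };
	int uptodate = 0;

	memcpy(page, "new", 3);		/* only 3 of 8 bytes were copied */
	write_end(server, page, &uptodate, 0, 8, 3);
	printf("page: %.28s (uptodate=%d)\n", page, uptodate);
	return 0;
}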
Reported-by: Al Viro Signed-off-by: David Howells --- fs/afs/fsclient.c | 4 +++- fs/afs/internal.h | 2 +- fs/afs/write.c | 28 +++++++++++++++++++--------- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index bf8904a1a58f..6f917dd1238c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -393,8 +393,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (req->remain > 0) { call->offset = 0; req->index++; - if (req->index >= req->nr_pages) + if (req->index >= req->nr_pages) { + call->unmarshall = 4; goto begin_discard; + } goto begin_page; } } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7784a8bc375c..dc2cb486e127 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -130,7 +130,7 @@ struct afs_call_type { */ struct afs_read { loff_t pos; /* Where to start reading */ - loff_t len; /* How much to read */ + loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ atomic_t usage; unsigned int remain; /* Amount remaining */ diff --git a/fs/afs/write.c b/fs/afs/write.c index 3ac52f6a96ff..ea66890fc188 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,10 +84,9 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, struct page *page) + loff_t pos, unsigned int len, struct page *page) { struct afs_read *req; - loff_t i_size; int ret; _enter(",,%llu", (unsigned long long)pos); @@ -99,16 +98,11 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, atomic_set(&req->usage, 1); req->pos = pos; + req->len = len; req->nr_pages = 1; req->pages[0] = page; get_page(page); - i_size = i_size_read(&vnode->vfs_inode); - if (pos + PAGE_SIZE > i_size) - req->len = i_size - pos; - else - req->len = PAGE_SIZE; - ret = afs_vnode_fetch_data(vnode, key, req); afs_put_read(req); if (ret < 0) { @@ -164,7 +158,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, /* page won't leak in error case: it eventually gets cleaned off LRU */ if (!PageUptodate(page) && len != PAGE_SIZE) { - ret = afs_fill_page(vnode, key, index << PAGE_SHIFT, page); + ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); @@ -258,7 +252,9 @@ int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct key *key = file->private_data; loff_t i_size, maybe_i_size; + int ret; _enter("{%x:%u},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); @@ -274,6 +270,20 @@ int afs_write_end(struct file *file, struct address_space *mapping, spin_unlock(&vnode->writeback_lock); } + if (!PageUptodate(page)) { + if (copied < len) { + /* Try and load any missing data from the server. The + * unmarshalling routine will take care of clearing any + * bits that are beyond the EOF. + */ + ret = afs_fill_page(vnode, key, pos + copied, + len - copied, page); + if (ret < 0) + return ret; + } + SetPageUptodate(page); + } + set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); From 58fed94dfb17e89556b5705f20f90e5b2971b6a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:45 +0000 Subject: [PATCH 149/297] afs: Flush outstanding writes when an fd is closed Flush outstanding writes in afs when an fd is closed. This is what NFS and CIFS do. 
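In userspace terms the rule is simply: if the descriptor was open for writing, make the data durable before close() returns. A sketch of that policy, not the kernel .flush hook itself:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Mirror of the afs_flush() rule: only descriptors opened for writing
 * need their dirty data pushed out on close. */
static int flush_then_close(int fd)
{
	int flags = fcntl(fd, F_GETFL);

	if (flags != -1 && (flags & O_ACCMODE) != O_RDONLY) {
		if (fsync(fd) == -1) {
			perror("fsync");
			close(fd);
			return -1;
		}
	}
	return close(fd);
}

int main(void)
{
	int fd = open("/tmp/afs-flush-demo", O_CREAT | O_WRONLY, 0644);

	if (fd == -1) {
		perror("open");
		return 1;
	}
	if (write(fd, "dirty data\n", 11) != 11)
		perror("write");
	return flush_then_close(fd) ? 1 : 0;
}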
Reported-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/file.c | 1 + fs/afs/internal.h | 1 + fs/afs/write.c | 14 ++++++++++++++ 3 files changed, 16 insertions(+) diff --git a/fs/afs/file.c b/fs/afs/file.c index a38e1c30d110..b5829443ff69 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -30,6 +30,7 @@ static int afs_readpages(struct file *filp, struct address_space *mapping, const struct file_operations afs_file_operations = { .open = afs_open, + .flush = afs_flush, .release = afs_release, .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index dc2cb486e127..af1d91ec7f2c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -720,6 +720,7 @@ extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_writeback_all(struct afs_vnode *); +extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); diff --git a/fs/afs/write.c b/fs/afs/write.c index ea66890fc188..f1450ea09406 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -757,6 +757,20 @@ out: return ret; } +/* + * Flush out all outstanding writes on a file opened for writing when it is + * closed. + */ +int afs_flush(struct file *file, fl_owner_t id) +{ + _enter(""); + + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + return vfs_fsync(file, 0); +} + /* * notification that a previously read-only page is about to become writable * - if it returns an error, the caller will deliver a bus error signal From 944c74f472f926785b1948efa0e73e2f1b3b539b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:45 +0000 Subject: [PATCH 150/297] afs: Distinguish mountpoints from symlinks by file mode alone In AFS, mountpoints appear as symlinks with mode 0644 and normal symlinks have mode 0777, so use this to distinguish them rather than reading the content and parsing it. In the case of a mountpoint, the symlink body is a formatted string indicating the location of the target volume. Note that with this, kAFS no longer 'pre-fetches' the contents of symlinks, so afs_readpage() may fail with an access-denial because when the VFS calls d_automount(), it wraps the call in an credentials override that sets the initial creds - thereby preventing access to the caller's keyrings and the authentication keys held therein. To this end, a patch reverting that change to the VFS is required also. Reported-by: Jeffrey Altman Signed-off-by: David Howells --- fs/afs/inode.c | 29 +++++++++++++------------- fs/afs/internal.h | 1 - fs/afs/mntpt.c | 53 ----------------------------------------------- 3 files changed, 15 insertions(+), 68 deletions(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 299dbaeb2e2a..ade6ec3873cf 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -54,8 +54,21 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_fop = &afs_dir_file_operations; break; case AFS_FTYPE_SYMLINK: - inode->i_mode = S_IFLNK | vnode->status.mode; - inode->i_op = &page_symlink_inode_operations; + /* Symlinks with a mode of 0644 are actually mountpoints. 
*/ + if ((vnode->status.mode & 0777) == 0644) { + inode->i_flags |= S_AUTOMOUNT; + + spin_lock(&vnode->lock); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + spin_unlock(&vnode->lock); + + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_mntpt_inode_operations; + inode->i_fop = &afs_mntpt_file_operations; + } else { + inode->i_mode = S_IFLNK | vnode->status.mode; + inode->i_op = &page_symlink_inode_operations; + } inode_nohighmem(inode); break; default: @@ -79,18 +92,6 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_generation = vnode->fid.unique; inode->i_version = vnode->status.data_version; inode->i_mapping->a_ops = &afs_fs_aops; - - /* check to see whether a symbolic link is really a mountpoint */ - if (vnode->status.type == AFS_FTYPE_SYMLINK) { - afs_mntpt_check_symlink(vnode, key); - - if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) { - inode->i_mode = S_IFDIR | vnode->status.mode; - inode->i_op = &afs_mntpt_inode_operations; - inode->i_fop = &afs_mntpt_file_operations; - } - } - return 0; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index af1d91ec7f2c..39de154fb42e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -559,7 +559,6 @@ extern const struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; extern struct vfsmount *afs_d_automount(struct path *); -extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); extern void afs_mntpt_kill_timer(void); /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index d4fb0afc0097..bd3b65cde282 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -46,59 +46,6 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); static unsigned long afs_mntpt_expiry_timeout = 10 * 60; -/* - * check a symbolic link to see whether it actually encodes a mountpoint - * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately - */ -int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) -{ - struct page *page; - size_t size; - char *buf; - int ret; - - _enter("{%x:%u,%u}", - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - - /* read the contents of the symlink into the pagecache */ - page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, - afs_page_filler, key); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto out; - } - - ret = -EIO; - if (PageError(page)) - goto out_free; - - buf = kmap(page); - - /* examine the symlink's contents */ - size = vnode->status.size; - _debug("symlink to %*.*s", (int) size, (int) size, buf); - - if (size > 2 && - (buf[0] == '%' || buf[0] == '#') && - buf[size - 1] == '.' - ) { - _debug("symlink is a mountpoint"); - spin_lock(&vnode->lock); - set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - vnode->vfs_inode.i_flags |= S_AUTOMOUNT; - spin_unlock(&vnode->lock); - } - - ret = 0; - - kunmap(page); -out_free: - put_page(page); -out: - _leave(" = %d", ret); - return ret; -} - /* * no valid lookup procedure on this sort of dir */ From 1d7e4ebf291d3103466006926329e39aa355944f Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Thu, 16 Mar 2017 16:27:45 +0000 Subject: [PATCH 151/297] afs: inode: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. 
The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: David Howells --- fs/afs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index ade6ec3873cf..e083e086b7ca 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -445,7 +445,7 @@ void afs_evict_inode(struct inode *inode) mutex_lock(&vnode->permits_lock); permits = vnode->permits; - rcu_assign_pointer(vnode->permits, NULL); + RCU_INIT_POINTER(vnode->permits, NULL); mutex_unlock(&vnode->permits_lock); if (permits) call_rcu(&permits->rcu, afs_zap_permits); From df8a09d1b8f9e693ec3f6b7e0162fc817f2cf0db Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Thu, 16 Mar 2017 16:27:45 +0000 Subject: [PATCH 152/297] afs: security: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: David Howells --- fs/afs/security.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/security.c b/fs/afs/security.c index bfa9d3428383..ecb86a670180 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -114,7 +114,7 @@ void afs_clear_permits(struct afs_vnode *vnode) mutex_lock(&vnode->permits_lock); permits = vnode->permits; - rcu_assign_pointer(vnode->permits, NULL); + RCU_INIT_POINTER(vnode->permits, NULL); mutex_unlock(&vnode->permits_lock); if (permits) From 8a79790bf0b7da216627ffb85f52cfb4adbf1e4e Mon Sep 17 00:00:00 2001 From: Tina Ruchandani Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: [PATCH 153/297] afs: Migrate vlocation fields to 64-bit get_seconds() returns real wall-clock seconds. On 32-bit systems this value will overflow in year 2038 and beyond. This patch changes afs's vlocation record to use ktime_get_real_seconds() instead, for the fields time_of_death and update_at. 
Signed-off-by: Tina Ruchandani Signed-off-by: David Howells --- fs/afs/callback.c | 7 ++++--- fs/afs/internal.h | 7 ++++--- fs/afs/server.c | 6 +++--- fs/afs/vlocation.c | 16 +++++++++------- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/afs/callback.c b/fs/afs/callback.c index b29447e03ede..25d404d22cae 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -362,7 +362,7 @@ static void afs_callback_updater(struct work_struct *work) { struct afs_server *server; struct afs_vnode *vnode, *xvnode; - time_t now; + time64_t now; long timeout; int ret; @@ -370,7 +370,7 @@ static void afs_callback_updater(struct work_struct *work) _enter(""); - now = get_seconds(); + now = ktime_get_real_seconds(); /* find the first vnode to update */ spin_lock(&server->cb_lock); @@ -424,7 +424,8 @@ static void afs_callback_updater(struct work_struct *work) /* and then reschedule */ _debug("reschedule"); - vnode->update_at = get_seconds() + afs_vnode_update_timeout; + vnode->update_at = ktime_get_real_seconds() + + afs_vnode_update_timeout; spin_lock(&server->cb_lock); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 39de154fb42e..97a16ce200be 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -249,7 +250,7 @@ struct afs_cache_vhash { */ struct afs_vlocation { atomic_t usage; - time_t time_of_death; /* time at which put reduced usage to 0 */ + time64_t time_of_death; /* time at which put reduced usage to 0 */ struct list_head link; /* link in cell volume location list */ struct list_head grave; /* link in master graveyard list */ struct list_head update; /* link in master update list */ @@ -260,7 +261,7 @@ struct afs_vlocation { struct afs_cache_vlocation vldb; /* volume information DB record */ struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ wait_queue_head_t waitq; /* status change waitqueue */ - time_t update_at; /* time at which record should be updated */ + time64_t update_at; /* time at which record should be updated */ spinlock_t lock; /* access lock */ afs_vlocation_state_t state; /* volume location state */ unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */ @@ -273,7 +274,7 @@ struct afs_vlocation { */ struct afs_server { atomic_t usage; - time_t time_of_death; /* time at which put reduced usage to 0 */ + time64_t time_of_death; /* time at which put reduced usage to 0 */ struct in_addr addr; /* server address */ struct afs_cell *cell; /* cell in which server resides */ struct list_head link; /* link in cell's server list */ diff --git a/fs/afs/server.c b/fs/afs/server.c index d4066ab7dd55..c001b1f2455f 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -242,7 +242,7 @@ void afs_put_server(struct afs_server *server) spin_lock(&afs_server_graveyard_lock); if (atomic_read(&server->usage) == 0) { list_move_tail(&server->grave, &afs_server_graveyard); - server->time_of_death = get_seconds(); + server->time_of_death = ktime_get_real_seconds(); queue_delayed_work(afs_wq, &afs_server_reaper, afs_server_timeout * HZ); } @@ -277,9 +277,9 @@ static void afs_reap_server(struct work_struct *work) LIST_HEAD(corpses); struct afs_server *server; unsigned long delay, expiry; - time_t now; + time64_t now; - now = get_seconds(); + now = ktime_get_real_seconds(); spin_lock(&afs_server_graveyard_lock); while (!list_empty(&afs_server_graveyard)) { diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index d7d8dd8c0b31..37b7c3b342a6 100644 --- a/fs/afs/vlocation.c +++ 
b/fs/afs/vlocation.c @@ -340,7 +340,8 @@ static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl) struct afs_vlocation *xvl; /* wait at least 10 minutes before updating... */ - vl->update_at = get_seconds() + afs_vlocation_update_timeout; + vl->update_at = ktime_get_real_seconds() + + afs_vlocation_update_timeout; spin_lock(&afs_vlocation_updates_lock); @@ -506,7 +507,7 @@ void afs_put_vlocation(struct afs_vlocation *vl) if (atomic_read(&vl->usage) == 0) { _debug("buried"); list_move_tail(&vl->grave, &afs_vlocation_graveyard); - vl->time_of_death = get_seconds(); + vl->time_of_death = ktime_get_real_seconds(); queue_delayed_work(afs_wq, &afs_vlocation_reap, afs_vlocation_timeout * HZ); @@ -543,11 +544,11 @@ static void afs_vlocation_reaper(struct work_struct *work) LIST_HEAD(corpses); struct afs_vlocation *vl; unsigned long delay, expiry; - time_t now; + time64_t now; _enter(""); - now = get_seconds(); + now = ktime_get_real_seconds(); spin_lock(&afs_vlocation_graveyard_lock); while (!list_empty(&afs_vlocation_graveyard)) { @@ -622,13 +623,13 @@ static void afs_vlocation_updater(struct work_struct *work) { struct afs_cache_vlocation vldb; struct afs_vlocation *vl, *xvl; - time_t now; + time64_t now; long timeout; int ret; _enter(""); - now = get_seconds(); + now = ktime_get_real_seconds(); /* find a record to update */ spin_lock(&afs_vlocation_updates_lock); @@ -684,7 +685,8 @@ static void afs_vlocation_updater(struct work_struct *work) /* and then reschedule */ _debug("reschedule"); - vl->update_at = get_seconds() + afs_vlocation_update_timeout; + vl->update_at = ktime_get_real_seconds() + + afs_vlocation_update_timeout; spin_lock(&afs_vlocation_updates_lock); From 56e714312e7dbd6bb83b2f78d3ec19a404c7649f Mon Sep 17 00:00:00 2001 From: Tina Ruchandani Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: [PATCH 154/297] afs: Prevent callback expiry timer overflow get_seconds() returns real wall-clock seconds. On 32-bit systems this value will overflow in year 2038 and beyond. This patch changes afs_vnode record to use ktime_get_real_seconds() instead, for the fields cb_expires and cb_expires_at. 
Signed-off-by: Tina Ruchandani Signed-off-by: David Howells --- fs/afs/fsclient.c | 2 +- fs/afs/inode.c | 7 ++++--- fs/afs/internal.h | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 6f917dd1238c..c05452a09398 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -145,7 +145,7 @@ static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode) vnode->cb_version = ntohl(*bp++); vnode->cb_expiry = ntohl(*bp++); vnode->cb_type = ntohl(*bp++); - vnode->cb_expires = vnode->cb_expiry + get_seconds(); + vnode->cb_expires = vnode->cb_expiry + ktime_get_real_seconds(); *_bp = bp; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e083e086b7ca..4079c832ff27 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -246,12 +246,13 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, vnode->cb_version = 0; vnode->cb_expiry = 0; vnode->cb_type = 0; - vnode->cb_expires = get_seconds(); + vnode->cb_expires = ktime_get_real_seconds(); } else { vnode->cb_version = cb->version; vnode->cb_expiry = cb->expiry; vnode->cb_type = cb->type; - vnode->cb_expires = vnode->cb_expiry + get_seconds(); + vnode->cb_expires = vnode->cb_expiry + + ktime_get_real_seconds(); } } @@ -324,7 +325,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) && !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { - if (vnode->cb_expires < get_seconds() + 10) { + if (vnode->cb_expires < ktime_get_real_seconds() + 10) { _debug("callback expired"); set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); } else { diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 97a16ce200be..832555003d03 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -377,8 +377,8 @@ struct afs_vnode { struct rb_node server_rb; /* link in server->fs_vnodes */ struct rb_node cb_promise; /* link in server->cb_promises */ struct work_struct cb_broken_work; /* work to be done on callback break */ - time_t cb_expires; /* time at which callback expires */ - time_t cb_expires_at; /* time used to order cb_promise */ + time64_t cb_expires; /* time at which callback expires */ + time64_t cb_expires_at; /* time used to order cb_promise */ unsigned cb_version; /* callback version */ unsigned cb_expiry; /* callback expiry time */ afs_callback_type_t cb_type; /* type of callback */ From 29f069853287dcb46eaf45a50dbf1232c1444ac6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: [PATCH 155/297] afs: Fix AFS read bug Fix a bug in AFS read whereby the request page afs_read::index isn't incremented after calling ->page_done() if ->remain reaches 0, indicating that the data read is complete. Without this a NULL pointer exception happens when ->page_done() is called twice for the last page because the page clearing loop will call it also and afs_readpages_page_done() clears the current entry in the page list. 
BUG: unable to handle kernel NULL pointer dereference at (null) IP: afs_readpages_page_done+0x21/0xa4 [kafs] PGD 0 Oops: 0002 [#1] SMP Modules linked in: kafs(E) CPU: 2 PID: 3002 Comm: md5sum Tainted: G E 4.10.0-fscache #485 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 task: ffff8804017d86c0 task.stack: ffff8803fc1d8000 RIP: 0010:afs_readpages_page_done+0x21/0xa4 [kafs] RSP: 0018:ffff8803fc1db978 EFLAGS: 00010282 RAX: ffff880405d39af8 RBX: 0000000000000000 RCX: ffff880407d83ed4 RDX: 0000000000000000 RSI: ffff880405d39a00 RDI: ffff880405c6f400 RBP: ffff8803fc1db988 R08: 0000000000000000 R09: 0000000000000001 R10: ffff8803fc1db820 R11: ffff88040cf56000 R12: ffff8804088f1780 R13: ffff8804017d86c0 R14: ffff8804088f1780 R15: 0000000000003840 FS: 00007f8154469700(0000) GS:ffff88041fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000004016ec000 CR4: 00000000001406e0 Call Trace: afs_deliver_fs_fetch_data+0x5b9/0x60e [kafs] ? afs_make_call+0x316/0x4e8 [kafs] ? afs_make_call+0x359/0x4e8 [kafs] afs_deliver_to_call+0x173/0x2e8 [kafs] ? afs_make_call+0x316/0x4e8 [kafs] afs_make_call+0x37a/0x4e8 [kafs] ? wake_up_q+0x4f/0x4f ? __init_waitqueue_head+0x36/0x49 afs_fs_fetch_data+0x21c/0x227 [kafs] ? afs_fs_fetch_data+0x21c/0x227 [kafs] afs_vnode_fetch_data+0xf3/0x1d2 [kafs] afs_readpages+0x314/0x3fd [kafs] __do_page_cache_readahead+0x208/0x2c5 ondemand_readahead+0x3a2/0x3b7 ? ondemand_readahead+0x3a2/0x3b7 page_cache_async_readahead+0x5e/0x67 generic_file_read_iter+0x23b/0x70c ? __inode_security_revalidate+0x2f/0x62 __vfs_read+0xc4/0xe8 vfs_read+0xd1/0x15a SyS_read+0x4c/0x89 do_syscall_64+0x80/0x191 entry_SYSCALL64_slow_path+0x25/0x25 Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne --- fs/afs/fsclient.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index c05452a09398..4314f9e63a2c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -390,9 +390,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (call->offset == PAGE_SIZE) { if (req->page_done) req->page_done(call, req); + req->index++; if (req->remain > 0) { call->offset = 0; - req->index++; if (req->index >= req->nr_pages) { call->unmarshall = 4; goto begin_discard; From 6a0e3999e5cb3daa0468073fcdee0767422a4056 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: [PATCH 156/297] afs: Make struct afs_read::remain 64-bit Make struct afs_read::remain 64-bit so that it can handle huge transfers if we ever request them or the server decides to give us a bit extra data (the other fields there are already 64-bit). 
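The width matters in the min_t() that sizes each discard pass: forcing both operands into a 32-bit type truncates a large remainder before the comparison. An illustration with min_t() restated in userspace (a simplified re-statement; the kernel macro likewise converts both operands to the named type before comparing):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define min_t(type, x, y) ((type)(x) < (type)(y) ? (type)(x) : (type)(y))

int main(void)
{
	int64_t remain = 0x100000010;	/* > 4 GiB still to discard */
	uint32_t bufsz = 64;		/* sizeof(afs_discard_buffer) */

	/* Comparing in a 32-bit type truncates remain to 0x10 first and
	 * picks the wrong minimum: */
	uint32_t bad = min_t(uint32_t, bufsz, remain);
	/* Comparing in a 64-bit type (loff_t in the patch) keeps the
	 * real remainder in play: */
	int64_t good = min_t(int64_t, bufsz, remain);

	printf("32-bit min: %" PRIu32 " (remain truncated)\n", bad);
	printf("64-bit min: %" PRId64 "\n", good);
	return 0;
}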
Signed-off-by: David Howells Tested-by: Marc Dionne --- fs/afs/fsclient.c | 8 ++++---- fs/afs/internal.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 4314f9e63a2c..0778c5b6b59b 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -321,7 +321,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) void *buffer; int ret; - _enter("{%u,%zu/%u;%u/%llu}", + _enter("{%u,%zu/%u;%llu/%llu}", call->unmarshall, call->offset, call->count, req->remain, req->actual_len); @@ -379,7 +379,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* extract the returned data */ case 3: - _debug("extract data %u/%llu %zu/%u", + _debug("extract data %llu/%llu %zu/%u", req->remain, req->actual_len, call->offset, call->count); buffer = kmap(req->pages[req->index]); @@ -405,9 +405,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* Discard any excess data the server gave us */ begin_discard: case 4: - size = min_t(size_t, sizeof(afs_discard_buffer), req->remain); + size = min_t(loff_t, sizeof(afs_discard_buffer), req->remain); call->count = size; - _debug("extract discard %u/%llu %zu/%u", + _debug("extract discard %llu/%llu %zu/%u", req->remain, req->actual_len, call->offset, call->count); call->offset = 0; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 832555003d03..a6901360fb81 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -133,8 +133,8 @@ struct afs_read { loff_t pos; /* Where to start reading */ loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ + loff_t remain; /* Amount remaining */ atomic_t usage; - unsigned int remain; /* Amount remaining */ unsigned int index; /* Which page we're reading into */ unsigned int nr_pages; void (*page_done)(struct afs_call *, struct afs_read *); From 2f5705a5c805e7f761f2228820656bb9363a3d8c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: [PATCH 157/297] afs: Use a bvec rather than a kvec in afs_send_pages() Use a bvec rather than a kvec in afs_send_pages() as we don't then have to call kmap() in advance. This allows us to pass the array of contiguous pages that we extracted through to rxrpc in one go rather than passing a single page at a time. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 95 +++++++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 44 deletions(-) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 419ef05dcb5e..bf45307ff201 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -259,67 +259,74 @@ void afs_flat_call_destructor(struct afs_call *call) call->buffer = NULL; } +#define AFS_BVEC_MAX 8 + +/* + * Load the given bvec with the next few pages. 
+ */ +static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, + struct bio_vec *bv, pgoff_t first, pgoff_t last, + unsigned offset) +{ + struct page *pages[AFS_BVEC_MAX]; + unsigned int nr, n, i, to, bytes = 0; + + nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX); + n = find_get_pages_contig(call->mapping, first, nr, pages); + ASSERTCMP(n, ==, nr); + + msg->msg_flags |= MSG_MORE; + for (i = 0; i < nr; i++) { + to = PAGE_SIZE; + if (first + i >= last) { + to = call->last_to; + msg->msg_flags &= ~MSG_MORE; + } + bv[i].bv_page = pages[i]; + bv[i].bv_len = to - offset; + bv[i].bv_offset = offset; + bytes += to - offset; + offset = 0; + } + + iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes); +} + /* * attach the data from a bunch of pages on an inode to a call */ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) { - struct page *pages[8]; - unsigned count, n, loop, offset, to; + struct bio_vec bv[AFS_BVEC_MAX]; + unsigned int bytes, nr, loop, offset; pgoff_t first = call->first, last = call->last; int ret; - _enter(""); - offset = call->first_offset; call->first_offset = 0; do { - _debug("attach %lx-%lx", first, last); + afs_load_bvec(call, msg, bv, first, last, offset); + offset = 0; + bytes = msg->msg_iter.count; + nr = msg->msg_iter.nr_segs; - count = last - first + 1; - if (count > ARRAY_SIZE(pages)) - count = ARRAY_SIZE(pages); - n = find_get_pages_contig(call->mapping, first, count, pages); - ASSERTCMP(n, ==, count); - - loop = 0; - do { - struct bio_vec bvec = {.bv_page = pages[loop], - .bv_offset = offset}; - msg->msg_flags = 0; - to = PAGE_SIZE; - if (first + loop >= last) - to = call->last_to; - else - msg->msg_flags = MSG_MORE; - bvec.bv_len = to - offset; - offset = 0; - - _debug("- range %u-%u%s", - offset, to, msg->msg_flags ? " [more]" : ""); - iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, - &bvec, 1, to - offset); - - /* have to change the state *before* sending the last - * packet as RxRPC might give us the reply before it - * returns from sending the request */ - if (first + loop >= last) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, - msg, to - offset); - if (ret < 0) - break; - } while (++loop < count); - first += count; - - for (loop = 0; loop < count; loop++) - put_page(pages[loop]); + /* Have to change the state *before* sending the last + * packet as RxRPC might give us the reply before it + * returns from sending the request. + */ + if (first + nr >= last) + call->state = AFS_CALL_AWAIT_REPLY; + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, + msg, bytes); + for (loop = 0; loop < nr; loop++) + put_page(bv[loop].bv_page); if (ret < 0) break; + + first += nr; } while (first <= last); - _leave(" = %d", ret); return ret; } From 146a1192783697810b63a1e41c4d59fc93387340 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:47 +0000 Subject: [PATCH 158/297] afs: Fix the maths in afs_fs_store_data() afs_fs_store_data() works out of the size of the write it's going to make, but it uses 32-bit unsigned subtraction in one place that gets automatically cast to loff_t. However, if to < offset, then the number goes negative, but as the result isn't signed, this doesn't get sign-extended to 64-bits when placed in a loff_t. Fix by casting the operands to loff_t. 
Signed-off-by: David Howells --- fs/afs/fsclient.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 0778c5b6b59b..d9234b767287 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1236,7 +1236,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, _enter(",%x,{%x:%u},,", key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); - size = to - offset; + size = (loff_t)to - (loff_t)offset; if (first != last) size += (loff_t)(last - first) << PAGE_SHIFT; pos = (loff_t)first << PAGE_SHIFT; From 1157f153f37a8586765034470e4f00a4a6c4ce6f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:47 +0000 Subject: [PATCH 159/297] afs: Invalid op ID should abort with RXGEN_OPCODE When we are given an invalid operation ID, we should abort that with RXGEN_OPCODE rather than RX_INVALID_OPERATION. Also map RXGEN_OPCODE to -ENOTSUPP. Signed-off-by: David Howells --- fs/afs/misc.c | 2 ++ fs/afs/rxrpc.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 91ea1aa0d8b3..100b207efc9e 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -84,6 +84,8 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGEN_OPCODE: return -ENOTSUPP; + default: return -EREMOTEIO; } } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index bf45307ff201..bf7761fe6ef5 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -465,7 +465,7 @@ static void afs_deliver_to_call(struct afs_call *call) abort_code, -ret, "KNC"); goto do_abort; case -ENOTSUPP: - abort_code = RX_INVALID_OPERATION; + abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(afs_socket, call->rxcall, abort_code, -ret, "KIV"); goto do_abort; From 70af0e3bd65142f9e674961c975451638a7ce1d5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:47 +0000 Subject: [PATCH 160/297] afs: Better abort and net error handling If we receive a network error, a remote abort or a protocol error whilst we're still transmitting data, make sure we return an appropriate error to the caller rather than ESHUTDOWN or ECONNABORTED. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index bf7761fe6ef5..22d26b369070 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -340,6 +340,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; + size_t offset; + u32 abort_code; int ret; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -388,9 +390,11 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, msg.msg_controllen = 0; msg.msg_flags = (call->send_pages ? MSG_MORE : 0); - /* have to change the state *before* sending the last packet as RxRPC - * might give us the reply before it returns from sending the - * request */ + /* We have to change the state *before* sending the last packet as + * rxrpc might give us the reply before it returns from sending the + * request. Further, if the send fails, we may already have been given + * a notification and may have collected it. 
+ */ if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, rxcall, @@ -412,7 +416,17 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return afs_wait_for_call_to_complete(call); error_do_abort: - rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); + call->state = AFS_CALL_COMPLETE; + if (ret != -ECONNABORTED) { + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, + -ret, "KSD"); + } else { + abort_code = 0; + offset = 0; + rxrpc_kernel_recv_data(afs_socket, rxcall, NULL, 0, &offset, + false, &abort_code); + ret = call->type->abort_to_error(abort_code); + } error_kill_call: afs_put_call(call); _leave(" = %d", ret); @@ -459,16 +473,18 @@ static void afs_deliver_to_call(struct afs_call *call) case -EINPROGRESS: case -EAGAIN: goto out; + case -ECONNABORTED: + goto call_complete; case -ENOTCONN: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(afs_socket, call->rxcall, abort_code, -ret, "KNC"); - goto do_abort; + goto save_error; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(afs_socket, call->rxcall, abort_code, -ret, "KIV"); - goto do_abort; + goto save_error; case -ENODATA: case -EBADMSG: case -EMSGSIZE: @@ -478,7 +494,7 @@ static void afs_deliver_to_call(struct afs_call *call) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(afs_socket, call->rxcall, abort_code, EBADMSG, "KUM"); - goto do_abort; + goto save_error; } } @@ -489,8 +505,9 @@ out: _leave(""); return; -do_abort: +save_error: call->error = ret; +call_complete: call->state = AFS_CALL_COMPLETE; goto done; } @@ -538,6 +555,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) _debug("call incomplete"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, RX_CALL_DEAD, -ret, abort_why); + } else if (call->error < 0) { + ret = call->error; } _debug("call complete"); From ab94f5d0dd6fd82e7eeca5e7c8096eaea0a0261f Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 16 Mar 2017 16:27:47 +0000 Subject: [PATCH 161/297] afs: Populate and use client modification time The inode timestamps should be set from the client time in the status received from the server, rather than the server time which is meant for internal server use. Set AFS_SET_MTIME and populate the mtime for operations that take an input status, such as file/dir creation and StoreData. If an input time is not provided the server will set the vnode times based on the current server time. In a situation where the server has some skew with the client, this could lead to the client seeing a timestamp in the future for a file that it just created or wrote. 
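For readability, the essence of the new StoreStatus input-status encoding, which the flattened diff below applies at each call site:

	*bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME);	/* mask: send client mtime */
	*bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec);	/* mtime */
	*bp++ = 0;					/* owner */
	*bp++ = 0;					/* group */
	*bp++ = htonl(mode & S_IALLUGO);		/* unix mode */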
Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/fsclient.c | 18 +++++++++--------- fs/afs/inode.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index d9234b767287..19f76ae36982 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -111,7 +111,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_mode = mode; } - vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server; + vnode->vfs_inode.i_ctime.tv_sec = status->mtime_client; vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_version = data_version; @@ -734,8 +734,8 @@ int afs_fs_create(struct afs_server *server, memset(bp, 0, padsz); bp = (void *) bp + padsz; } - *bp++ = htonl(AFS_SET_MODE); - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); + *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ @@ -1003,8 +1003,8 @@ int afs_fs_symlink(struct afs_server *server, memset(bp, 0, c_padsz); bp = (void *) bp + c_padsz; } - *bp++ = htonl(AFS_SET_MODE); - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); + *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = htonl(S_IRWXUGO); /* unix mode */ @@ -1203,8 +1203,8 @@ static int afs_fs_store_data64(struct afs_server *server, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - *bp++ = 0; /* mask */ - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MTIME); /* mask */ + *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = 0; /* unix mode */ @@ -1280,8 +1280,8 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - *bp++ = 0; /* mask */ - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MTIME); /* mask */ + *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = 0; /* unix mode */ diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 4079c832ff27..aae55dd15108 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -85,7 +85,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_uid = vnode->status.owner; inode->i_gid = vnode->status.group; inode->i_size = vnode->status.size; - inode->i_ctime.tv_sec = vnode->status.mtime_server; + inode->i_ctime.tv_sec = vnode->status.mtime_client; inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; From 68ae849d7e674b83610bc7fdf74b21621a09b9ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:48 +0000 Subject: [PATCH 162/297] afs: Don't set PG_error on local EINTR or ENOMEM when filling a page Don't set PG_error on a page if we get local EINTR or ENOMEM when filling a page for writing. 
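The rule amounts to a small classification of errno values; sketched here as a standalone predicate (the patch itself open-codes the test):

/* Local, retryable conditions: leave the page clean so a later read can
 * retry; anything else is a genuine I/O error and latches PG_error.
 */
static bool afs_fill_error_is_local(int error)
{
	switch (error) {
	case -EINTR:
	case -ERESTARTSYS:	/* interrupted by a signal */
	case -ENOMEM:
	case -EAGAIN:		/* transient resource pressure */
		return true;
	default:
		return false;
	}
}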
Signed-off-by: David Howells --- fs/afs/file.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index b5829443ff69..0d5b8508869b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -212,7 +212,13 @@ int afs_page_filler(void *data, struct page *page) fscache_uncache_page(vnode->cache, page); #endif BUG_ON(PageFsCache(page)); - goto error; + + if (ret == -EINTR || + ret == -ENOMEM || + ret == -ERESTARTSYS || + ret == -EAGAIN) + goto error; + goto io_error; } SetPageUptodate(page); @@ -231,10 +237,12 @@ int afs_page_filler(void *data, struct page *page) _leave(" = 0"); return 0; +io_error: + SetPageError(page); + goto error; enomem: ret = -ENOMEM; error: - SetPageError(page); unlock_page(page); _leave(" = %d", ret); return ret; From 6d06b0d25209c80e99c1e89700f1e09694a3766b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:48 +0000 Subject: [PATCH 163/297] afs: Fix page leak in afs_write_begin() afs_write_begin() leaks a ref and a lock on a page if afs_fill_page() fails. Fix the leak by unlocking and releasing the page in the error path. Signed-off-by: David Howells --- fs/afs/write.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index f1450ea09406..6e13e96c3db0 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -154,12 +154,12 @@ int afs_write_begin(struct file *file, struct address_space *mapping, kfree(candidate); return -ENOMEM; } - *pagep = page; - /* page won't leak in error case: it eventually gets cleaned off LRU */ if (!PageUptodate(page) && len != PAGE_SIZE) { ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); if (ret < 0) { + unlock_page(page); + put_page(page); kfree(candidate); _leave(" = %d [prep]", ret); return ret; @@ -167,6 +167,9 @@ int afs_write_begin(struct file *file, struct address_space *mapping, SetPageUptodate(page); } + /* page won't leak in error case: it eventually gets cleaned off LRU */ + *pagep = page; + try_again: spin_lock(&vnode->writeback_lock); From 7286a35e893176169b09715096a4aca557e2ccd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:48 +0000 Subject: [PATCH 164/297] afs: Fix afs_kill_pages() Fix afs_kill_pages() in two ways: (1) If a writeback has been partially flushed, then if we try and kill the pages it contains, some of them may no longer be undergoing writeback and end_page_writeback() will assert. Fix this by checking to see whether the page in question is actually undergoing writeback before ending that writeback. (2) The loop that scans for pages to kill doesn't increase the first page index, and so the loop may not terminate, but it will try to process the same pages over and over again. Fix this by increasing the first page index to one after the last page we processed. 
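Both fixes are visible in the reworked kill loop; a cleaned-up sketch of the pattern (the diff below is the authoritative version):

for (loop = 0; loop < count; loop++) {
	struct page *page = pv.pages[loop];

	ClearPageUptodate(page);
	if (error)
		SetPageError(page);
	/* (1) Writeback may already have ended for this page. */
	if (PageWriteback(page))
		end_page_writeback(page);
	/* (2) Advance the scan index so the outer loop terminates. */
	if (page->index >= first)
		first = page->index + 1;
}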
Signed-off-by: David Howells --- fs/afs/write.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 6e13e96c3db0..134de0667898 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -321,10 +321,14 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error, ASSERTCMP(pv.nr, ==, count); for (loop = 0; loop < count; loop++) { - ClearPageUptodate(pv.pages[loop]); + struct page *page = pv.pages[loop]; + ClearPageUptodate(page); if (error) - SetPageError(pv.pages[loop]); - end_page_writeback(pv.pages[loop]); + SetPageError(page); + if (PageWriteback(page)) + end_page_writeback(page); + if (page->index >= first) + first = page->index + 1; } __pagevec_release(&pv); From 445783d0ec173a52bef2e9b129de7d716a19b9fa Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:48 +0000 Subject: [PATCH 165/297] afs: Fix an off-by-one error in afs_send_pages() afs_send_pages() should only put the call into the AFS_CALL_AWAIT_REPLY state if it has sent all the pages - but the check it makes is incorrect and sometimes it will finish the loop early. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 22d26b369070..b12da6aa5412 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -315,7 +315,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) * packet as RxRPC might give us the reply before it * returns from sending the request. */ - if (first + nr >= last) + if (first + nr - 1 >= last) call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, bytes); From 954cd6dc02a65065aecb7150962c0870c5b0e322 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:49 +0000 Subject: [PATCH 166/297] afs: Fix abort on signal while waiting for call completion Fix the way in which a call that's in progress and being waited for is aborted in the case that EINTR is detected. We should be sending RX_USER_ABORT rather than RX_CALL_DEAD as the abort code. Note that since the only two ways out of the loop are if the call completes or if a signal happens, the kill-the-call clause after the loop has finished can only happen in the case of EINTR. This means that we only have one abort case to deal with, not two, and the "KWC" case can never happen and so can be deleted. Note further that simply aborting the call isn't necessarily the best thing here since at this point: the request has been entirely sent and it's likely the server will do the operation anyway - whether we abort it or not. In future, we should punt the handling of the remainder of the call off to a background thread. 
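In outline, the simplified wait now has exactly two exits, which is what justifies the single abort path (a sketch only; the real loop also delivers incoming packets to the call):

for (;;) {
	__set_current_state(TASK_INTERRUPTIBLE);
	if (call->state == AFS_CALL_COMPLETE ||
	    signal_pending(current))
		break;
	schedule();
}

/* Still being incomplete here can only mean a signal arrived. */
if (call->state < AFS_CALL_COMPLETE)
	rxrpc_kernel_abort_call(afs_socket, call->rxcall,
				RX_USER_ABORT, -EINTR, "KWI");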
Reported-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/rxrpc.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index b12da6aa5412..8f76b13d5549 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -517,7 +517,6 @@ call_complete: */ static int afs_wait_for_call_to_complete(struct afs_call *call) { - const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -536,13 +535,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) continue; } - abort_why = "KWC"; - ret = call->error; - if (call->state == AFS_CALL_COMPLETE) - break; - abort_why = "KWI"; - ret = -EINTR; - if (signal_pending(current)) + if (call->state == AFS_CALL_COMPLETE || + signal_pending(current)) break; schedule(); } @@ -550,15 +544,14 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); - /* kill the call */ + /* Kill off the call if it's still live. */ if (call->state < AFS_CALL_COMPLETE) { - _debug("call incomplete"); + _debug("call interrupted"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_CALL_DEAD, -ret, abort_why); - } else if (call->error < 0) { - ret = call->error; + RX_USER_ABORT, -EINTR, "KWI"); } + ret = call->error; _debug("call complete"); afs_put_call(call); _leave(" = %d", ret); From 65a151094edeb04e8f5f6f1502028e2383e81bb8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:49 +0000 Subject: [PATCH 167/297] afs: ->writepage() shouldn't call clear_page_dirty_for_io() The ->writepage() op shouldn't call clear_page_dirty_for_io() as that has already been called by the caller. Fix afs_writepage() by moving the call out of afs_write_back_from_locked_page() to afs_writepages_region() where it is needed. Signed-off-by: David Howells --- fs/afs/write.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 134de0667898..e5f150bccfb5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -231,7 +231,7 @@ flush_conflicting_wb: if (wb->state == AFS_WBACK_PENDING) wb->state = AFS_WBACK_CONFLICTING; spin_unlock(&vnode->writeback_lock); - if (PageDirty(page)) { + if (clear_page_dirty_for_io(page)) { ret = afs_write_back_from_locked_page(wb, page); if (ret < 0) { afs_put_writeback(candidate); @@ -353,8 +353,6 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, _enter(",%lx", primary_page->index); count = 1; - if (!clear_page_dirty_for_io(primary_page)) - BUG(); if (test_set_page_writeback(primary_page)) BUG(); @@ -542,6 +540,8 @@ static int afs_writepages_region(struct address_space *mapping, wb->state = AFS_WBACK_WRITING; spin_unlock(&wb->vnode->writeback_lock); + if (!clear_page_dirty_for_io(page)) + BUG(); ret = afs_write_back_from_locked_page(wb, page); unlock_page(page); put_page(page); From c5051c7bc777dffa5661569dec5997f432b9a34a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:49 +0000 Subject: [PATCH 168/297] afs: Don't wait for page writeback with the page lock held Drop the page lock before waiting for page writeback. 
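The reordering this implies, sketched for clarity (holding the page lock across a potentially long sleep blocks every other user of that page; the diff below is the real change):

lock_page(page);
if (PageWriteback(page)) {
	unlock_page(page);		/* give the lock up before sleeping */
	if (wbc->sync_mode != WB_SYNC_NONE)
		wait_on_page_writeback(page);
	put_page(page);
	continue;			/* rescan from the next page */
}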
Signed-off-by: David Howells --- fs/afs/write.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index e5f150bccfb5..2d2fccd5044b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -518,17 +518,16 @@ static int afs_writepages_region(struct address_space *mapping, */ lock_page(page); - if (page->mapping != mapping) { + if (page->mapping != mapping || !PageDirty(page)) { unlock_page(page); put_page(page); continue; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || !PageDirty(page)) { + if (PageWriteback(page)) { unlock_page(page); + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); put_page(page); continue; } From 8f3dbfd79ed9ef9770305a7cc4e13dfd31ad2cd0 Mon Sep 17 00:00:00 2001 From: Kris Murphy Date: Thu, 16 Mar 2017 10:51:28 -0500 Subject: [PATCH 169/297] openvswitch: Add missing case OVS_TUNNEL_KEY_ATTR_PAD Added a case for OVS_TUNNEL_KEY_ATTR_PAD to the switch statement in ip_tun_from_nlattr in order to prevent the default case returning an error. Fixes: b46f6ded906e ("libnl: nla_put_be64(): align on a 64-bit area") Signed-off-by: Kris Murphy Acked-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a08ff834676b..1105a838bab8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -665,6 +665,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_VXLAN_OPT; opts_type = type; break; + case OVS_TUNNEL_KEY_ATTR_PAD: + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); From 271df90e4e530c17f237b27034d6341cb2c2f536 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Thu, 16 Mar 2017 16:40:19 -0700 Subject: [PATCH 170/297] z3fold: fix spinlock unlocking in page reclaim Commit 5a27aa822029 ("z3fold: add kref refcounting") introduced a bug in z3fold_reclaim_page() with a function exit path that may leave the pool->lock spinlock held. Here comes the trivial fix. Fixes: 5a27aa822029 ("z3fold: add kref refcounting") Link: http://lkml.kernel.org/r/20170311222239.7b83d8e7ef1914e05497649f@gmail.com Reported-by: Alexey Khoroshilov Signed-off-by: Vitaly Wool Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/z3fold.c b/mm/z3fold.c index 8970a2fd3b1a..f9492bccfd79 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -667,6 +667,7 @@ next: z3fold_page_unlock(zhdr); spin_lock(&pool->lock); if (kref_put(&zhdr->refcount, release_z3fold_page)) { + spin_unlock(&pool->lock); atomic64_dec(&pool->pages_nr); return 0; } From 5be9b730b09c45c358bbfe7f51d254e306cccc07 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 Mar 2017 16:40:21 -0700 Subject: [PATCH 171/297] kasan: add a prototype of task_struct to avoid warning Add a prototype of task_struct to fix the below warning on arm64. In file included from arch/arm64/kernel/probes/kprobes.c:19:0: include/linux/kasan.h:81:132: error: 'struct task_struct' declared inside parameter list will not be visible outside of this definition or declaration [-Werror] static inline void kasan_unpoison_task_stack(struct task_struct *task) {} As with the other types (kmem_cache, page, and vm_struct), this adds a prototype of the task_struct data structure at the top of kasan.h.
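The C scoping rule behind the warning, in isolation (a standalone sketch, not the actual header):

struct task_struct;	/* forward declaration: the tag gets file scope */

/* Without the declaration above, 'struct task_struct' would first appear
 * inside the parameter list, declaring a new type whose scope ends with
 * the prototype - hence the -Werror diagnostic quoted above.
 */
static inline void kasan_unpoison_task_stack(struct task_struct *task) {}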
[arnd] A related warning was fixed before, but now appears in a different line in the same file in v4.11-rc2. The patch from Masami Hiramatsu still seems appropriate, so let's take his version. Fixes: 71af2ed5eeea ("kasan, sched/headers: Remove <linux/sched.h> from <linux/kasan.h>") Link: https://patchwork.kernel.org/patch/9569839/ Link: http://lkml.kernel.org/r/20170313141517.3397802-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Signed-off-by: Masami Hiramatsu Acked-by: Alexander Potapenko Acked-by: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 1c823bef4c15..5734480c9590 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -6,6 +6,7 @@ struct kmem_cache; struct page; struct vm_struct; +struct task_struct; #ifdef CONFIG_KASAN From d0f33ac9ae7b2a727fb678235ae37baf1d0608d5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 16 Mar 2017 16:40:24 -0700 Subject: [PATCH 172/297] mm, x86: fix native_pud_clear build error We still get a build error in random configurations, after this has been modified a few times: In file included from include/linux/mm.h:68:0, from include/linux/suspend.h:8, from arch/x86/kernel/asm-offsets.c:12: arch/x86/include/asm/pgtable.h:66:26: error: redefinition of 'native_pud_clear' #define pud_clear(pud) native_pud_clear(pud) My interpretation is that the build error comes from a typo in __PAGETABLE_PUD_FOLDED, so fix that typo now, and remove the incorrect #ifdef around the native_pud_clear definition. Fixes: 3e761a42e19c ("mm, x86: fix HIGHMEM64 && PARAVIRT build config for native_pud_clear()") Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages") Link: http://lkml.kernel.org/r/20170314121330.182155-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Dave Jiang Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Thomas Garnier Cc: Kees Cook Cc: Dave Hansen Cc: Hugh Dickins Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-3level.h | 3 --- arch/x86/include/asm/pgtable.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 72277b1028a5..50d35e3185f5 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -121,12 +121,9 @@ static inline void native_pmd_clear(pmd_t *pmd) *(tmp + 1) = 0; } -#if !defined(CONFIG_SMP) || (defined(CONFIG_HIGHMEM64G) && \ - defined(CONFIG_PARAVIRT)) static inline void native_pud_clear(pud_t *pudp) { } -#endif static inline void pud_clear(pud_t *pudp) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cfb36b8c024..585ee0d42d18 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -62,7 +62,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif -#ifndef __PAGETABLE_PMD_FOLDED +#ifndef __PAGETABLE_PUD_FOLDED #define pud_clear(pud) native_pud_clear(pud) #endif From 171012f561274784160f666f8398af8b42216e1f Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 16 Mar 2017 16:40:27 -0700 Subject: [PATCH 173/297] mm: don't warn when vmalloc() fails due to a fatal signal When vmalloc() fails it prints a very lengthy message with all the details about memory consumption assuming that it happened due to OOM. However, vmalloc() can also fail due to fatal signal pending. In such case the message is quite confusing because it suggests that it is OOM but the numbers suggest otherwise. The messages can also pollute console considerably. Don't warn when vmalloc() fails due to fatal signal pending. Link: http://lkml.kernel.org/r/20170313114425.72724-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Reviewed-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0dd80222b20b..0b057628a7ba 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1683,7 +1683,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (fatal_signal_pending(current)) { area->nr_pages = i; - goto fail; + goto fail_no_warn; } if (node == NUMA_NO_NODE) @@ -1709,6 +1709,7 @@ fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); +fail_no_warn: vfree(area->addr); return NULL; } From 55adc1d05dca9e949cdf46c747cb1e91c0e9143d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Mar 2017 16:40:30 -0700 Subject: [PATCH 174/297] mm: add private lock to serialize memory hotplug operations Commit bfc8c90139eb ("mem-hotplug: implement get/put_online_mems") introduced new functions get/put_online_mems() and mem_hotplug_begin/end() in order to allow similar semantics for memory hotplug like for cpu hotplug. The corresponding functions for cpu hotplug are get/put_online_cpus() and cpu_hotplug_begin/done() for cpu hotplug. The commit however missed to introduce functions that would serialize memory hotplug operations like they are done for cpu hotplug with cpu_maps_update_begin/done(). 
This basically leaves mem_hotplug.active_writer unprotected and allows concurrent writers to modify it, which may lead to problems as outlined by commit f931ab479dd2 ("mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}"). That commit was extended again with commit b5d24fda9c3d ("mm, devm_memremap_pages: hold device_hotplug lock over mem_hotplug_{begin, done}") which serializes memory hotplug operations for some call sites by using the device_hotplug lock. In addition with commit 3fc21924100b ("mm: validate device_hotplug is held for memory hotplug") a sanity check was added to mem_hotplug_begin() to verify that the device_hotplug lock is held. This in turn triggers the following warning on s390: WARNING: CPU: 6 PID: 1 at drivers/base/core.c:643 assert_held_device_hotplug+0x4a/0x58 Call Trace: assert_held_device_hotplug+0x40/0x58) mem_hotplug_begin+0x34/0xc8 add_memory_resource+0x7e/0x1f8 add_memory+0xda/0x130 add_memory_merged+0x15c/0x178 sclp_detect_standby_memory+0x2ae/0x2f8 do_one_initcall+0xa2/0x150 kernel_init_freeable+0x228/0x2d8 kernel_init+0x2a/0x140 kernel_thread_starter+0x6/0xc One possible fix would be to add more lock_device_hotplug() and unlock_device_hotplug() calls around each call site of mem_hotplug_begin/end(). But that would give the device_hotplug lock additional semantics it better should not have (serialize memory hotplug operations). Instead add a new memory_add_remove_lock which has the similar semantics like cpu_add_remove_lock for cpu hotplug. To keep things hopefully a bit easier the lock will be locked and unlocked within the mem_hotplug_begin/end() functions. Link: http://lkml.kernel.org/r/20170314125226.16779-2-heiko.carstens@de.ibm.com Signed-off-by: Heiko Carstens Reported-by: Sebastian Ott Acked-by: Dan Williams Acked-by: Rafael J. Wysocki Cc: Michal Hocko Cc: Vladimir Davydov Cc: Ben Hutchings Cc: Gerald Schaefer Cc: Martin Schwidefsky Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ---- mm/memory_hotplug.c | 6 +++++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index 06123234f118..07e85e5229da 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,11 +247,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); - lock_device_hotplug(); mem_hotplug_begin(); arch_remove_memory(align_start, align_size); mem_hotplug_done(); - unlock_device_hotplug(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); @@ -364,11 +362,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_pfn_remap; - lock_device_hotplug(); mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); mem_hotplug_done(); - unlock_device_hotplug(); if (error) goto err_add_memory; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 295479b792ec..6fa7208bcd56 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -125,9 +125,12 @@ void put_online_mems(void) } +/* Serializes write accesses to mem_hotplug.active_writer. 
*/ +static DEFINE_MUTEX(memory_add_remove_lock); + void mem_hotplug_begin(void) { - assert_held_device_hotplug(); + mutex_lock(&memory_add_remove_lock); mem_hotplug.active_writer = current; @@ -147,6 +150,7 @@ void mem_hotplug_done(void) mem_hotplug.active_writer = NULL; mutex_unlock(&mem_hotplug.lock); memhp_lock_release(); + mutex_unlock(&memory_add_remove_lock); } /* add this memory to iomem resource */ From 15c9e10d9ad4d41d076148bbff1de7f659f68852 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Mar 2017 16:40:33 -0700 Subject: [PATCH 175/297] drivers core: remove assert_held_device_hotplug() The last caller of assert_held_device_hotplug() is gone, so remove it again. Link: http://lkml.kernel.org/r/20170314125226.16779-3-heiko.carstens@de.ibm.com Signed-off-by: Heiko Carstens Acked-by: Dan Williams Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Vladimir Davydov Cc: Ben Hutchings Cc: Gerald Schaefer Cc: Martin Schwidefsky Cc: Sebastian Ott Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/core.c | 5 ----- include/linux/device.h | 1 - 2 files changed, 6 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 684bda4d14a1..6bb60fb6a30b 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -639,11 +639,6 @@ int lock_device_hotplug_sysfs(void) return restart_syscall(); } -void assert_held_device_hotplug(void) -{ - lockdep_assert_held(&device_hotplug_lock); -} - #ifdef CONFIG_BLOCK static inline int device_is_not_partition(struct device *dev) { diff --git a/include/linux/device.h b/include/linux/device.h index 30c4570e928d..9ef518af5515 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1140,7 +1140,6 @@ static inline bool device_supports_offline(struct device *dev) extern void lock_device_hotplug(void); extern void unlock_device_hotplug(void); extern int lock_device_hotplug_sysfs(void); -void assert_held_device_hotplug(void); extern int device_offline(struct device *dev); extern int device_online(struct device *dev); extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); From 4cbe4dac82e423ecc9a0ba46af24a860853259f4 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 13 Mar 2017 19:29:08 +0200 Subject: [PATCH 176/297] net/mlx4_core: Avoid delays during VF driver device shutdown Some Hypervisors detach VFs from VMs by instantly causing an FLR event to be generated for a VF. In the mlx4 case, this will cause that VF's comm channel to be disabled before the VM has an opportunity to invoke the VF device's "shutdown" method. For such Hypervisors, there is a race condition between the VF's shutdown method and its internal-error detection/reset thread. The internal-error detection/reset thread (which runs every 5 seconds) also detects a disabled comm channel. If the internal-error detection/reset flow wins the race, we still get delays (while that flow tries repeatedly to detect comm-channel recovery). The cited commit fixed the command timeout problem when the internal-error detection/reset flow loses the race. This commit avoids the unneeded delays when the internal-error detection/reset flow wins. Fixes: d585df1c5ccf ("net/mlx4_core: Avoid command timeouts during VF driver device shutdown") Signed-off-by: Jack Morgenstein Reported-by: Simon Xiao Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 11 +++++++++++ drivers/net/ethernet/mellanox/mlx4/main.c | 11 +++++++++++ include/linux/mlx4/device.h | 1 + 3 files changed, 23 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index e8c105164931..0e0fa7030565 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -2305,6 +2305,17 @@ static int sync_toggles(struct mlx4_dev *dev) rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)); if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) { /* PCI might be offline */ + + /* If device removal has been requested, + * do not continue retrying. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_NOWAIT) { + mlx4_warn(dev, + "communication channel is offline\n"); + return -EIO; + } + msleep(100); wr_toggle = swab32(readl(&priv->mfunc.comm-> slave_write)); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 21377c315083..703205475524 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1940,6 +1940,14 @@ static int mlx4_comm_check_offline(struct mlx4_dev *dev) (u32)(1 << COMM_CHAN_OFFLINE_OFFSET)); if (!offline_bit) return 0; + + /* If device removal has been requested, + * do not continue retrying. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_NOWAIT) + break; + /* There are cases as part of AER/Reset flow that PF needs * around 100 msec to load. We therefore sleep for 100 msec * to allow other tasks to make use of that CPU during this @@ -3955,6 +3963,9 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(priv); int active_vfs = 0; + if (mlx4_is_slave(dev)) + persist->interface_state |= MLX4_INTERFACE_STATE_NOWAIT; + mutex_lock(&persist->interface_state_mutex); persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; mutex_unlock(&persist->interface_state_mutex); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7e66e4f62858..1beb1ec2fbdf 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -476,6 +476,7 @@ enum { enum { MLX4_INTERFACE_STATE_UP = 1 << 0, MLX4_INTERFACE_STATE_DELETION = 1 << 1, + MLX4_INTERFACE_STATE_NOWAIT = 1 << 2, }; #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ From bc9ab9231ec8c08352ea860480523d88a221a68f Mon Sep 17 00:00:00 2001 From: David Arcari Date: Mon, 13 Mar 2017 19:07:16 -0400 Subject: [PATCH 177/297] net: ethernet: aquantia: set net_device mtu when mtu is changed When the aquantia device mtu is changed the net_device structure is not updated. As a result the ip command does not properly reflect the mtu change. Commit 5513e16421cb incorrectly assumed that __dev_set_mtu() was making the assignment ndev->mtu = new_mtu; This is not true in the case where the driver has a ndo_change_mtu routine. Fixes: 5513e16421cb ("net: ethernet: aquantia: Fixes for aq_ndev_change_mtu") Cc: Pavel Belous Signed-off-by: David Arcari Tested-by: Pavel Belous Reviewed-by: Jarod Wilson Signed-off-by: David S. 
Miller --- drivers/net/ethernet/aquantia/atlantic/aq_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c b/drivers/net/ethernet/aquantia/atlantic/aq_main.c index dad63623be6a..d05fbfdce5e5 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c @@ -98,6 +98,7 @@ static int aq_ndev_change_mtu(struct net_device *ndev, int new_mtu) if (err < 0) goto err_exit; + ndev->mtu = new_mtu; if (netif_running(ndev)) { aq_ndev_close(ndev); From 61733c91c454a61be0ffc93fe46a5d5f2f048c1c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 13 Mar 2017 16:49:10 -0700 Subject: [PATCH 178/297] net: mpls: Fix nexthop alive tracking on down events Alive tracking of nexthops can account for a link twice if the carrier goes down followed by an admin down of the same link, rendering multipath routes useless. This is similar to 79099aab38c8 for UNREGISTER events and DOWN events. Fix by tracking the number of alive nexthops in mpls_ifdown, similar to the logic in mpls_ifup. Checking the flags per nexthop once after all events have been processed is simpler than trying to maintain a running count through all event combinations. Also, WRITE_ONCE is used instead of ACCESS_ONCE to set rt_nhn_alive, per a comment from checkpatch: WARNING: Prefer WRITE_ONCE(<FOO>, <BAR>) over ACCESS_ONCE(<FOO>) = <BAR> Fixes: c89359a42e2a4 ("mpls: support for dead routes") Signed-off-by: David Ahern Acked-by: Robert Shearman Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 33211f9a2656..6414079aa729 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1269,6 +1269,8 @@ static void mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); + unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN; + unsigned int alive; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); @@ -1278,9 +1280,11 @@ static void mpls_ifdown(struct net_device *dev, int event) if (!rt) continue; + alive = 0; change_nexthops(rt) { if (rtnl_dereference(nh->nh_dev) != dev) - continue; + goto next; + switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: @@ -1288,13 +1292,16 @@ static void mpls_ifdown(struct net_device *dev, int event) /* fall through */ case NETDEV_CHANGE: nh->nh_flags |= RTNH_F_LINKDOWN; - if (event != NETDEV_UNREGISTER) - ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1; break; } if (event == NETDEV_UNREGISTER) RCU_INIT_POINTER(nh->nh_dev, NULL); +next: + if (!(nh->nh_flags & nh_flags)) + alive++; } endfor_nexthops(rt); + + WRITE_ONCE(rt->rt_nhn_alive, alive); } } From 4ee39733fbecf04cf9f346de2d64788c35028079 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 15 Mar 2017 18:14:33 -0700 Subject: [PATCH 179/297] net: ipv6: set route type for anycast routes Anycast routes have the RTF_ANYCAST flag set, but when dumping routes for userspace the route type is not set to RTN_ANYCAST. Make it so. Fixes: 58c4fb86eabcb ("[IPV6]: Flag RTF_ANYCAST for anycast routes") CC: Hideaki YOSHIFUJI Signed-off-by: David Ahern Signed-off-by: David S.
Miller --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 35c58b669ebd..9db1418993f2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3423,6 +3423,8 @@ static int rt6_fill_node(struct net *net, } else if (rt->rt6i_flags & RTF_LOCAL) rtm->rtm_type = RTN_LOCAL; + else if (rt->rt6i_flags & RTF_ANYCAST) + rtm->rtm_type = RTN_ANYCAST; else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) rtm->rtm_type = RTN_LOCAL; else From 9501df3cd9204f5859f649182431616a31ee88a1 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Wed, 15 Mar 2017 23:38:07 -0400 Subject: [PATCH 180/297] ibmvnic: Free tx/rx scrq pointer array when releasing sub-crqs The pointer array for the tx/rx sub crqs should be freed when releasing the tx/rx sub crqs. Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5f11b4dc95d2..b23d6545f835 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1257,6 +1257,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) release_sub_crq_queue(adapter, adapter->tx_scrq[i]); } + kfree(adapter->tx_scrq); adapter->tx_scrq = NULL; } @@ -1269,6 +1270,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) release_sub_crq_queue(adapter, adapter->rx_scrq[i]); } + kfree(adapter->rx_scrq); adapter->rx_scrq = NULL; } } From 4d4a6ac73e7466c2085c307fac41f74ce4568a45 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:10 +0000 Subject: [PATCH 181/297] rxrpc: Ignore BUSY packets on old calls If we receive a BUSY packet for a call we think we've just completed, the packet is handed off to the connection processor to deal with - but the connection processor doesn't expect a BUSY packet and so flags a protocol error. Fix this by simply ignoring the BUSY packet for the moment. The symptom of this may appear as a system call failing with EPROTO. This may be triggered by pressing ctrl-C under some circumstances. This comes about because we abort calls due to interruption by a signal (which we shouldn't do, but that's going to be a large fix and mostly in fs/afs/). What happens is that we abort the call and may also abort follow-up calls too (this needs offloading somehow). So we see a transmission of something like the following sequence of packets: DATA for call N ABORT call N DATA for call N+1 ABORT call N+1 in very quick succession on the same channel. However, the peer may have deferred the processing of the ABORT from the call N to a background thread and thus sees the DATA message from the call N+1 coming in before it has cleared the channel. Thus it sends a BUSY packet[*]. [*] Note that some implementations (OpenAFS, for example) mark the BUSY packet with one plus the callNumber of the call prior to call N. Ordinarily, this would be call N, but there's no requirement for the calls on a channel to be numbered strictly sequentially (the number is required to increase). This is wrong and means that the callNumber in the BUSY packet should be ignored (it really ought to be N+1 since that's what it's in response to). Signed-off-by: David Howells Signed-off-by: David S.
Miller --- net/rxrpc/conn_event.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 3f9d8d7ec632..b099b64366f3 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -275,6 +275,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, rxrpc_conn_retransmit_call(conn, skb); return 0; + case RXRPC_PACKET_TYPE_BUSY: + /* Just ignore BUSY packets for now. */ + return 0; + case RXRPC_PACKET_TYPE_ABORT: if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &wtmp, sizeof(wtmp)) < 0) From d12c917691b45d9dffcfe7c2362d25caa40905fd Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 16 Mar 2017 10:32:42 -0700 Subject: [PATCH 182/297] bridge: resolve a false alarm of lockdep Andrei reported a false alarm of lockdep at net/bridge/br_fdb.c:109, this is because in Andrei's case, a spin_bug() was already triggered before this, therefore the debug_locks is turned off, lockdep_is_held() is no longer accurate after that. We should use lockdep_assert_held_once() instead of lockdep_is_held() to respect debug_locks. Fixes: 410b3d48f5111 ("bridge: fdb: add proper lock checks in searching functions") Reported-by: Andrei Vagin Signed-off-by: Cong Wang Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 2 +- net/bridge/br_private.h | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4f598dc2d916..6e08b7199dd7 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -106,7 +106,7 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; - WARN_ON_ONCE(!br_hash_lock_held(br)); + lockdep_assert_held_once(&br->hash_lock); rcu_read_lock(); fdb = fdb_find_rcu(head, addr, vid); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2288fca7756c..61368186edea 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -531,15 +531,6 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); -static inline bool br_hash_lock_held(struct net_bridge *br) -{ -#ifdef CONFIG_LOCKDEP - return lockdep_is_held(&br->hash_lock); -#else - return true; -#endif -} - /* br_forward.c */ enum br_pkt_type { BR_PKT_UNICAST, From e14b4db7a567ff507453ecd9c64da51bbc2b6d23 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 16 Mar 2017 12:21:32 -0700 Subject: [PATCH 183/297] netvsc: fix race during initialization When device is being setup on boot, there is a small race where network device callback is registered, but the netvsc_device pointer is not set yet. This can cause a NULL ptr dereference if packet arrives during this window. Fixes: 46b4f7f5d1f7 ("netvsc: eliminate per-device outstanding send counter") Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/hyperv/netvsc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 4c1d8cca247b..8dd0b8770328 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1231,8 +1231,11 @@ void netvsc_channel_cb(void *context) return; net_device = net_device_to_netvsc_device(ndev); - if (unlikely(net_device->destroy) && - netvsc_channel_idle(net_device, q_idx)) + if (unlikely(!net_device)) + return; + + if (unlikely(net_device->destroy && + netvsc_channel_idle(net_device, q_idx))) return; /* commit_rd_index() -> hv_signal_on_read() needs this. */ From db7f00b8dba6d687b6ab1f2e9309acfd214fcb4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2017 15:43:19 -0700 Subject: [PATCH 184/297] tcp: tcp_get_info() should read tcp_time_stamp later Commit b369e7fd41f7 ("tcp: make TCP_INFO more consistent") moved lock_sock_fast() earlier in tcp_get_info() This has the minor effect that jiffies value being sampled at the beginning of tcp_get_info() is more likely to be off by one, and we report big tcpi_last_data_sent values (like 0xFFFFFFFF). Since we lock the socket, fetching tcp_time_stamp right before doing the jiffies_to_msecs() calls is enough to remove these wrong values. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cf4555581282..1e319a525d51 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2770,7 +2770,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now = tcp_time_stamp, intv; + u32 now, intv; u64 rate64; bool slow; u32 rate; @@ -2839,6 +2839,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; + now = tcp_time_stamp; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); From 8971e1c79d3f6c9a5e6f7a65c50c41f434a4dae6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 17 Mar 2017 16:02:35 +1100 Subject: [PATCH 185/297] powerpc/pseries: Don't give a warning when HPT resizing isn't available As of commit 438cc81a41e8 ("powerpc/pseries: Automatically resize HPT for memory hot add/remove"), when running on the pseries platform, we always attempt to use the PAPR extension to resize the hashed page table (HPT) when we add or remove memory. This is fine, but when the extension is not available we'll give a harmless, but scary warning. Instead check if the firmware supports HPT resizing before populating the mmu_hash_ops.resize_hpt pointer. 
Signed-off-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/lpar.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 251060cf1713..8b1fe895daa3 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -751,7 +751,9 @@ void __init hpte_init_pseries(void) mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; - mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; + + if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) + mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; } void radix_init_pseries(void) From 5dc855d44c2ad960a86f593c60461f1ae1566b6d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 16 Mar 2017 12:59:39 -0700 Subject: [PATCH 186/297] x86/perf: Fix CR4.PCE propagation to use active_mm instead of mm If one thread mmaps a perf event while another thread in the same mm is in some context where active_mm != mm (which can happen in the scheduler, for example), refresh_pce() would write the wrong value to CR4.PCE. This broke some PAPI tests. Reported-and-tested-by: Vince Weaver Signed-off-by: Andy Lutomirski Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 7911d3f7af14 ("perf/x86: Only allow rdpmc if a perf_event is mapped") Link: http://lkml.kernel.org/r/0c5b38a76ea50e405f9abe07a13dfaef87c173a1.1489694270.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1635c0c8df23..e07b36c5588a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2100,8 +2100,8 @@ static int x86_pmu_event_init(struct perf_event *event) static void refresh_pce(void *ignored) { - if (current->mm) - load_mm_cr4(current->mm); + if (current->active_mm) + load_mm_cr4(current->active_mm); } static void x86_pmu_event_mapped(struct perf_event *event) From 4b07372a32c0c1505a7634ad7e607d83340ef645 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 16 Mar 2017 12:59:40 -0700 Subject: [PATCH 187/297] x86/perf: Clarify why x86_pmu_event_mapped() isn't racy Naively, it looks racy, but ->mmap_sem saves it. Add a comment and a lockdep assertion. Signed-off-by: Andy Lutomirski Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/03a1e629063899168dfc4707f3bb6e581e21f5c6.1489694270.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e07b36c5588a..183a972f9210 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2109,6 +2109,18 @@ static void x86_pmu_event_mapped(struct perf_event *event) if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; + /* + * This function relies on not being called concurrently in two + * tasks in the same mm. 
Otherwise one task could observe + * perf_rdpmc_allowed > 1 and return all the way back to + * userspace with CR4.PCE clear while another task is still + * doing on_each_cpu_mask() to propagate CR4.PCE. + * + * For now, this can't happen because all callers hold mmap_sem + * for write. If this changes, we'll need a different solution. + */ + lockdep_assert_held_exclusive(&current->mm->mmap_sem); + if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); } From 2cd29f2387be70de9feb4c9f8dbc7c0bd55748ce Mon Sep 17 00:00:00 2001 From: Robert Middleton Date: Wed, 15 Mar 2017 16:56:47 -0400 Subject: [PATCH 188/297] gpio:mcp23s08 Fixed missing interrupts When an interrupt occurs on an MCP23S08 chip, the INTF register will only contain one bit as causing the interrupt. If two or more pins change at the same time on the chip, this can cause one of the pins not to be reported. This patch fixes the logic for checking if a pin has changed, so that simultaneous changes on multiple pins are all reported. Cc: stable@vger.kernel.org Signed-off-by: Robert Middleton Tested-by: Phil Reid Signed-off-by: Linus Walleij --- drivers/gpio/gpio-mcp23s08.c | 65 +++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpio-mcp23s08.c b/drivers/gpio/gpio-mcp23s08.c index bdb692345428..2a57d024481d 100644 --- a/drivers/gpio/gpio-mcp23s08.c +++ b/drivers/gpio/gpio-mcp23s08.c @@ -270,8 +270,10 @@ mcp23s08_direction_output(struct gpio_chip *chip, unsigned offset, int value) static irqreturn_t mcp23s08_irq(int irq, void *data) { struct mcp23s08 *mcp = data; - int intcap, intf, i; + int intcap, intf, i, gpio, gpio_orig, intcap_mask; unsigned int child_irq; + bool intf_set, intcap_changed, gpio_bit_changed, + defval_changed, gpio_set; mutex_lock(&mcp->lock); if (mcp_read(mcp, MCP_INTF, &intf) < 0) { @@ -287,14 +289,67 @@ static irqreturn_t mcp23s08_irq(int irq, void *data) } mcp->cache[MCP_INTCAP] = intcap; + + /* This clears the interrupt(configurable on S18) */ + if (mcp_read(mcp, MCP_GPIO, &gpio) < 0) { + mutex_unlock(&mcp->lock); + return IRQ_HANDLED; + } + gpio_orig = mcp->cache[MCP_GPIO]; + mcp->cache[MCP_GPIO] = gpio; mutex_unlock(&mcp->lock); + if (mcp->cache[MCP_INTF] == 0) { + /* There is no interrupt pending */ + return IRQ_HANDLED; + } + + dev_dbg(mcp->chip.parent, + "intcap 0x%04X intf 0x%04X gpio_orig 0x%04X gpio 0x%04X\n", + intcap, intf, gpio_orig, gpio); for (i = 0; i < mcp->chip.ngpio; i++) { - if ((BIT(i) & mcp->cache[MCP_INTF]) && - ((BIT(i) & intcap & mcp->irq_rise) || - (mcp->irq_fall & ~intcap & BIT(i)) || - (BIT(i) & mcp->cache[MCP_INTCON]))) { + /* We must check all of the inputs on the chip, + * otherwise we may not notice a change on >=2 pins. + * + * On at least the mcp23s17, INTCAP is only updated + * one byte at a time(INTCAPA and INTCAPB are + * not written to at the same time - only on a per-bank + * basis). + * + * INTF only contains the single bit that caused the + * interrupt per-bank. On the mcp23s17, there is + * INTFA and INTFB. If two pins are changed on the A + * side at the same time, INTF will only have one bit + * set. If one pin on the A side and one pin on the B + * side are changed at the same time, INTF will have + * two bits set. Thus, INTF can't be the only check + * to see if the input has changed.
+ */ + + intf_set = BIT(i) & mcp->cache[MCP_INTF]; + if (i < 8 && intf_set) + intcap_mask = 0x00FF; + else if (i >= 8 && intf_set) + intcap_mask = 0xFF00; + else + intcap_mask = 0x00; + + intcap_changed = (intcap_mask & + (BIT(i) & mcp->cache[MCP_INTCAP])) != + (intcap_mask & (BIT(i) & gpio_orig)); + gpio_set = BIT(i) & mcp->cache[MCP_GPIO]; + gpio_bit_changed = (BIT(i) & gpio_orig) != + (BIT(i) & mcp->cache[MCP_GPIO]); + defval_changed = (BIT(i) & mcp->cache[MCP_INTCON]) && + ((BIT(i) & mcp->cache[MCP_GPIO]) != + (BIT(i) & mcp->cache[MCP_DEFVAL])); + + if (((gpio_bit_changed || intcap_changed) && + (BIT(i) & mcp->irq_rise) && gpio_set) || + ((gpio_bit_changed || intcap_changed) && + (BIT(i) & mcp->irq_fall) && !gpio_set) || + defval_changed) { child_irq = irq_find_mapping(mcp->chip.irqdomain, i); handle_nested_irq(child_irq); } From e7ede72a6d40cb3a30c087142d79381ca8a31dab Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Mar 2017 22:53:37 +0100 Subject: [PATCH 189/297] perf symbols: Fix symbols__fixup_end heuristic for corner cases The current symbols__fixup_end() heuristic for the last entry in the rb tree is suboptimal as it leads to not being able to recognize the symbol in the call graph in a couple of corner cases, for example: i) If the symbol has a start address (f.e. exposed via kallsyms) that is at a page boundary, then the roundup(curr->start, 4096) for the last entry will result in curr->start == curr->end with a symbol length of zero. ii) If the symbol has a start address that is shortly before a page boundary, then also here, curr->end - curr->start will just be very few bytes, where it's unrealistic that we could perform a match against. Instead, change the heuristic to roundup(curr->start, 4096) + 4096, so that we can catch such corner cases and have a better chance to find that specific symbol. It's still just best effort as the real end of the symbol is unknown to us (and could even be at a larger offset than the current range), but better than the current situation. Alexei reported that he recently run into case i) with a JITed eBPF program (these are all page aligned) as the last symbol which wasn't properly shown in the call graph (while other eBPF program symbols in the rb tree were displayed correctly). Since this is a generic issue, lets try to improve the heuristic a bit. 
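The corner case is plain arithmetic; a worked example with a made-up address (roundup(x, 4096) returns x unchanged when x is already page aligned):

u64 start = 0x7f2a4c001000ULL;			/* JITed eBPF image: page aligned */

u64 old_end = roundup(start, 4096);		/* == start: zero-length symbol  */
u64 new_end = roundup(start, 4096) + 4096;	/* start + 4096: matchable range */

/* Near-boundary case ii): start = ...0ff8 gives old_end - start = 8 bytes,
 * far too little to match samples against; the new heuristic adds a page.
 */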
Reported-and-Tested-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Fixes: 2e538c4a1847 ("perf tools: Improve kernel/modules symbol lookup") Link: http://lkml.kernel.org/r/bb5c80d27743be6f12afc68405f1956a330e1bc9.1489614365.git.daniel@iogearbox.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 70e389bc4af7..9b4d8ba22fed 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -202,7 +202,7 @@ void symbols__fixup_end(struct rb_root *symbols) /* Last entry */ if (curr->end == curr->start) - curr->end = roundup(curr->start, 4096); + curr->end = roundup(curr->start, 4096) + 4096; } void __map_groups__fixup_end(struct map_groups *mg, enum map_type type) From 38a33101dd5fb8f07244e7e6dd04747a6cd2e3fb Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Thu, 9 Mar 2017 11:36:36 +0800 Subject: [PATCH 190/297] NFS: fix the fault nrequests decreasing for nfs_inode COPY The nfs_commit_file for NFSv4.2's COPY operation goes through the commit path for normal WRITE, but without increase nrequests, so, the nrequests decreased in nfs_commit_release_pages is fault. After that, the nrequests will be wrong. [ 5670.299881] ------------[ cut here ]------------ [ 5670.300295] WARNING: CPU: 0 PID: 27656 at fs/nfs/inode.c:127 nfs_clear_inode+0x66/0x90 [nfs] [ 5670.300558] Modules linked in: nfsv4(E) nfs(E) fscache(E) tun bridge stp llc fuse ip_set nfnetlink vmw_vsock_vmci_transport vsock snd_seq_midi snd_seq_midi_event ppdev f2fs coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_ens1371 intel_rapl_perf gameport snd_ac97_codec vmw_balloon ac97_bus snd_seq snd_pcm joydev snd_rawmidi snd_timer snd_seq_device snd soundcore nfit parport_pc parport acpi_cpufreq tpm_tis tpm_tis_core tpm i2c_piix4 vmw_vmci shpchp nfsd auth_rpcgss nfs_acl lockd grace sunrpc xfs libcrc32c vmwgfx drm_kms_helper ttm drm e1000 crc32c_intel mptspi scsi_transport_spi serio_raw mptscsih mptbase ata_generic pata_acpi fjes [last unloaded: fscache] [ 5670.302925] CPU: 0 PID: 27656 Comm: umount.nfs4 Tainted: G W E 4.11.0-rc1+ #519 [ 5670.303292] Hardware name: VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 5670.304094] Call Trace: [ 5670.304510] dump_stack+0x63/0x86 [ 5670.304917] __warn+0xcb/0xf0 [ 5670.305276] warn_slowpath_null+0x1d/0x20 [ 5670.305661] nfs_clear_inode+0x66/0x90 [nfs] [ 5670.306093] nfs4_evict_inode+0x61/0x70 [nfsv4] [ 5670.306480] evict+0xbb/0x1c0 [ 5670.306888] dispose_list+0x4d/0x70 [ 5670.307233] evict_inodes+0x178/0x1a0 [ 5670.307579] generic_shutdown_super+0x44/0xf0 [ 5670.307985] nfs_kill_super+0x21/0x40 [nfs] [ 5670.308325] deactivate_locked_super+0x43/0x70 [ 5670.308698] deactivate_super+0x5a/0x60 [ 5670.309036] cleanup_mnt+0x3f/0x90 [ 5670.309407] __cleanup_mnt+0x12/0x20 [ 5670.309837] task_work_run+0x80/0xa0 [ 5670.310162] exit_to_usermode_loop+0x89/0x90 [ 5670.310497] syscall_return_slowpath+0xaa/0xb0 [ 5670.310875] entry_SYSCALL_64_fastpath+0xa7/0xa9 [ 5670.311197] RIP: 0033:0x7f1bb3617fe7 [ 5670.311545] RSP: 002b:00007ffecbabb828 EFLAGS: 00000206 ORIG_RAX: 00000000000000a6 [ 5670.311906] RAX: 0000000000000000 RBX: 0000000001dca1f0 RCX: 00007f1bb3617fe7 [ 5670.312239] RDX: 000000000000000c RSI: 0000000000000001 RDI: 0000000001dc83c0 [ 5670.312653] RBP: 0000000001dc83c0 R08: 0000000000000001 R09: 0000000000000000 [ 5670.312998] R10: 0000000000000755 R11: 0000000000000206 R12: 00007ffecbabc66a [ 5670.313335] R13: 0000000001dc83a0 R14: 0000000000000000 R15: 0000000000000000 [ 5670.313758] ---[ end trace bf4bfe7764e4eb40 ]--- Cc: linux-kernel@vger.kernel.org Fixes: 67911c8f18 ("NFS: Add nfs_commit_file()") Signed-off-by: Kinglong Mee Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e75b056f46f4..abb2c8a3be42 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1784,7 +1784,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) (long long)req_offset(req)); if (status < 0) { nfs_context_set_write_error(req->wb_context, status); - nfs_inode_remove_request(req); + if (req->wb_page) + nfs_inode_remove_request(req); dprintk_cont(", error = %d\n", status); goto next; } @@ -1793,7 +1794,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) * returned by the server against all stored verfs. */ if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { /* We have a match */ - nfs_inode_remove_request(req); + if (req->wb_page) + nfs_inode_remove_request(req); dprintk_cont(" OK\n"); goto next; } From eed50879d64ab1b9f76445dbab822e43a098b309 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 11 Mar 2017 15:52:47 -0500 Subject: [PATCH 191/297] xprtrdma: Squelch kbuild sparse complaint New complaint from kbuild for 4.9.y: net/sunrpc/xprtrdma/verbs.c:489:19: sparse: incompatible types in comparison expression (different type sizes) verbs.c: 489 max_sge = min(ia->ri_device->attrs.max_sge, RPCRDMA_MAX_SEND_SGES); I can't reproduce this running sparse here. Likewise, "make W=1 net/sunrpc/xprtrdma/verbs.o" never indicated any issue. A little poking suggests that because the range of its values is small, gcc can make the actual width of RPCRDMA_MAX_SEND_SGES smaller than the width of an unsigned integer. 
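A self-contained sketch of the mismatch (the kernel's min() rejects operands whose types differ in size, which is what sparse flags here; the min_t() below is a simplified stand-in for the real macro):

	#include <stdio.h>

	/* simplified stand-in for the kernel's min_t(): force one type */
	#define min_t(type, x, y) ({		\
		type _x = (x);			\
		type _y = (y);			\
		_x < _y ? _x : _y; })

	int main(void)
	{
		/*
		 * An enum whose values all fit in a few bits may be given
		 * a narrower representation than unsigned int by the
		 * compiler; the constant 17 here is illustrative only.
		 */
		enum { MAX_SEND_SGES = 17 };
		unsigned int hw_max_sge = 32;

		/* casting both sides to one named type avoids the report */
		printf("%u\n", min_t(unsigned int, hw_max_sge, MAX_SEND_SGES));
		return 0;
	}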
Fixes: 16f906d66cd7 ("xprtrdma: Reduce required number of send SGEs") Signed-off-by: Chuck Lever Cc: stable@kernel.org Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 81cd31acf690..3b332b395045 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -503,7 +503,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct ib_cq *sendcq, *recvcq; int rc; - max_sge = min(ia->ri_device->attrs.max_sge, RPCRDMA_MAX_SEND_SGES); + max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge, + RPCRDMA_MAX_SEND_SGES); if (max_sge < RPCRDMA_MIN_SEND_SGES) { pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); return -ENOMEM; From 05fae7bbc237bc7de0ee9c3dcf85b2572a80e3b5 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Mar 2017 10:48:13 +0800 Subject: [PATCH 192/297] nfs: make nfs4_cb_sv_ops static Fixes the following sparse warning: fs/nfs/callback.c:235:21: warning: symbol 'nfs4_cb_sv_ops' was not declared. Should it be static? Signed-off-by: Jason Yan Signed-off-by: Anna Schumaker --- fs/nfs/callback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 484bebc20bca..5c8a096d763e 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -231,12 +231,12 @@ static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_module = THIS_MODULE, }; -struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = &nfs41_cb_sv_ops, }; #else -struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = NULL, }; From 63513232f8cd219dcaa5eafae028740ed3067d83 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 13 Mar 2017 10:36:19 -0400 Subject: [PATCH 193/297] NFS prevent double free in async nfs4_exchange_id Since rpc_task is async, the release function should be called which will free the impl_id, scope, and owner. 
Trond pointed at 2 more problems: -- use of client pointer after free in the nfs4_exchangeid_release() function -- cl_count mismatch if rpc_run_task() isn't run Fixes: 8d89bd70bc9 ("NFS setup async exchange_id") Signed-off-by: Olga Kornievskaia Cc: stable@vger.kernel.org # 4.9 Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c1f5369cd339..c780d98035cc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7425,11 +7425,11 @@ static void nfs4_exchange_id_release(void *data) struct nfs41_exchange_id_data *cdata = (struct nfs41_exchange_id_data *)data; - nfs_put_client(cdata->args.client); if (cdata->xprt) { xprt_put(cdata->xprt); rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); } + nfs_put_client(cdata->args.client); kfree(cdata->res.impl_id); kfree(cdata->res.server_scope); kfree(cdata->res.server_owner); @@ -7536,10 +7536,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out_impl_id; - } + if (IS_ERR(task)) + return PTR_ERR(task); if (!xprt) { status = rpc_wait_for_completion_task(task); @@ -7567,6 +7565,7 @@ out_server_owner: kfree(calldata->res.server_owner); out_calldata: kfree(calldata); + nfs_put_client(clp); goto out; } From 033853325fe3bdc70819a8b97915bd3bca41d3af Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 8 Mar 2017 14:39:15 -0500 Subject: [PATCH 194/297] NFSv4.1 respect server's max size in CREATE_SESSION Currently client doesn't respect max sizes server returns in CREATE_SESSION. nfs4_session_set_rwsize() gets called and server->rsize, server->wsize are 0 so they never get set to the sizes returned by the server. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 5ae9d64ea08b..8346ccbf2d52 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1023,9 +1023,9 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - if (server->rsize > server_resp_sz) + if (!server->rsize || server->rsize > server_resp_sz) server->rsize = server_resp_sz; - if (server->wsize > server_rqst_sz) + if (!server->wsize || server->wsize > server_rqst_sz) server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } From a33e4b036d4612f62220f37a9fa29d273b6fd0ca Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 9 Mar 2017 12:56:48 -0500 Subject: [PATCH 195/297] pNFS: return status from nfs4_pnfs_ds_connect The nfs4_pnfs_ds_connect path can call rpc_create which can fail or it can wait on another context to reach the same failure. This checks that the rpc_create succeeded and returns the error to the caller. When an error is returned, both the files and flexfiles layouts will return NULL from _prepare_ds(). The flexfiles layout will also return the layout with the error NFS4ERR_NXIO. 
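The post-connect check this introduces can be read as a small helper on its own (a minimal sketch, assuming the usual NFS client headers; ds_connect_status() is an illustrative name, not a function from the patch):

	static int ds_connect_status(struct nfs_client *clp)
	{
		/* still initializing: the connect path should have waited */
		if (!clp || !nfs_client_init_is_complete(clp))
			return -EINVAL;

		/* 0 on success, or the -errno recorded during init */
		return nfs_client_init_status(clp);
	}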
Signed-off-by: Weston Andros Adamson Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 25 ++++++++++++++++++++++- fs/nfs/filelayout/filelayoutdev.c | 7 ++++++- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 3 ++- fs/nfs/internal.h | 2 ++ fs/nfs/pnfs.h | 2 +- fs/nfs/pnfs_nfs.c | 15 ++++++++++++-- 6 files changed, 48 insertions(+), 6 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 91a8d610ba0f..390ada8741bc 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -325,10 +325,33 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat return NULL; } -static bool nfs_client_init_is_complete(const struct nfs_client *clp) +/* + * Return true if @clp is done initializing, false if still working on it. + * + * Use nfs_client_init_status to check if it was successful. + */ +bool nfs_client_init_is_complete(const struct nfs_client *clp) { return clp->cl_cons_state <= NFS_CS_READY; } +EXPORT_SYMBOL_GPL(nfs_client_init_is_complete); + +/* + * Return 0 if @clp was successfully initialized, -errno otherwise. + * + * This must be called *after* nfs_client_init_is_complete() returns true, + * otherwise it will pop WARN_ON_ONCE and return -EINVAL + */ +int nfs_client_init_status(const struct nfs_client *clp) +{ + /* called without checking nfs_client_init_is_complete */ + if (clp->cl_cons_state > NFS_CS_READY) { + WARN_ON_ONCE(1); + return -EINVAL; + } + return clp->cl_cons_state; +} +EXPORT_SYMBOL_GPL(nfs_client_init_status); int nfs_wait_client_init_complete(const struct nfs_client *clp) { diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index f956ca20a8a3..188120626179 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -266,6 +266,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); struct nfs4_pnfs_ds *ret = ds; struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); + int status; if (ds == NULL) { printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", @@ -277,9 +278,13 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) if (ds->ds_clp) goto out_test_devid; - nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, 4, s->nfs_client->cl_minorversion); + if (status) { + ret = NULL; + goto out; + } out_test_devid: if (ret->ds_clp == NULL || diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e5a6f248697b..544e7725e679 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -384,6 +384,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; + int status; if (!ff_layout_mirror_valid(lseg, mirror, true)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", @@ -404,7 +405,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. 
*/ - nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 09ca5095c04e..7b38fedb7e03 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -186,6 +186,8 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); +extern bool nfs_client_init_is_complete(const struct nfs_client *clp); +extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 63f77b49a586..590e1e35781f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -367,7 +367,7 @@ void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); -void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, +int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, unsigned int retrans, u32 version, u32 minor_version); struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 9414b492439f..a7691b927af6 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -745,9 +745,9 @@ out: /* * Create an rpc connection to the nfs4_pnfs_ds data server. * Currently only supports IPv4 and IPv6 addresses. - * If connection fails, make devid unavailable. + * If connection fails, make devid unavailable and return a -errno. */ -void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, +int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, unsigned int retrans, u32 version, u32 minor_version) { @@ -772,6 +772,17 @@ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, } else { nfs4_wait_ds_connect(ds); } + + /* + * At this point the ds->ds_clp should be ready, but it might have + * hit an error. + */ + if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + + return nfs_client_init_status(ds->ds_clp); } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); From da066f3f039eba3e72e97b2ccad0dd8b45ba84bd Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 9 Mar 2017 12:56:49 -0500 Subject: [PATCH 196/297] pNFS/flexfiles: never nfs4_mark_deviceid_unavailable The flexfiles layout should never mark a device unavailable. Move nfs4_mark_deviceid_unavailable out of nfs4_pnfs_ds_connect and call directly from files layout where it's still needed. The flexfiles driver still handles marked devices in error paths, but will now print a rate limited warning. 
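The connect path that results from these two patches can be sketched as a wait-then-recheck loop (locking and the transport setup elided; try_connect() is a stand-in name for the version-specific connect calls):

	again:
		if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
			err = try_connect(ds);	/* we own the connect attempt */
			nfs4_clear_ds_conn_bit(ds);
		} else {
			nfs4_wait_ds_connect(ds);
			/* the attempt we waited on neither succeeded nor
			 * marked the device unavailable, so retry ourselves */
			if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid))
				goto again;
		}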
Signed-off-by: Weston Andros Adamson Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayoutdev.c | 1 + fs/nfs/flexfilelayout/flexfilelayout.h | 14 ++++++++++++- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2 +- fs/nfs/pnfs_nfs.c | 24 +++++++++++++++-------- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 188120626179..d913e818858f 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -282,6 +282,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) dataserver_retrans, 4, s->nfs_client->cl_minorversion); if (status) { + nfs4_mark_deviceid_unavailable(devid); ret = NULL; goto out; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index f4f39b0ab09b..98b34c9b0564 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -175,7 +175,19 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) static inline bool ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) { - return nfs4_test_deviceid_unavailable(node); + /* + * Flexfiles should never mark a DS unavailable, but if it does + * print a (ratelimited) warning as this can affect performance. + */ + if (nfs4_test_deviceid_unavailable(node)) { + u32 *p = (u32 *)node->deviceid.data; + + pr_warn_ratelimited("NFS: flexfiles layout referencing an " + "unavailable device [%x%x%x%x]\n", + p[0], p[1], p[2], p[3]); + return true; + } + return false; } static inline int diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 544e7725e679..85fde93dff77 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -421,11 +421,11 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].wsize = max_payload; goto out; } +out_fail: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); -out_fail: if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); ds = NULL; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index a7691b927af6..7250b95549ec 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -751,9 +751,11 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, unsigned int retrans, u32 version, u32 minor_version) { - if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { - int err = 0; + int err; +again: + err = 0; + if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { if (version == 3) { err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans); @@ -766,23 +768,29 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, err = -EPROTONOSUPPORT; } - if (err) - nfs4_mark_deviceid_unavailable(devid); nfs4_clear_ds_conn_bit(ds); } else { nfs4_wait_ds_connect(ds); + + /* what was waited on didn't connect AND didn't mark unavail */ + if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid)) + goto again; } /* * At this point the ds->ds_clp should be ready, but it might have * hit an error. 
*/ - if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { - WARN_ON_ONCE(1); - return -EINVAL; + if (!err) { + if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { + WARN_ON_ONCE(ds->ds_clp || + !nfs4_test_deviceid_unavailable(devid)); + return -EINVAL; + } + err = nfs_client_init_status(ds->ds_clp); } - return nfs_client_init_status(ds->ds_clp); + return err; } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); From 49d4a334727057af57048ded99697d17b016d91b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 6 Mar 2017 18:20:56 -0800 Subject: [PATCH 197/297] Btrfs: fix regression in lock_delalloc_pages The bug is a regression after commit (da2c7009f6ca "btrfs: teach __process_pages_contig about PAGE_LOCK operation") and commit (76c0021db8fd "Btrfs: use helper to simplify lock/unlock pages"). So if the dirty pages under writeback got partially truncated before we locked them, we could not find all pages mapping to the delalloc range; the buggy code did not return an error, so it kept going, found that the delalloc range had been truncated, went on to unlock the dirty pages, and then the ASSERT caught the error and showed ----------------------------------------------------------------------------- assertion failed: page_ops & PAGE_LOCK, file: fs/btrfs/extent_io.c, line: 1716 ----------------------------------------------------------------------------- This fixes the bug by returning the proper -EAGAIN. Cc: David Sterba Reported-by: Dave Jones Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 28e81922a21c..8df797432740 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1714,7 +1714,8 @@ static int __process_pages_contig(struct address_space *mapping, * can we find nothing at @index. */ ASSERT(page_ops & PAGE_LOCK); - return ret; + err = -EAGAIN; + goto out; } for (i = 0; i < ret; i++) { From e1699d2d7bf6e6cce3e1baff19f9dd4595a58664 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Fri, 10 Mar 2017 16:45:44 -0500 Subject: [PATCH 198/297] btrfs: add missing memset while reading compressed inline extents This is a story about 4 distinct (and very old) btrfs bugs. Commit c8b978188c ("Btrfs: Add zlib compression support") added three data corruption bugs for inline extents (bugs #1-3). Commit 93c82d5750 ("Btrfs: zero page past end of inline file items") fixed bug #1: uncompressed inline extents followed by a hole and more extents could get non-zero data in the hole as they were read. The fix was to add a memset in btrfs_get_extent to zero out the hole. Commit 166ae5a418 ("btrfs: fix inline compressed read err corruption") fixed bug #2: compressed inline extents which contained non-zero bytes might be replaced with zero bytes in some cases. This patch removed an unhelpful memset from uncompress_inline, but the case where memset is required was missed. There is also a memset in the decompression code, but this only covers decompressed data that is shorter than the ram_bytes from the extent ref record. This memset doesn't cover the region between the end of the decompressed data and the end of the page. It has also moved around a few times over the years, so there's no single patch to refer to. This patch fixes bug #3: compressed inline extents followed by a hole and more extents could get non-zero data in the hole as they were read (i.e. bug #3 is the same as bug #1, but s/uncompressed/compressed/).
The fix is the same: zero out the hole in the compressed case too, by putting a memset back in uncompress_inline, but this time with correct parameters. The last and oldest bug, bug #0, is the cause of the offending inline extent/hole/extent pattern. Bug #0 is a subtle and mostly-harmless quirk of behavior somewhere in the btrfs write code. In a few special cases, an inline extent and hole are allowed to persist where they normally would be combined with later extents in the file. A fast reproducer for bug #0 is presented below. A few offending extents are also created in the wild during large rsync transfers with the -S flag. A Linux kernel build (git checkout; make allyesconfig; make -j8) will produce a handful of offending files as well. Once an offending file is created, it can present different content to userspace each time it is read. Bug #0 is at least 4 and possibly 8 years old. I verified every vX.Y kernel back to v3.5 has this behavior. There are fossil records of this bug's effects in commits all the way back to v2.6.32. I have no reason to believe bug #0 wasn't present at the beginning of btrfs compression support in v2.6.29, but I can't easily test kernels that old to be sure. It is not clear whether bug #0 is worth fixing. A fix would likely require injecting extra reads into currently write-only paths, and most of the exceptional cases caused by bug #0 are already handled now. Whether we like them or not, bug #0's inline extents followed by holes are part of the btrfs de-facto disk format now, and we need to be able to read them without data corruption or an infoleak. So enough about bug #0, let's get back to bug #3 (this patch). An example of on-disk structure leading to data corruption found in the wild: item 61 key (606890 INODE_ITEM 0) itemoff 9662 itemsize 160 inode generation 50 transid 50 size 47424 nbytes 49141 block group 0 mode 100644 links 1 uid 0 gid 0 rdev 0 flags 0x0(none) item 62 key (606890 INODE_REF 603050) itemoff 9642 itemsize 20 inode ref index 3 namelen 10 name: DB_File.so item 63 key (606890 EXTENT_DATA 0) itemoff 8280 itemsize 1362 inline extent data size 1341 ram 4085 compress(zlib) item 64 key (606890 EXTENT_DATA 4096) itemoff 8227 itemsize 53 extent data disk byte 5367308288 nr 20480 extent data offset 0 nr 45056 ram 45056 extent compression(zlib) Different data appears in userspace during each read of the 11 bytes between 4085 and 4096. The extent in item 63 is not long enough to fill the first page of the file, so a memset is required to fill the space between item 63 (ending at 4085) and item 64 (beginning at 4096) with zero. Here is a reproducer from Liu Bo, which demonstrates another method of creating the same inline extent and hole pattern: Using 'page_poison=on' kernel command line (or enable CONFIG_PAGE_POISONING) run the following: # touch foo # chattr +c foo # xfs_io -f -c "pwrite -W 0 1000" foo # xfs_io -f -c "falloc 4 8188" foo # od -x foo # echo 3 >/proc/sys/vm/drop_caches # od -x foo This produces the following on my box: Correct output: file contains 1000 data bytes followed by zeros: 0000000 cdcd cdcd cdcd cdcd cdcd cdcd cdcd cdcd * 0001740 cdcd cdcd cdcd cdcd 0000 0000 0000 0000 0001760 0000 0000 0000 0000 0000 0000 0000 0000 * 0020000 Actual output: the data after the first 1000 bytes will be different each run: 0000000 cdcd cdcd cdcd cdcd cdcd cdcd cdcd cdcd * 0001740 cdcd cdcd cdcd cdcd 6c63 7400 635f 006d 0001760 5f74 6f43 7400 435f 0053 5f74 7363 7400 0002000 435f 0056 5f74 6164 7400 645f 0062 5f74 (...)
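Working the numbers from the on-disk dump above makes the required zeroing concrete; the check added below in uncompress_inline boils down to this (pg_offset is 0 for the first page, max_size is the 4085 ram bytes of item 63; the worked values in the comments are mine):

	if (max_size + pg_offset < PAGE_SIZE) {	/* 4085 + 0 < 4096 */
		char *map = kmap(page);
		/* zero from byte 4085 up to the page end: 11 bytes */
		memset(map + pg_offset + max_size, 0,
		       PAGE_SIZE - max_size - pg_offset);
		kunmap(page);
	}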
Signed-off-by: Zygo Blaxell Reviewed-by: Liu Bo Reviewed-by: Chris Mason Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2bc07aad1ae..e57191072aa3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6709,6 +6709,20 @@ static noinline int uncompress_inline(struct btrfs_path *path, max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); + + /* + * decompression code contains a memset to fill in any space between the end + * of the uncompressed data and the end of max_size in case the decompressed + * data ends up shorter than ram_bytes. That doesn't cover the hole between + * the end of an inline extent and the beginning of the next block, so we + * cover that region here. + */ + + if (max_size + pg_offset < PAGE_SIZE) { + char *map = kmap(page); + memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); + kunmap(page); + } kfree(tmp); return ret; } From 74e3f6e63da6c8e8246fba1689e040bc926b4a1a Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 14 Mar 2017 15:24:51 +0530 Subject: [PATCH 199/297] parisc: perf: Fix potential NULL pointer dereference Fix potential NULL pointer dereference and clean up coding style errors (code indent, trailing whitespaces). Signed-off-by: Arvind Yadav Signed-off-by: Helge Deller --- arch/parisc/kernel/perf.c | 94 ++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index e282a5131d77..6017a5af2e6e 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -39,7 +39,7 @@ * the PDC INTRIGUE calls. This is done to eliminate bugs introduced * in various PDC revisions. The code is much more maintainable * and reliable this way vs having to debug on every version of PDC - * on every box. + * on every box. */ #include @@ -195,8 +195,8 @@ static int perf_config(uint32_t *image_ptr); static int perf_release(struct inode *inode, struct file *file); static int perf_open(struct inode *inode, struct file *file); static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t *ppos); -static ssize_t perf_write(struct file *file, const char __user *buf, size_t count, - loff_t *ppos); +static ssize_t perf_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg); static void perf_start_counters(void); static int perf_stop_counters(uint32_t *raddr); @@ -222,7 +222,7 @@ extern void perf_intrigue_disable_perf_counters (void); /* * configure: * - * Configure the cpu with a given data image. First turn off the counters, + * Configure the cpu with a given data image. First turn off the counters, * then download the image, then turn the counters back on. 
*/ static int perf_config(uint32_t *image_ptr) @@ -234,7 +234,7 @@ static int perf_config(uint32_t *image_ptr) error = perf_stop_counters(raddr); if (error != 0) { printk("perf_config: perf_stop_counters = %ld\n", error); - return -EINVAL; + return -EINVAL; } printk("Preparing to write image\n"); @@ -242,7 +242,7 @@ printk("Preparing to write image\n"); error = perf_write_image((uint64_t *)image_ptr); if (error != 0) { printk("perf_config: DOWNLOAD = %ld\n", error); - return -EINVAL; + return -EINVAL; } printk("Preparing to start counters\n"); @@ -254,7 +254,7 @@ printk("Preparing to start counters\n"); } /* - * Open the device and initialize all of its memory. The device is only + * Open the device and initialize all of its memory. The device is only * opened once, but can be "queried" by multiple processes that know its * file descriptor. */ @@ -298,19 +298,19 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t * called on the processor that the download should happen * on. */ -static ssize_t perf_write(struct file *file, const char __user *buf, size_t count, - loff_t *ppos) +static ssize_t perf_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { size_t image_size; uint32_t image_type; uint32_t interface_type; uint32_t test; - if (perf_processor_interface == ONYX_INTF) + if (perf_processor_interface == ONYX_INTF) image_size = PCXU_IMAGE_SIZE; - else if (perf_processor_interface == CUDA_INTF) + else if (perf_processor_interface == CUDA_INTF) image_size = PCXW_IMAGE_SIZE; - else + else return -EFAULT; if (!capable(CAP_SYS_ADMIN)) @@ -330,22 +330,22 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun /* First check the machine type is correct for the requested image */ - if (((perf_processor_interface == CUDA_INTF) && - (interface_type != CUDA_INTF)) || - ((perf_processor_interface == ONYX_INTF) && - (interface_type != ONYX_INTF))) + if (((perf_processor_interface == CUDA_INTF) && + (interface_type != CUDA_INTF)) || + ((perf_processor_interface == ONYX_INTF) && + (interface_type != ONYX_INTF))) return -EINVAL; /* Next check to make sure the requested image is valid */ - if (((interface_type == CUDA_INTF) && + if (((interface_type == CUDA_INTF) && (test >= MAX_CUDA_IMAGES)) || - ((interface_type == ONYX_INTF) && - (test >= MAX_ONYX_IMAGES))) + ((interface_type == ONYX_INTF) && + (test >= MAX_ONYX_IMAGES))) return -EINVAL; /* Copy the image into the processor */ - if (interface_type == CUDA_INTF) + if (interface_type == CUDA_INTF) return perf_config(cuda_images[test]); else return perf_config(onyx_images[test]); @@ -359,7 +359,7 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun static void perf_patch_images(void) { #if 0 /* FIXME!! */ -/* +/* * NOTE: this routine is VERY specific to the current TLB image. * If the image is changed, this routine might also need to be changed. 
*/ @@ -367,9 +367,9 @@ static void perf_patch_images(void) extern void $i_dtlb_miss_2_0(); extern void PA2_0_iva(); - /* + /* * We can only use the lower 32-bits, the upper 32-bits should be 0 - * anyway given this is in the kernel + * anyway given this is in the kernel */ uint32_t itlb_addr = (uint32_t)&($i_itlb_miss_2_0); uint32_t dtlb_addr = (uint32_t)&($i_dtlb_miss_2_0); @@ -377,21 +377,21 @@ static void perf_patch_images(void) if (perf_processor_interface == ONYX_INTF) { /* clear last 2 bytes */ - onyx_images[TLBMISS][15] &= 0xffffff00; + onyx_images[TLBMISS][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[TLBMISS][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[TLBMISS][16] = (dtlb_addr << 8)&0xffffff00; onyx_images[TLBMISS][17] = itlb_addr; /* clear last 2 bytes */ - onyx_images[TLBHANDMISS][15] &= 0xffffff00; + onyx_images[TLBHANDMISS][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[TLBHANDMISS][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[TLBHANDMISS][16] = (dtlb_addr << 8)&0xffffff00; onyx_images[TLBHANDMISS][17] = itlb_addr; /* clear last 2 bytes */ - onyx_images[BIG_CPI][15] &= 0xffffff00; + onyx_images[BIG_CPI][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[BIG_CPI][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[BIG_CPI][16] = (dtlb_addr << 8)&0xffffff00; @@ -404,24 +404,24 @@ static void perf_patch_images(void) } else if (perf_processor_interface == CUDA_INTF) { /* Cuda interface */ - cuda_images[TLBMISS][16] = + cuda_images[TLBMISS][16] = (cuda_images[TLBMISS][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[TLBMISS][17] = + cuda_images[TLBMISS][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[TLBMISS][18] = (itlb_addr << 16)&0xffff0000; - cuda_images[TLBHANDMISS][16] = + cuda_images[TLBHANDMISS][16] = (cuda_images[TLBHANDMISS][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[TLBHANDMISS][17] = + cuda_images[TLBHANDMISS][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[TLBHANDMISS][18] = (itlb_addr << 16)&0xffff0000; - cuda_images[BIG_CPI][16] = + cuda_images[BIG_CPI][16] = (cuda_images[BIG_CPI][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[BIG_CPI][17] = + cuda_images[BIG_CPI][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[BIG_CPI][18] = (itlb_addr << 16)&0xffff0000; } else { @@ -433,7 +433,7 @@ static void perf_patch_images(void) /* * ioctl routine - * All routines effect the processor that they are executed on. Thus you + * All routines effect the processor that they are executed on. Thus you * must be running on the processor that you wish to change. 
*/ @@ -459,7 +459,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } /* copy out the Counters */ - if (copy_to_user((void __user *)arg, raddr, + if (copy_to_user((void __user *)arg, raddr, sizeof (raddr)) != 0) { error = -EFAULT; break; @@ -487,7 +487,7 @@ static const struct file_operations perf_fops = { .open = perf_open, .release = perf_release }; - + static struct miscdevice perf_dev = { MISC_DYNAMIC_MINOR, PA_PERF_DEV, @@ -595,7 +595,7 @@ static int perf_stop_counters(uint32_t *raddr) /* OR sticky2 (bit 1496) to counter2 bit 32 */ tmp64 |= (userbuf[23] >> 8) & 0x0000000080000000; raddr[2] = (uint32_t)tmp64; - + /* Counter3 is bits 1497 to 1528 */ tmp64 = (userbuf[23] >> 7) & 0x00000000ffffffff; /* OR sticky3 (bit 1529) to counter3 bit 32 */ @@ -617,7 +617,7 @@ static int perf_stop_counters(uint32_t *raddr) userbuf[22] = 0; userbuf[23] = 0; - /* + /* * Write back the zeroed bytes + the image given * the read was destructive. */ @@ -625,13 +625,13 @@ static int perf_stop_counters(uint32_t *raddr) } else { /* - * Read RDR-15 which contains the counters and sticky bits + * Read RDR-15 which contains the counters and sticky bits */ if (!perf_rdr_read_ubuf(15, userbuf)) { return -13; } - /* + /* * Clear out the counters */ perf_rdr_clear(15); @@ -644,7 +644,7 @@ static int perf_stop_counters(uint32_t *raddr) raddr[2] = (uint32_t)((userbuf[1] >> 32) & 0x00000000ffffffffUL); raddr[3] = (uint32_t)(userbuf[1] & 0x00000000ffffffffUL); } - + return 0; } @@ -682,7 +682,7 @@ static int perf_rdr_read_ubuf(uint32_t rdr_num, uint64_t *buffer) i = tentry->num_words; while (i--) { buffer[i] = 0; - } + } /* Check for bits an even number of 64 */ if ((xbits = width & 0x03f) != 0) { @@ -808,18 +808,22 @@ static int perf_write_image(uint64_t *memaddr) } runway = ioremap_nocache(cpu_device->hpa.start, 4096); + if (!runway) { + pr_err("perf_write_image: ioremap failed!\n"); + return -ENOMEM; + } /* Merge intrigue bits into Runway STATUS 0 */ tmp64 = __raw_readq(runway + RUNWAY_STATUS) & 0xffecfffffffffffful; - __raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul), + __raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul), runway + RUNWAY_STATUS); - + /* Write RUNWAY DEBUG registers */ for (i = 0; i < 8; i++) { __raw_writeq(*memaddr++, runway + RUNWAY_DEBUG); } - return 0; + return 0; } /* @@ -843,7 +847,7 @@ printk("perf_rdr_write\n"); perf_rdr_shift_out_U(rdr_num, buffer[i]); } else { perf_rdr_shift_out_W(rdr_num, buffer[i]); - } + } } printk("perf_rdr_write done\n"); } From 73580dac7618e4bcd21679f553cf3c97323fec46 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 18 Mar 2017 17:13:27 +0100 Subject: [PATCH 200/297] parisc: Fix system shutdown halt On those parisc machines which don't provide a software power off function, the system currently kills the init process at the end of a shutdown and unexpectedly restarts instead of halting. Fix it by adding a loop which will not return.
Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # 4.9+ --- arch/parisc/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 06f7ca7fe70b..b76f503eee4a 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -142,6 +142,8 @@ void machine_power_off(void) printk(KERN_EMERG "System shut down completed.\n" "Please power this system off now."); + + for (;;); } void (*pm_power_off)(void) = machine_power_off; From 9c28ca4ff8bad7486182291a55b4f67a70af718d Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Wed, 8 Mar 2017 00:09:59 -0800 Subject: [PATCH 201/297] target: Drop pointless tfo->check_stop_free check All in-tree fabric drivers provide a tfo->check_stop_free(), so there is no need to do the extra check within existing transport_cmd_check_stop_to_fabric() code. Just to be sure, add a check in target_fabric_tf_ops_check() to notify any out-of-tree drivers that might be missing it. Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_configfs.c | 4 ++++ drivers/target/target_core_transport.c | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 54b36c9835be..38b5025e4c7a 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -421,6 +421,10 @@ static int target_fabric_tf_ops_check(const struct target_core_fabric_ops *tfo) pr_err("Missing tfo->aborted_task()\n"); return -EINVAL; } + if (!tfo->check_stop_free) { + pr_err("Missing tfo->check_stop_free()\n"); + return -EINVAL; + } /* * We at least require tfo->fabric_make_wwn(), tfo->fabric_drop_wwn() * tfo->fabric_make_tpg() and tfo->fabric_drop_tpg() in diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 434d9d693989..b1a3cdb29468 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -636,8 +636,7 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd) * Fabric modules are expected to return '1' here if the se_cmd being * passed is released at this point, or zero if not being released. */ - return cmd->se_tfo->check_stop_free ? cmd->se_tfo->check_stop_free(cmd) - : 0; + return cmd->se_tfo->check_stop_free(cmd); } static void transport_lun_remove_cmd(struct se_cmd *cmd) From 3abaa2bfdb1e6bb33d38a2e82cf3bb82ec0197bf Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 1 Mar 2017 23:14:39 -0600 Subject: [PATCH 202/297] tcmu: allow hw_max_sectors greater than 128 tcmu hard codes hw_max_sectors to 128, which is a little small. Userspace uses the max_sectors to report the optimal IO size and some initiators perform better with larger IOs (open-iscsi seems to do better with 256 to 512 depending on the test).
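With the new token in place, userspace can hand the value through the backstore's configfs control attribute alongside the existing options, e.g. echo -n hw_max_sectors=256 > /sys/kernel/config/target/core/user_0/foo/control (the user_0/foo path is illustrative and depends on the local setup).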
(Fix do not display hw max sectors twice - MNC) Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 54 ++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index c3adefe95e50..24e8580f07b8 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -960,7 +960,8 @@ static int tcmu_configure_device(struct se_device *dev) if (dev->dev_attrib.hw_block_size == 0) dev->dev_attrib.hw_block_size = 512; /* Other attributes can be configured in userspace */ - dev->dev_attrib.hw_max_sectors = 128; + if (!dev->dev_attrib.hw_max_sectors) + dev->dev_attrib.hw_max_sectors = 128; dev->dev_attrib.hw_queue_depth = 128; ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, @@ -1031,16 +1032,42 @@ static void tcmu_free_device(struct se_device *dev) } enum { - Opt_dev_config, Opt_dev_size, Opt_hw_block_size, Opt_err, + Opt_dev_config, Opt_dev_size, Opt_hw_block_size, Opt_hw_max_sectors, + Opt_err, }; static match_table_t tokens = { {Opt_dev_config, "dev_config=%s"}, {Opt_dev_size, "dev_size=%u"}, {Opt_hw_block_size, "hw_block_size=%u"}, + {Opt_hw_max_sectors, "hw_max_sectors=%u"}, {Opt_err, NULL} }; +static int tcmu_set_dev_attrib(substring_t *arg, u32 *dev_attrib) +{ + unsigned long tmp_ul; + char *arg_p; + int ret; + + arg_p = match_strdup(arg); + if (!arg_p) + return -ENOMEM; + + ret = kstrtoul(arg_p, 0, &tmp_ul); + kfree(arg_p); + if (ret < 0) { + pr_err("kstrtoul() failed for dev attrib\n"); + return ret; + } + if (!tmp_ul) { + pr_err("dev attrib must be nonzero\n"); + return -EINVAL; + } + *dev_attrib = tmp_ul; + return 0; +} + static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, const char *page, ssize_t count) { @@ -1048,7 +1075,6 @@ static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, char *orig, *ptr, *opts, *arg_p; substring_t args[MAX_OPT_ARGS]; int ret = 0, token; - unsigned long tmp_ul; opts = kstrdup(page, GFP_KERNEL); if (!opts) @@ -1082,22 +1108,12 @@ static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, pr_err("kstrtoul() failed for dev_size=\n"); break; case Opt_hw_block_size: - arg_p = match_strdup(&args[0]); - if (!arg_p) { - ret = -ENOMEM; - break; - } - ret = kstrtoul(arg_p, 0, &tmp_ul); - kfree(arg_p); - if (ret < 0) { - pr_err("kstrtoul() failed for hw_block_size=\n"); - break; - } - if (!tmp_ul) { - pr_err("hw_block_size must be nonzero\n"); - break; - } - dev->dev_attrib.hw_block_size = tmp_ul; + ret = tcmu_set_dev_attrib(&args[0], + &(dev->dev_attrib.hw_block_size)); + break; + case Opt_hw_max_sectors: + ret = tcmu_set_dev_attrib(&args[0], + &(dev->dev_attrib.hw_max_sectors)); break; default: break; From 2579325ca0acc598fdf41ba12b2871d3467f28df Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 1 Mar 2017 23:14:40 -0600 Subject: [PATCH 203/297] tcmu: return on first Opt parse failure We were only returning failure if the last opt to be parsed failed. Change this to return failure as soon as we first detect one.
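Stripped of the tcmu specifics, the fix changes the option loop to fail fast (a sketch; parse_one_opt() is an illustrative helper, not code from the patch):

	int ret = 0;

	while ((ptr = strsep(&opts, ",\n")) != NULL) {
		if (!*ptr)
			continue;
		ret = parse_one_opt(ptr);
		if (ret)
			break;	/* report the first bad option, not the last */
	}
	return ret ? ret : count;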
Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 24e8580f07b8..4339ab2133b3 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1118,6 +1118,9 @@ static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, default: break; } + + if (ret) + break; } kfree(orig); From 530c6891b1220cba780b6c18f4691d85a3435080 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 1 Mar 2017 23:13:24 -0600 Subject: [PATCH 204/297] target: allow ALUA setup for some passthrough backends This patch allows passthrough backends to use the core/base LIO ALUA setup and state checks, but still handle the execution of commands. This will allow the target_core_user module to execute STPG and RTPG in userspace, and not have to duplicate the ALUA state checks, path information (needed so we can check if command is executable on specific paths) and setup (rtslib sets/updates the configfs ALUA interface like it does for iblock or file). For STPG, the target_core_user userspace daemon, tcmu-runner will still execute the STPG, and to update the core/base LIO state it will use the existing configfs interface. For RTPG, tcmu-runner will loop over configfs and/or cache the state. Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_alua.c | 9 +++++---- drivers/target/target_core_pscsi.c | 3 ++- drivers/target/target_core_tpg.c | 3 ++- include/target/target_core_backend.h | 7 ++++++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index f5e330099bfc..a41bbb8087cf 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -691,7 +691,7 @@ target_alua_state_check(struct se_cmd *cmd) if (dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE) return 0; - if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH) + if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA) return 0; /* @@ -1973,7 +1973,7 @@ ssize_t core_alua_store_tg_pt_gp_info( unsigned char buf[TG_PT_GROUP_NAME_BUF]; int move = 0; - if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH || + if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA || (dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE)) return -ENODEV; @@ -2230,7 +2230,7 @@ ssize_t core_alua_store_offline_bit( unsigned long tmp; int ret; - if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH || + if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA || (dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE)) return -ENODEV; @@ -2316,7 +2316,8 @@ ssize_t core_alua_store_secondary_write_metadata( int core_setup_alua(struct se_device *dev) { - if (!(dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH) && + if (!(dev->transport->transport_flags & + TRANSPORT_FLAG_PASSTHROUGH_ALUA) && !(dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE)) { struct t10_alua_lu_gp_member *lu_gp_mem; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 44d92f23a3f0..94cda7991e80 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1080,7 +1080,8 @@ static void pscsi_req_done(struct request *req, int uptodate) static const struct target_backend_ops pscsi_ops = { .name = "pscsi", .owner = THIS_MODULE, - 
.transport_flags = TRANSPORT_FLAG_PASSTHROUGH, + .transport_flags = TRANSPORT_FLAG_PASSTHROUGH | + TRANSPORT_FLAG_PASSTHROUGH_ALUA, .attach_hba = pscsi_attach_hba, .detach_hba = pscsi_detach_hba, .pmode_enable_hba = pscsi_pmode_enable_hba, diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index c0dbfa016575..6fb191914f45 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -602,7 +602,8 @@ int core_tpg_add_lun( if (ret) goto out_kill_ref; - if (!(dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH) && + if (!(dev->transport->transport_flags & + TRANSPORT_FLAG_PASSTHROUGH_ALUA) && !(dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE)) target_attach_tg_pt_gp(lun, dev->t10_alua.default_tg_pt_gp); diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index b54b98dc2d4a..1b0f447ce850 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -4,7 +4,12 @@ #include #include -#define TRANSPORT_FLAG_PASSTHROUGH 1 +#define TRANSPORT_FLAG_PASSTHROUGH 0x1 +/* + * ALUA commands, state checks and setup operations are handled by the + * backend module. + */ +#define TRANSPORT_FLAG_PASSTHROUGH_ALUA 0x2 struct request_queue; struct scatterlist; From 0a4145729871ef29afe8b0c57560a1f5bd736416 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 1 Mar 2017 23:13:25 -0600 Subject: [PATCH 205/297] target: fail ALUA transitions for pscsi We do not setup the LU group for pscsi devices, so if you write a state to alua_access_state that will cause a transition you will get a NULL pointer dereference. This patch will fail attempts to try and transition the path for backend devices that set the TRANSPORT_FLAG_PASSTHROUGH_ALUA flag. Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_alua.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index a41bbb8087cf..5b5a1e250a65 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -1149,6 +1149,9 @@ int core_alua_do_port_transition( struct t10_alua_tg_pt_gp *tg_pt_gp; int primary, valid_states, rc = 0; + if (l_dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA) + return -ENODEV; + valid_states = l_tg_pt_gp->tg_pt_gp_alua_supported_states; if (core_alua_check_transition(new_state, valid_states, &primary) != 0) return -EINVAL; From 207ee84133c00a8a2a5bdec94df4a5b37d78881c Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 1 Mar 2017 23:13:26 -0600 Subject: [PATCH 206/297] target: Use system workqueue for ALUA transitions If tcmu-runner is processing a STPG and needs to change the kernel's ALUA state then we cannot use the same work queue for task management requests and ALUA transitions, because we could deadlock. The problem occurs when a STPG times out before tcmu-runner is able to call into target_tg_pt_gp_alua_access_state_store-> core_alua_do_port_transition -> core_alua_do_transition_tg_pt -> queue_work. In this case, the tmr is on the work queue waiting for the STPG to complete, but the STPG transition is now queued behind the waiting tmr. Note: This bug will also be fixed by this patch: http://www.spinics.net/lists/target-devel/msg14560.html which switches the tmr code to use the system workqueues. 
For both, I am not sure if we need a dedicated workqueue since it is not a performance path and I do not think we need WQ_MEM_RECLAIM to make forward progress to free up memory like the block layer does. Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_alua.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index 5b5a1e250a65..58bf5e6350ac 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -1121,13 +1121,11 @@ static int core_alua_do_transition_tg_pt( unsigned long transition_tmo; transition_tmo = tg_pt_gp->tg_pt_gp_implicit_trans_secs * HZ; - queue_delayed_work(tg_pt_gp->tg_pt_gp_dev->tmr_wq, - &tg_pt_gp->tg_pt_gp_transition_work, - transition_tmo); + schedule_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work, + transition_tmo); } else { tg_pt_gp->tg_pt_gp_transition_complete = &wait; - queue_delayed_work(tg_pt_gp->tg_pt_gp_dev->tmr_wq, - &tg_pt_gp->tg_pt_gp_transition_work, 0); + schedule_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work, 0); wait_for_completion(&wait); tg_pt_gp->tg_pt_gp_transition_complete = NULL; } From d7175373f2745ed4abe5b388d5aabd06304f801e Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 2 Mar 2017 04:59:48 -0600 Subject: [PATCH 207/297] target: fix ALUA transition timeout handling The implicit transition time tells initiators the min time to wait before timing out a transition. We currently schedule the transition to occur in tg_pt_gp_implicit_trans_secs seconds so there is no room for delays. If core_alua_do_transition_tg_pt_work->core_alua_update_tpg_primary_metadata needs to write out info to a remote file, then the initiator can easily time out the operation. 
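The change amounts to when the transition work is queued (a sketch; w stands for tg_pt_gp_transition_work):

	/* before: the whole grace period was spent as a queueing delay */
	schedule_delayed_work(&w, tg_pt_gp->tg_pt_gp_implicit_trans_secs * HZ);

	/* after: start immediately, so a slow
	 * core_alua_update_tpg_primary_metadata() write happens inside
	 * the advertised window instead of after it */
	schedule_work(&w);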
Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_alua.c | 23 ++++++++--------------- include/target/target_core_base.h | 2 +- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index 58bf5e6350ac..594807cd92cb 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -1013,7 +1013,7 @@ static void core_alua_queue_state_change_ua(struct t10_alua_tg_pt_gp *tg_pt_gp) static void core_alua_do_transition_tg_pt_work(struct work_struct *work) { struct t10_alua_tg_pt_gp *tg_pt_gp = container_of(work, - struct t10_alua_tg_pt_gp, tg_pt_gp_transition_work.work); + struct t10_alua_tg_pt_gp, tg_pt_gp_transition_work); struct se_device *dev = tg_pt_gp->tg_pt_gp_dev; bool explicit = (tg_pt_gp->tg_pt_gp_alua_access_status == ALUA_STATUS_ALTERED_BY_EXPLICIT_STPG); @@ -1076,13 +1076,12 @@ static int core_alua_do_transition_tg_pt( /* * Flush any pending transitions */ - if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs && - atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state) == + if (!explicit && atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state) == ALUA_ACCESS_STATE_TRANSITION) { /* Just in case */ tg_pt_gp->tg_pt_gp_alua_pending_state = new_state; tg_pt_gp->tg_pt_gp_transition_complete = &wait; - flush_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work); + flush_work(&tg_pt_gp->tg_pt_gp_transition_work); wait_for_completion(&wait); tg_pt_gp->tg_pt_gp_transition_complete = NULL; return 0; @@ -1117,15 +1116,9 @@ static int core_alua_do_transition_tg_pt( atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); - if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs) { - unsigned long transition_tmo; - - transition_tmo = tg_pt_gp->tg_pt_gp_implicit_trans_secs * HZ; - schedule_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work, - transition_tmo); - } else { + schedule_work(&tg_pt_gp->tg_pt_gp_transition_work); + if (explicit) { tg_pt_gp->tg_pt_gp_transition_complete = &wait; - schedule_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work, 0); wait_for_completion(&wait); tg_pt_gp->tg_pt_gp_transition_complete = NULL; } @@ -1696,8 +1689,8 @@ struct t10_alua_tg_pt_gp *core_alua_allocate_tg_pt_gp(struct se_device *dev, mutex_init(&tg_pt_gp->tg_pt_gp_md_mutex); spin_lock_init(&tg_pt_gp->tg_pt_gp_lock); atomic_set(&tg_pt_gp->tg_pt_gp_ref_cnt, 0); - INIT_DELAYED_WORK(&tg_pt_gp->tg_pt_gp_transition_work, - core_alua_do_transition_tg_pt_work); + INIT_WORK(&tg_pt_gp->tg_pt_gp_transition_work, + core_alua_do_transition_tg_pt_work); tg_pt_gp->tg_pt_gp_dev = dev; atomic_set(&tg_pt_gp->tg_pt_gp_alua_access_state, ALUA_ACCESS_STATE_ACTIVE_OPTIMIZED); @@ -1805,7 +1798,7 @@ void core_alua_free_tg_pt_gp( dev->t10_alua.alua_tg_pt_gps_counter--; spin_unlock(&dev->t10_alua.tg_pt_gps_lock); - flush_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work); + flush_work(&tg_pt_gp->tg_pt_gp_transition_work); /* * Allow a struct t10_alua_tg_pt_gp_member * referenced by diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 37c274e61acc..4b784b6e21c0 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -299,7 +299,7 @@ struct t10_alua_tg_pt_gp { struct list_head tg_pt_gp_lun_list; struct se_lun *tg_pt_gp_alua_lun; struct se_node_acl *tg_pt_gp_alua_nacl; - struct delayed_work tg_pt_gp_transition_work; + struct work_struct tg_pt_gp_transition_work; struct completion 
*tg_pt_gp_transition_complete; }; From 1ca4d4fa3bfcbe8964f81e5818a9b90436466eb0 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 2 Mar 2017 04:59:49 -0600 Subject: [PATCH 208/297] target: allow userspace to set state to transitioning Userspace target_core_user handlers like tcmu-runner may want to set the ALUA state to transitioning while it does implicit transitions. This patch allows that state when set from configfs. Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_alua.c | 37 ++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index 594807cd92cb..252d4e4b7b33 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -43,7 +43,7 @@ #include "target_core_ua.h" static sense_reason_t core_alua_check_transition(int state, int valid, - int *primary); + int *primary, int explicit); static int core_alua_set_tg_pt_secondary_state( struct se_lun *lun, int explicit, int offline); @@ -335,8 +335,8 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) * the state is a primary or secondary target port asymmetric * access state. */ - rc = core_alua_check_transition(alua_access_state, - valid_states, &primary); + rc = core_alua_check_transition(alua_access_state, valid_states, + &primary, 1); if (rc) { /* * If the SET TARGET PORT GROUPS attempts to establish @@ -762,7 +762,7 @@ target_alua_state_check(struct se_cmd *cmd) * Check implicit and explicit ALUA state change request. */ static sense_reason_t -core_alua_check_transition(int state, int valid, int *primary) +core_alua_check_transition(int state, int valid, int *primary, int explicit) { /* * OPTIMIZED, NON-OPTIMIZED, STANDBY and UNAVAILABLE are @@ -804,11 +804,14 @@ core_alua_check_transition(int state, int valid, int *primary) *primary = 0; break; case ALUA_ACCESS_STATE_TRANSITION: - /* - * Transitioning is set internally, and - * cannot be selected manually. - */ - goto not_supported; + if (!(valid & ALUA_T_SUP) || explicit) + /* + * Transitioning is set internally and by tcmu daemon, + * and cannot be selected through a STPG. + */ + goto not_supported; + *primary = 0; + break; default: pr_err("Unknown ALUA access state: 0x%02x\n", state); return TCM_INVALID_PARAMETER_LIST; @@ -1070,7 +1073,7 @@ static int core_alua_do_transition_tg_pt( if (atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state) == new_state) return 0; - if (new_state == ALUA_ACCESS_STATE_TRANSITION) + if (explicit && new_state == ALUA_ACCESS_STATE_TRANSITION) return -EAGAIN; /* @@ -1091,10 +1094,6 @@ static int core_alua_do_transition_tg_pt( * Save the old primary ALUA access state, and set the current state * to ALUA_ACCESS_STATE_TRANSITION. */ - tg_pt_gp->tg_pt_gp_alua_previous_state = - atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state); - tg_pt_gp->tg_pt_gp_alua_pending_state = new_state; - atomic_set(&tg_pt_gp->tg_pt_gp_alua_access_state, ALUA_ACCESS_STATE_TRANSITION); tg_pt_gp->tg_pt_gp_alua_access_status = (explicit) ? 
@@ -1103,6 +1102,13 @@ static int core_alua_do_transition_tg_pt( core_alua_queue_state_change_ua(tg_pt_gp); + if (new_state == ALUA_ACCESS_STATE_TRANSITION) + return 0; + + tg_pt_gp->tg_pt_gp_alua_previous_state = + atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state); + tg_pt_gp->tg_pt_gp_alua_pending_state = new_state; + /* * Check for the optional ALUA primary state transition delay */ @@ -1144,7 +1150,8 @@ int core_alua_do_port_transition( return -ENODEV; valid_states = l_tg_pt_gp->tg_pt_gp_alua_supported_states; - if (core_alua_check_transition(new_state, valid_states, &primary) != 0) + if (core_alua_check_transition(new_state, valid_states, &primary, + explicit) != 0) return -EINVAL; local_lu_gp_mem = l_dev->dev_alua_lu_gp_mem; From 760bf578edf8122f2503a3a6a3f4b0de3b6ce0bb Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 2 Mar 2017 04:59:50 -0600 Subject: [PATCH 209/297] target: fix race during implicit transition work flushes This fixes the following races: 1. core_alua_do_transition_tg_pt could have read tg_pt_gp_alua_access_state and gone into this if chunk: if (!explicit && atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state) == ALUA_ACCESS_STATE_TRANSITION) { and then core_alua_do_transition_tg_pt_work could update the state. core_alua_do_transition_tg_pt would then only set tg_pt_gp_alua_pending_state, and tg_pt_gp_alua_access_state would not get updated with the second call's state. 2. core_alua_do_transition_tg_pt could be setting tg_pt_gp_transition_complete while tg_pt_gp_transition_work is already completing. core_alua_do_transition_tg_pt then waits on a completion that will never be signalled. To handle these issues, we just call flush_work, which returns when core_alua_do_transition_tg_pt_work has completed, so there is no need to do the complete/wait. And, if core_alua_do_transition_tg_pt_work was running, instead of trying to sneak in the state change, we just schedule another core_alua_do_transition_tg_pt_work call. Note that this does not handle a possible race where multiple threads call core_alua_do_transition_tg_pt at the same time. I think we need a mutex in target_tg_pt_gp_alua_access_state_store. Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_alua.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index 252d4e4b7b33..fd7c16a7ca6e 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -1079,16 +1079,8 @@ static int core_alua_do_transition_tg_pt( /* * Flush any pending transitions */ - if (!explicit && atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state) == - ALUA_ACCESS_STATE_TRANSITION) { - /* Just in case */ - tg_pt_gp->tg_pt_gp_alua_pending_state = new_state; - tg_pt_gp->tg_pt_gp_transition_complete = &wait; + if (!explicit) flush_work(&tg_pt_gp->tg_pt_gp_transition_work); - wait_for_completion(&wait); - tg_pt_gp->tg_pt_gp_transition_complete = NULL; - return 0; - } /* * Save the old primary ALUA access state, and set the current state From 972c7f167974fa41ea8a2eed4b857cc59f59c42c Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 9 Mar 2017 02:42:08 -0600 Subject: [PATCH 210/297] tcmu: add helper to check if dev was configured This adds a helper to check if the dev was configured. It will be used in the next patch to prevent updates to some config settings after the device has been set up.
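As a rough sketch of the pattern this helper enables, not code from the patch (my_dev, my_dev_configured and my_set_option are invented names): a backend keeps one cheap predicate for "has this device gone live?", and every setter that is only safe pre-configuration checks it first.

#include <linux/errno.h>
#include <linux/types.h>

struct my_dev {
	void *backing;			/* non-NULL once the device is configured */
	unsigned int option;
};

static bool my_dev_configured(struct my_dev *dev)
{
	return dev->backing != NULL;
}

static int my_set_option(struct my_dev *dev, unsigned int val)
{
	if (my_dev_configured(dev))
		return -EINVAL;		/* refuse changes once live */
	dev->option = val;
	return 0;
}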
Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 4339ab2133b3..892b311e7874 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -998,6 +998,11 @@ static void tcmu_dev_call_rcu(struct rcu_head *p) kfree(udev); } +static bool tcmu_dev_configured(struct tcmu_dev *udev) +{ + return udev->uio_info.uio_dev ? true : false; +} + static void tcmu_free_device(struct se_device *dev) { struct tcmu_dev *udev = TCMU_DEV(dev); @@ -1019,8 +1024,7 @@ static void tcmu_free_device(struct se_device *dev) spin_unlock_irq(&udev->commands_lock); WARN_ON(!all_expired); - /* Device was configured */ - if (udev->uio_info.uio_dev) { + if (tcmu_dev_configured(udev)) { tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, udev->uio_info.uio_dev->minor); From af980e46a26ac8805685bb70c8572dbc47abb126 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 9 Mar 2017 02:42:09 -0600 Subject: [PATCH 211/297] tcmu: make cmd timeout configurable A single daemon could implement multiple types of devices using multiple types of real devices that may not support restarting from crashes and/or handling tcmu timeouts. This makes the cmd timeout configurable, so handlers that do not support it can turn it off for now. Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 41 ++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 892b311e7874..10cc15f0b1fa 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -112,6 +112,7 @@ struct tcmu_dev { spinlock_t commands_lock; struct timer_list timeout; + unsigned int cmd_time_out; char dev_config[TCMU_CONFIG_LEN]; }; @@ -172,7 +173,9 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) tcmu_cmd->se_cmd = se_cmd; tcmu_cmd->tcmu_dev = udev; - tcmu_cmd->deadline = jiffies + msecs_to_jiffies(TCMU_TIME_OUT); + if (udev->cmd_time_out) + tcmu_cmd->deadline = jiffies + + msecs_to_jiffies(udev->cmd_time_out); idr_preload(GFP_KERNEL); spin_lock_irq(&udev->commands_lock); @@ -451,7 +454,11 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) pr_debug("sleeping for ring space\n"); spin_unlock_irq(&udev->cmdr_lock); - ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); + if (udev->cmd_time_out) + ret = schedule_timeout( + msecs_to_jiffies(udev->cmd_time_out)); + else + ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); finish_wait(&udev->wait_cmdr, &__wait); if (!ret) { pr_warn("tcmu: command timed out\n"); @@ -526,8 +533,9 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) /* TODO: only if FLUSH and FUA?
*/ uio_event_notify(&udev->uio_info); - mod_timer(&udev->timeout, - round_jiffies_up(jiffies + msecs_to_jiffies(TCMU_TIME_OUT))); + if (udev->cmd_time_out) + mod_timer(&udev->timeout, round_jiffies_up(jiffies + + msecs_to_jiffies(udev->cmd_time_out))); return TCM_NO_SENSE; } @@ -742,6 +750,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) } udev->hba = hba; + udev->cmd_time_out = TCMU_TIME_OUT; init_waitqueue_head(&udev->wait_cmdr); spin_lock_init(&udev->cmdr_lock); @@ -1037,7 +1046,7 @@ static void tcmu_free_device(struct se_device *dev) enum { Opt_dev_config, Opt_dev_size, Opt_hw_block_size, Opt_hw_max_sectors, - Opt_err, + Opt_cmd_time_out, Opt_err, }; static match_table_t tokens = { @@ -1045,6 +1054,7 @@ static match_table_t tokens = { {Opt_dev_size, "dev_size=%u"}, {Opt_hw_block_size, "hw_block_size=%u"}, {Opt_hw_max_sectors, "hw_max_sectors=%u"}, + {Opt_cmd_time_out, "cmd_time_out=%u"}, {Opt_err, NULL} }; @@ -1111,6 +1121,23 @@ static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, if (ret < 0) pr_err("kstrtoul() failed for dev_size=\n"); break; + case Opt_cmd_time_out: + if (tcmu_dev_configured(udev)) { + pr_err("Can not update cmd_time_out after device has been configured.\n"); + ret = -EINVAL; + break; + } + arg_p = match_strdup(&args[0]); + if (!arg_p) { + ret = -ENOMEM; + break; + } + ret = kstrtouint(arg_p, 0, &udev->cmd_time_out); + kfree(arg_p); + if (ret < 0) + pr_err("kstrtouint() failed for cmd_time_out=\n"); + udev->cmd_time_out *= MSEC_PER_SEC; + break; case Opt_hw_block_size: ret = tcmu_set_dev_attrib(&args[0], &(dev->dev_attrib.hw_block_size)); @@ -1138,7 +1165,9 @@ static ssize_t tcmu_show_configfs_dev_params(struct se_device *dev, char *b) bl = sprintf(b + bl, "Config: %s ", udev->dev_config[0] ? udev->dev_config : "NULL"); - bl += sprintf(b + bl, "Size: %zu\n", udev->dev_size); + bl += sprintf(b + bl, "Size: %zu ", udev->dev_size); + bl += sprintf(b + bl, "Cmd Time Out: %lu\n", + udev->cmd_time_out / MSEC_PER_SEC); return bl; } From 7d7a743543905a8297dce53b36e793e5307da5d7 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 18 Mar 2017 15:04:13 -0700 Subject: [PATCH 212/297] tcmu: Convert cmd_time_out into backend device attribute Instead of putting cmd_time_out under ../target/core/user_0/foo/control, which has historically been used by parameters needed for initial backend device configuration, go ahead and move cmd_time_out into a backend device attribute. In order to do this, tcmu_module_init() has been updated to create a local struct configfs_attribute **tcmu_attrs, that is based upon the existing passthrough_attrib_attrs along with the new cmd_time_out attribute. Once **tcm_attrs has been setup, go ahead and point it at tcmu_ops->tb_dev_attrib_attrs so it's picked up by target-core. Also following MNC's previous change, ->cmd_time_out is stored in milliseconds but exposed via configfs in seconds. Also, note this patch restricts the modification of ->cmd_time_out to before + after the TCMU device has been configured, but not while it has active fabric exports. 
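The array construction described above reduces to copying a NULL-terminated configfs_attribute array and appending one entry before the terminator. A minimal sketch under that assumption; extend_attrs is an invented name, and the patch open-codes the same steps in tcmu_module_init():

#include <linux/configfs.h>
#include <linux/slab.h>

static struct configfs_attribute **
extend_attrs(struct configfs_attribute **base, struct configfs_attribute *extra)
{
	struct configfs_attribute **attrs;
	int i, n = 0;

	while (base[n])			/* count the existing entries */
		n++;

	/* room for the existing entries, the new one, and the NULL */
	attrs = kcalloc(n + 2, sizeof(*attrs), GFP_KERNEL);
	if (!attrs)
		return NULL;

	for (i = 0; i < n; i++)
		attrs[i] = base[i];
	attrs[n] = extra;		/* attrs[n + 1] stays NULL */
	return attrs;
}

The NULL terminator is what lets target-core walk the array without a separate length field, which is why the patch sizes the allocation with two extra pointer slots.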
Cc: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 94 ++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 26 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 10cc15f0b1fa..c6874c38a10b 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1046,7 +1047,7 @@ static void tcmu_free_device(struct se_device *dev) enum { Opt_dev_config, Opt_dev_size, Opt_hw_block_size, Opt_hw_max_sectors, - Opt_cmd_time_out, Opt_err, + Opt_err, }; static match_table_t tokens = { @@ -1054,7 +1055,6 @@ static match_table_t tokens = { {Opt_dev_size, "dev_size=%u"}, {Opt_hw_block_size, "hw_block_size=%u"}, {Opt_hw_max_sectors, "hw_max_sectors=%u"}, - {Opt_cmd_time_out, "cmd_time_out=%u"}, {Opt_err, NULL} }; @@ -1121,23 +1121,6 @@ static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, if (ret < 0) pr_err("kstrtoul() failed for dev_size=\n"); break; - case Opt_cmd_time_out: - if (tcmu_dev_configured(udev)) { - pr_err("Can not update cmd_time_out after device has been configured.\n"); - ret = -EINVAL; - break; - } - arg_p = match_strdup(&args[0]); - if (!arg_p) { - ret = -ENOMEM; - break; - } - ret = kstrtouint(arg_p, 0, &udev->cmd_time_out); - kfree(arg_p); - if (ret < 0) - pr_err("kstrtouint() failed for cmd_time_out=\n"); - udev->cmd_time_out *= MSEC_PER_SEC; - break; case Opt_hw_block_size: ret = tcmu_set_dev_attrib(&args[0], &(dev->dev_attrib.hw_block_size)); @@ -1165,9 +1148,7 @@ static ssize_t tcmu_show_configfs_dev_params(struct se_device *dev, char *b) bl = sprintf(b + bl, "Config: %s ", udev->dev_config[0] ? udev->dev_config : "NULL"); - bl += sprintf(b + bl, "Size: %zu ", udev->dev_size); - bl += sprintf(b + bl, "Cmd Time Out: %lu\n", - udev->cmd_time_out / MSEC_PER_SEC); + bl += sprintf(b + bl, "Size: %zu\n", udev->dev_size); return bl; } @@ -1186,7 +1167,48 @@ tcmu_parse_cdb(struct se_cmd *cmd) return passthrough_parse_cdb(cmd, tcmu_queue_cmd); } -static const struct target_backend_ops tcmu_ops = { +static ssize_t tcmu_cmd_time_out_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = container_of(da->da_dev, + struct tcmu_dev, se_dev); + + return snprintf(page, PAGE_SIZE, "%lu\n", udev->cmd_time_out / MSEC_PER_SEC); +} + +static ssize_t tcmu_cmd_time_out_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = container_of(da->da_dev, + struct tcmu_dev, se_dev); + u32 val; + int ret; + + if (da->da_dev->export_count) { + pr_err("Unable to set tcmu cmd_time_out while exports exist\n"); + return -EINVAL; + } + + ret = kstrtou32(page, 0, &val); + if (ret < 0) + return ret; + + if (!val) { + pr_err("Illegal value for cmd_time_out\n"); + return -EINVAL; + } + + udev->cmd_time_out = val * MSEC_PER_SEC; + return count; +} +CONFIGFS_ATTR(tcmu_, cmd_time_out); + +static struct configfs_attribute **tcmu_attrs; + +static struct target_backend_ops tcmu_ops = { .name = "user", .owner = THIS_MODULE, .transport_flags = TRANSPORT_FLAG_PASSTHROUGH, @@ -1200,12 +1222,12 @@ static const struct target_backend_ops tcmu_ops = { .show_configfs_dev_params = tcmu_show_configfs_dev_params, .get_device_type = sbc_get_device_type, .get_blocks = 
tcmu_get_blocks, - .tb_dev_attrib_attrs = passthrough_attrib_attrs, + .tb_dev_attrib_attrs = NULL, }; static int __init tcmu_module_init(void) { - int ret; + int ret, i, len = 0; BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0); @@ -1227,12 +1249,31 @@ static int __init tcmu_module_init(void) goto out_unreg_device; } + for (i = 0; passthrough_attrib_attrs[i] != NULL; i++) { + len += sizeof(struct configfs_attribute *); + } + len += sizeof(struct configfs_attribute *) * 2; + + tcmu_attrs = kzalloc(len, GFP_KERNEL); + if (!tcmu_attrs) { + ret = -ENOMEM; + goto out_unreg_genl; + } + + for (i = 0; passthrough_attrib_attrs[i] != NULL; i++) { + tcmu_attrs[i] = passthrough_attrib_attrs[i]; + } + tcmu_attrs[i] = &tcmu_attr_cmd_time_out; + tcmu_ops.tb_dev_attrib_attrs = tcmu_attrs; + ret = transport_backend_register(&tcmu_ops); if (ret) - goto out_unreg_genl; + goto out_attrs; return 0; +out_attrs: + kfree(tcmu_attrs); out_unreg_genl: genl_unregister_family(&tcmu_genl_family); out_unreg_device: @@ -1246,6 +1287,7 @@ out_free_cache: static void __exit tcmu_module_exit(void) { target_backend_unregister(&tcmu_ops); + kfree(tcmu_attrs); genl_unregister_family(&tcmu_genl_family); root_device_unregister(tcmu_root_device); kmem_cache_destroy(tcmu_cmd_cache); From c4a9b538ab2a109c5f9798bea1f8f4bf93aadfb9 Mon Sep 17 00:00:00 2001 From: Joe Carnuccio Date: Wed, 15 Mar 2017 09:48:43 -0700 Subject: [PATCH 213/297] qla2xxx: Allow vref count to timeout on vport delete. Cc: Signed-off-by: Joe Carnuccio Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_attr.c | 4 +--- drivers/scsi/qla2xxx/qla_def.h | 6 +++++- drivers/scsi/qla2xxx/qla_init.c | 1 + drivers/scsi/qla2xxx/qla_mid.c | 14 ++++++++------ drivers/scsi/qla2xxx/qla_os.c | 1 + 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index f610103994af..435ff7fd6384 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -2154,8 +2154,6 @@ qla24xx_vport_delete(struct fc_vport *fc_vport) "Timer for the VP[%d] has stopped\n", vha->vp_idx); } - BUG_ON(atomic_read(&vha->vref_count)); - qla2x00_free_fcports(vha); mutex_lock(&ha->vport_lock); @@ -2166,7 +2164,7 @@ qla24xx_vport_delete(struct fc_vport *fc_vport) dma_free_coherent(&ha->pdev->dev, vha->gnl.size, vha->gnl.l, vha->gnl.ldma); - if (vha->qpair->vp_idx == vha->vp_idx) { + if (vha->qpair && vha->qpair->vp_idx == vha->vp_idx) { if (qla2xxx_delete_qpair(vha, vha->qpair) != QLA_SUCCESS) ql_log(ql_log_warn, vha, 0x7087, "Queue Pair delete failed.\n"); diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 625d438e3cce..8662ef4192db 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -4076,6 +4076,7 @@ typedef struct scsi_qla_host { /* Count of active session/fcport */ int fcport_count; wait_queue_head_t fcport_waitQ; + wait_queue_head_t vref_waitq; } scsi_qla_host_t; struct qla27xx_image_status { @@ -4131,14 +4132,17 @@ struct qla2_sgx { mb(); \ if (__vha->flags.delete_progress) { \ atomic_dec(&__vha->vref_count); \ + wake_up(&__vha->vref_waitq); \ __bail = 1; \ } else { \ __bail = 0; \ } \ } while (0) -#define QLA_VHA_MARK_NOT_BUSY(__vha) \ +#define QLA_VHA_MARK_NOT_BUSY(__vha) do { \ atomic_dec(&__vha->vref_count); \ + wake_up(&__vha->vref_waitq); \ +} while (0) \ #define QLA_QPAIR_MARK_BUSY(__qpair, __bail) do { \ atomic_inc(&__qpair->ref_count); \ diff --git 
a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 32fb9007f137..9f3740c68cc8 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -5148,6 +5148,7 @@ qla2x00_update_fcports(scsi_qla_host_t *base_vha) } } atomic_dec(&vha->vref_count); + wake_up(&vha->vref_waitq); } spin_unlock_irqrestore(&ha->vport_slock, flags); } diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index c6d6f0d912ff..09a490c98763 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -74,13 +74,14 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha) * ensures no active vp_list traversal while the vport is removed * from the queue) */ + wait_event_timeout(vha->vref_waitq, atomic_read(&vha->vref_count), + 10*HZ); + spin_lock_irqsave(&ha->vport_slock, flags); - while (atomic_read(&vha->vref_count)) { - spin_unlock_irqrestore(&ha->vport_slock, flags); - - msleep(500); - - spin_lock_irqsave(&ha->vport_slock, flags); + if (atomic_read(&vha->vref_count)) { + ql_dbg(ql_dbg_vport, vha, 0xfffa, + "vha->vref_count=%u timeout\n", vha->vref_count.counter); + vha->vref_count = (atomic_t)ATOMIC_INIT(0); } list_del(&vha->list); qlt_update_vp_map(vha, RESET_VP_IDX); @@ -269,6 +270,7 @@ qla2x00_alert_all_vps(struct rsp_que *rsp, uint16_t *mb) spin_lock_irqsave(&ha->vport_slock, flags); atomic_dec(&vha->vref_count); + wake_up(&vha->vref_waitq); } i++; } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 1fed235a1b4a..54d4e802bde0 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4268,6 +4268,7 @@ struct scsi_qla_host *qla2x00_create_host(struct scsi_host_template *sht, spin_lock_init(&vha->work_lock); spin_lock_init(&vha->cmd_list_lock); init_waitqueue_head(&vha->fcport_waitQ); + init_waitqueue_head(&vha->vref_waitq); vha->gnl.size = sizeof(struct get_name_list_extended) * (ha->max_loop_id + 1); From ae940f2c472a62904dc18234de5cf3ed28f195ee Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:44 -0700 Subject: [PATCH 214/297] qla2xxx: Fix memory leak for abts processing Cc: Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 45f5077684f0..ecf97c5993e8 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -6642,6 +6642,8 @@ qlt_handle_abts_recv_work(struct work_struct *work) spin_lock_irqsave(&ha->hardware_lock, flags); qlt_response_pkt_all_vps(vha, (response_t *)&op->atio); spin_unlock_irqrestore(&ha->hardware_lock, flags); + + kfree(op); } void From 8b666809e10cda9814af3e8be339d35b83909056 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:45 -0700 Subject: [PATCH 215/297] qla2xxx: Fix request queue corruption. When the FW notifies the driver, or the driver itself detects a low FW resource condition, the driver tries to send out a BUSY SCSI status to tell the initiator side to back off. During the send process, the hardware_lock was not held.
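The corruption comes from posting a busy-status IOCB on the request queue without the lock. A simplified sketch of the conditional-locking shape the fix gives the helper; emit() stands in for qlt_send_busy() and the names are invented:

#include <linux/spinlock.h>

static void send_busy_status(spinlock_t *hardware_lock, bool ha_locked,
			     void (*emit)(void *), void *arg)
{
	unsigned long flags = 0;

	if (!ha_locked)
		spin_lock_irqsave(hardware_lock, flags);
	emit(arg);		/* touches the request queue: lock required */
	if (!ha_locked)
		spin_unlock_irqrestore(hardware_lock, flags);
}

Callers that already run under the hardware_lock pass ha_locked = true, so the lock is taken exactly once on every path.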
Cc: Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index ecf97c5993e8..a463bcc57902 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -5079,16 +5079,22 @@ qlt_send_busy(struct scsi_qla_host *vha, static int qlt_chk_qfull_thresh_hold(struct scsi_qla_host *vha, - struct atio_from_isp *atio) + struct atio_from_isp *atio, bool ha_locked) { struct qla_hw_data *ha = vha->hw; uint16_t status; + unsigned long flags; if (ha->tgt.num_pend_cmds < Q_FULL_THRESH_HOLD(ha)) return 0; + if (!ha_locked) + spin_lock_irqsave(&ha->hardware_lock, flags); status = temp_sam_status; qlt_send_busy(vha, atio, status); + if (!ha_locked) + spin_unlock_irqrestore(&ha->hardware_lock, flags); + return 1; } @@ -5133,7 +5139,7 @@ static void qlt_24xx_atio_pkt(struct scsi_qla_host *vha, if (likely(atio->u.isp24.fcp_cmnd.task_mgmt_flags == 0)) { - rc = qlt_chk_qfull_thresh_hold(vha, atio); + rc = qlt_chk_qfull_thresh_hold(vha, atio, ha_locked); if (rc != 0) { tgt->atio_irq_cmd_count--; return; @@ -5256,7 +5262,7 @@ static void qlt_response_pkt(struct scsi_qla_host *vha, response_t *pkt) break; } - rc = qlt_chk_qfull_thresh_hold(vha, atio); + rc = qlt_chk_qfull_thresh_hold(vha, atio, true); if (rc != 0) { tgt->irq_cmd_count--; return; From 8f6fc8d4e7ae2347d6261d11a7eb2b247d2954d8 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:46 -0700 Subject: [PATCH 216/297] qla2xxx: Fix inadequate lock protection for ABTS. Normally, ABTS is sent to Target Core as a Task MGMT command. In the case of an error, qla2xxx needs to send a response; the hardware_lock is required to prevent request queue corruption.
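Together with the next patch, the locking rule sketched below emerges: the hardware_lock is never nested inside the sess_lock; submission happens under the sess_lock, and the hardware_lock is taken afterwards only to build the error response. A hedged illustration with invented names, not driver code:

#include <linux/spinlock.h>

static void abts_work(spinlock_t *sess_lock, spinlock_t *hardware_lock,
		      int (*submit)(void *), void (*send_reject)(void *),
		      void *cmd)
{
	unsigned long flags;
	int rc;

	spin_lock_irqsave(sess_lock, flags);
	rc = submit(cmd);	/* hand the ABTS to the upper layer */
	spin_unlock_irqrestore(sess_lock, flags);

	if (rc) {
		/* the error response goes on the request queue */
		spin_lock_irqsave(hardware_lock, flags);
		send_reject(cmd);
		spin_unlock_irqrestore(hardware_lock, flags);
	}
}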
Cc: Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index a463bcc57902..a78c3e9bcb57 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -130,6 +130,9 @@ static void qlt_send_term_imm_notif(struct scsi_qla_host *vha, static struct fc_port *qlt_create_sess(struct scsi_qla_host *vha, fc_port_t *fcport, bool local); void qlt_unreg_sess(struct fc_port *sess); +static void qlt_24xx_handle_abts(struct scsi_qla_host *, + struct abts_recv_from_24xx *); + /* * Global Variables */ @@ -389,6 +392,8 @@ static bool qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *vha, (struct abts_recv_from_24xx *)atio; struct scsi_qla_host *host = qlt_find_host_by_vp_idx(vha, entry->vp_index); + unsigned long flags; + if (unlikely(!host)) { ql_dbg(ql_dbg_tgt, vha, 0xffff, "qla_target(%d): Response pkt (ABTS_RECV_24XX) " @@ -396,9 +401,12 @@ static bool qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *vha, vha->vp_idx, entry->vp_index); break; } - qlt_response_pkt(host, (response_t *)atio); + if (!ha_locked) + spin_lock_irqsave(&host->hw->hardware_lock, flags); + qlt_24xx_handle_abts(host, (struct abts_recv_from_24xx *)atio); + if (!ha_locked) + spin_unlock_irqrestore(&host->hw->hardware_lock, flags); break; - } /* case PUREX_IOCB_TYPE: ql2xmvasynctoatio */ From f159b3c7cd45c550d0f73806451a10b6b6bc08ae Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:47 -0700 Subject: [PATCH 217/297] qla2xxx: Fix sess_lock & hardware_lock lock order problem. The main lock that needs to be held for CMD or TMR submission to the upper layer is the sess_lock. The sess_lock is used to serialize cmd submission and session deletion. Holding the hardware_lock in addition is not necessary. This patch removes the hardware_lock dependency from CMD/TMR submission; the hardware_lock is used only for the error response in this case.
Path1:
 CPU0                           CPU1
 ----                           ----
 lock(&(&ha->tgt.sess_lock)->rlock);
                                lock(&(&ha->hardware_lock)->rlock);
                                lock(&(&ha->tgt.sess_lock)->rlock);
 lock(&(&ha->hardware_lock)->rlock);
Path2/deadlock

 *** DEADLOCK ***

Call Trace:
 dump_stack+0x85/0xc2
 print_circular_bug+0x1e3/0x250
 __lock_acquire+0x1425/0x1620
 lock_acquire+0xbf/0x210
 _raw_spin_lock_irqsave+0x53/0x70
 qlt_sess_work_fn+0x21d/0x480 [qla2xxx]
 process_one_work+0x1f4/0x6e0

Cc: Cc: Bart Van Assche Reported-by: Bart Van Assche Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.c | 43 ++++++++++++++----------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index a78c3e9bcb57..989f931af156 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -5727,30 +5727,23 @@ static void qlt_abort_work(struct qla_tgt *tgt, } } - spin_lock_irqsave(&ha->hardware_lock, flags); - - if (tgt->tgt_stop) - goto out_term; - rc = __qlt_24xx_handle_abts(vha, &prm->abts, sess); + ha->tgt.tgt_ops->put_sess(sess); + spin_unlock_irqrestore(&ha->tgt.sess_lock, flags2); + if (rc != 0) goto out_term; - spin_unlock_irqrestore(&ha->hardware_lock, flags); - if (sess) - ha->tgt.tgt_ops->put_sess(sess); - spin_unlock_irqrestore(&ha->tgt.sess_lock, flags2); return; out_term2: - spin_lock_irqsave(&ha->hardware_lock, flags); - -out_term: - qlt_24xx_send_abts_resp(vha, &prm->abts, FCP_TMF_REJECTED, false); - spin_unlock_irqrestore(&ha->hardware_lock, flags); - if (sess) ha->tgt.tgt_ops->put_sess(sess); spin_unlock_irqrestore(&ha->tgt.sess_lock, flags2); + +out_term: + spin_lock_irqsave(&ha->hardware_lock, flags); + qlt_24xx_send_abts_resp(vha, &prm->abts, FCP_TMF_REJECTED, false); + spin_unlock_irqrestore(&ha->hardware_lock, flags); } static void qlt_tmr_work(struct qla_tgt *tgt, @@ -5770,7 +5763,7 @@ static void qlt_tmr_work(struct qla_tgt *tgt, spin_lock_irqsave(&ha->tgt.sess_lock, flags); if (tgt->tgt_stop) - goto out_term; + goto out_term2; s_id = prm->tm_iocb2.u.isp24.fcp_hdr.s_id; sess = ha->tgt.tgt_ops->find_sess_by_s_id(vha, s_id); @@ -5782,11 +5775,11 @@ static void qlt_tmr_work(struct qla_tgt *tgt, spin_lock_irqsave(&ha->tgt.sess_lock, flags); if (!sess) - goto out_term; + goto out_term2; } else { if (sess->deleted) { sess = NULL; - goto out_term; + goto out_term2; } if (!kref_get_unless_zero(&sess->sess_kref)) { @@ -5794,7 +5787,7 @@ "%s: kref_get fail %8phC\n", __func__, sess->port_name); sess = NULL; - goto out_term; + goto out_term2; } } @@ -5804,17 +5797,19 @@ unpacked_lun = scsilun_to_int((struct scsi_lun *)&lun); rc = qlt_issue_task_mgmt(sess, unpacked_lun, fn, iocb, 0); + ha->tgt.tgt_ops->put_sess(sess); + spin_unlock_irqrestore(&ha->tgt.sess_lock, flags); + if (rc != 0) goto out_term; - - ha->tgt.tgt_ops->put_sess(sess); - spin_unlock_irqrestore(&ha->tgt.sess_lock, flags); return; +out_term2: + if (sess) + ha->tgt.tgt_ops->put_sess(sess); + spin_unlock_irqrestore(&ha->tgt.sess_lock, flags); out_term: qlt_send_term_exchange(vha, NULL, &prm->tm_iocb2, 1, 0); - ha->tgt.tgt_ops->put_sess(sess); - spin_unlock_irqrestore(&ha->tgt.sess_lock, flags); } static void qlt_sess_work_fn(struct work_struct *work) From 5b33469a055c77001fd2c62b0f985c991b0e5b65 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:48 -0700 Subject: [PATCH 218/297] qla2xxx: Allow relogin to proceed
if remote login did not finish If the remote port has started the login process, then the PLOGI and PRLI should be back to back. The driver will allow the remote port to complete the process. For the case where the remote port decides to back off from sending PRLI, this local port sets an expiration timer for the PRLI. Once the expiration time passes, the relogin retry logic is allowed to go through and perform a login with the remote port. Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_def.h | 2 ++ drivers/scsi/qla2xxx/qla_init.c | 12 ++++++++++-- drivers/scsi/qla2xxx/qla_isr.c | 25 +++++++++++++++++++------ drivers/scsi/qla2xxx/qla_target.c | 1 + 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 8662ef4192db..56cd45fc600a 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -2300,6 +2300,8 @@ typedef struct fc_port { struct ct_sns_desc ct_desc; enum discovery_state disc_state; enum login_state fw_login_state; + unsigned long plogi_nack_done_deadline; + u32 login_gen, last_login_gen; u32 rscn_gen, last_rscn_gen; u32 chip_reset; diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 9f3740c68cc8..a7865a5d556d 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -876,10 +876,14 @@ int qla24xx_fcport_handle_login(struct scsi_qla_host *vha, fc_port_t *fcport) fcport->login_retry--; if ((fcport->fw_login_state == DSC_LS_PLOGI_PEND) || - (fcport->fw_login_state == DSC_LS_PLOGI_COMP) || (fcport->fw_login_state == DSC_LS_PRLI_PEND)) return 0; + if (fcport->fw_login_state == DSC_LS_PLOGI_COMP) { + if (time_before_eq(jiffies, fcport->plogi_nack_done_deadline)) + return 0; + } + /* for pure Target Mode.
Login will not be initiated */ if (vha->host->active_mode == MODE_TARGET) return 0; @@ -1041,10 +1045,14 @@ void qla24xx_handle_relogin_event(scsi_qla_host_t *vha, fcport->flags); if ((fcport->fw_login_state == DSC_LS_PLOGI_PEND) || - (fcport->fw_login_state == DSC_LS_PLOGI_COMP) || (fcport->fw_login_state == DSC_LS_PRLI_PEND)) return; + if (fcport->fw_login_state == DSC_LS_PLOGI_COMP) { + if (time_before_eq(jiffies, fcport->plogi_nack_done_deadline)) + return; + } + if (fcport->flags & FCF_ASYNC_SENT) { fcport->login_retry++; set_bit(RELOGIN_NEEDED, &vha->dpc_flags); diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index 3c66ea29de27..b2c6da752edd 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -1620,9 +1620,9 @@ qla24xx_logio_entry(scsi_qla_host_t *vha, struct req_que *req, QLA_LOGIO_LOGIN_RETRIED : 0; if (logio->entry_status) { ql_log(ql_log_warn, fcport->vha, 0x5034, - "Async-%s error entry - hdl=%x" + "Async-%s error entry - %8phC hdl=%x" "portid=%02x%02x%02x entry-status=%x.\n", - type, sp->handle, fcport->d_id.b.domain, + type, fcport->port_name, sp->handle, fcport->d_id.b.domain, fcport->d_id.b.area, fcport->d_id.b.al_pa, logio->entry_status); ql_dump_buffer(ql_dbg_async + ql_dbg_buffer, vha, 0x504d, @@ -1633,8 +1633,9 @@ qla24xx_logio_entry(scsi_qla_host_t *vha, struct req_que *req, if (le16_to_cpu(logio->comp_status) == CS_COMPLETE) { ql_dbg(ql_dbg_async, fcport->vha, 0x5036, - "Async-%s complete - hdl=%x portid=%02x%02x%02x " - "iop0=%x.\n", type, sp->handle, fcport->d_id.b.domain, + "Async-%s complete - %8phC hdl=%x portid=%02x%02x%02x " + "iop0=%x.\n", type, fcport->port_name, sp->handle, + fcport->d_id.b.domain, fcport->d_id.b.area, fcport->d_id.b.al_pa, le32_to_cpu(logio->io_parameter[0])); @@ -1674,6 +1675,17 @@ qla24xx_logio_entry(scsi_qla_host_t *vha, struct req_que *req, case LSC_SCODE_NPORT_USED: data[0] = MBS_LOOP_ID_USED; break; + case LSC_SCODE_CMD_FAILED: + if (iop[1] == 0x0606) { + /* + * PLOGI/PRLI Completed. We must have Recv PLOGI/PRLI, + * Target side acked. + */ + data[0] = MBS_COMMAND_COMPLETE; + goto logio_done; + } + data[0] = MBS_COMMAND_ERROR; + break; case LSC_SCODE_NOXCB: vha->hw->exch_starvation++; if (vha->hw->exch_starvation > 5) { @@ -1695,8 +1707,9 @@ qla24xx_logio_entry(scsi_qla_host_t *vha, struct req_que *req, } ql_dbg(ql_dbg_async, fcport->vha, 0x5037, - "Async-%s failed - hdl=%x portid=%02x%02x%02x comp=%x " - "iop0=%x iop1=%x.\n", type, sp->handle, fcport->d_id.b.domain, + "Async-%s failed - %8phC hdl=%x portid=%02x%02x%02x comp=%x " + "iop0=%x iop1=%x.\n", type, fcport->port_name, + sp->handle, fcport->d_id.b.domain, fcport->d_id.b.area, fcport->d_id.b.al_pa, le16_to_cpu(logio->comp_status), le32_to_cpu(logio->io_parameter[0]), diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 989f931af156..925d9b858b24 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -562,6 +562,7 @@ void qla2x00_async_nack_sp_done(void *s, int res) sp->fcport->login_gen++; sp->fcport->fw_login_state = DSC_LS_PLOGI_COMP; sp->fcport->logout_on_delete = 1; + sp->fcport->plogi_nack_done_deadline = jiffies + HZ; break; case SRB_NACK_PRLI: From be25152c0d9e236076323abbe9def9714234b761 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:49 -0700 Subject: [PATCH 219/297] qla2xxx: Improve T10-DIF/PI handling in driver. Add routines to support T10 DIF tag. 
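For background, the protection information these routines deal with is the standard 8-byte tuple stored with each logical block; the ASC/ASCQ values in the comments match the ones the patch reports on a miscompare. The struct and check function below are illustrative only (the actual tag checks run in HBA firmware, not in this driver code):

#include <linux/types.h>
#include <asm/byteorder.h>

struct dif_tuple_example {	/* invented name; mirrors the T10 layout */
	__be16 guard_tag;	/* CRC16 of the data block */
	__be16 app_tag;		/* application tag */
	__be32 ref_tag;		/* low 32 bits of the LBA for Type 1 */
};

static int check_dif_example(const struct dif_tuple_example *pi,
			     u16 exp_guard, u16 exp_app, u32 exp_ref)
{
	if (be16_to_cpu(pi->guard_tag) != exp_guard)
		return 1;	/* guard check failed: ASC/ASCQ 0x10/0x1 */
	if (be16_to_cpu(pi->app_tag) != exp_app)
		return 2;	/* app tag check failed: ASC/ASCQ 0x10/0x2 */
	if (be32_to_cpu(pi->ref_tag) != exp_ref)
		return 3;	/* ref tag check failed: ASC/ASCQ 0x10/0x3 */
	return 0;
}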
Signed-off-by: Quinn Tran Signed-off-by: Anil Gurumurthy Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_dbg.h | 1 + drivers/scsi/qla2xxx/qla_def.h | 10 + drivers/scsi/qla2xxx/qla_gbl.h | 6 +- drivers/scsi/qla2xxx/qla_iocb.c | 13 +- drivers/scsi/qla2xxx/qla_target.c | 542 +++++++++++++++++------------ drivers/scsi/qla2xxx/qla_target.h | 38 +- drivers/scsi/qla2xxx/tcm_qla2xxx.c | 49 ++- 7 files changed, 407 insertions(+), 252 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_dbg.h b/drivers/scsi/qla2xxx/qla_dbg.h index e1fc4e66966a..c6bffe929fe7 100644 --- a/drivers/scsi/qla2xxx/qla_dbg.h +++ b/drivers/scsi/qla2xxx/qla_dbg.h @@ -348,6 +348,7 @@ ql_log_pci(uint32_t, struct pci_dev *pdev, int32_t, const char *fmt, ...); #define ql_dbg_tgt 0x00004000 /* Target mode */ #define ql_dbg_tgt_mgt 0x00002000 /* Target mode management */ #define ql_dbg_tgt_tmr 0x00001000 /* Target mode task management */ +#define ql_dbg_tgt_dif 0x00000800 /* Target mode dif */ extern int qla27xx_dump_mpi_ram(struct qla_hw_data *, uint32_t, uint32_t *, uint32_t, void **); diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 56cd45fc600a..9d1d3dcf1c87 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -3127,6 +3127,16 @@ struct bidi_statistics { unsigned long long transfer_bytes; }; +struct qla_tc_param { + struct scsi_qla_host *vha; + uint32_t blk_sz; + uint32_t bufflen; + struct scatterlist *sg; + struct scatterlist *prot_sg; + struct crc_context *ctx; + uint8_t *ctx_dsd_alloced; +}; + /* Multi queue support */ #define MBC_INITIALIZE_MULTIQ 0x1f #define QLA_QUE_PAGE 0X1000 diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index b3d6441d1d90..ca6f122e5865 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -256,11 +256,11 @@ extern unsigned long qla2x00_get_async_timeout(struct scsi_qla_host *); extern void *qla2x00_alloc_iocbs(scsi_qla_host_t *, srb_t *); extern int qla2x00_issue_marker(scsi_qla_host_t *, int); extern int qla24xx_walk_and_build_sglist_no_difb(struct qla_hw_data *, srb_t *, - uint32_t *, uint16_t, struct qla_tgt_cmd *); + uint32_t *, uint16_t, struct qla_tc_param *); extern int qla24xx_walk_and_build_sglist(struct qla_hw_data *, srb_t *, - uint32_t *, uint16_t, struct qla_tgt_cmd *); + uint32_t *, uint16_t, struct qla_tc_param *); extern int qla24xx_walk_and_build_prot_sglist(struct qla_hw_data *, srb_t *, - uint32_t *, uint16_t, struct qla_tgt_cmd *); + uint32_t *, uint16_t, struct qla_tc_param *); extern int qla24xx_get_one_block_sg(uint32_t, struct qla2_sgx *, uint32_t *); extern int qla24xx_configure_prot_mode(srb_t *, uint16_t *); extern int qla24xx_build_scsi_crc_2_iocbs(srb_t *, diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index 535079280288..ea027f6a7fd4 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -889,7 +889,7 @@ qla24xx_get_one_block_sg(uint32_t blk_sz, struct qla2_sgx *sgx, int qla24xx_walk_and_build_sglist_no_difb(struct qla_hw_data *ha, srb_t *sp, - uint32_t *dsd, uint16_t tot_dsds, struct qla_tgt_cmd *tc) + uint32_t *dsd, uint16_t tot_dsds, struct qla_tc_param *tc) { void *next_dsd; uint8_t avail_dsds = 0; @@ -898,7 +898,6 @@ qla24xx_walk_and_build_sglist_no_difb(struct qla_hw_data *ha, srb_t *sp, struct scatterlist *sg_prot; uint32_t *cur_dsd = dsd; uint16_t used_dsds = tot_dsds; - uint32_t prot_int; /* protection interval */ uint32_t partial; 
struct qla2_sgx sgx; @@ -966,7 +965,7 @@ alloc_and_fill: } else { list_add_tail(&dsd_ptr->list, &(tc->ctx->dsd_list)); - tc->ctx_dsd_alloced = 1; + *tc->ctx_dsd_alloced = 1; } @@ -1005,7 +1004,7 @@ alloc_and_fill: int qla24xx_walk_and_build_sglist(struct qla_hw_data *ha, srb_t *sp, uint32_t *dsd, - uint16_t tot_dsds, struct qla_tgt_cmd *tc) + uint16_t tot_dsds, struct qla_tc_param *tc) { void *next_dsd; uint8_t avail_dsds = 0; @@ -1066,7 +1065,7 @@ qla24xx_walk_and_build_sglist(struct qla_hw_data *ha, srb_t *sp, uint32_t *dsd, } else { list_add_tail(&dsd_ptr->list, &(tc->ctx->dsd_list)); - tc->ctx_dsd_alloced = 1; + *tc->ctx_dsd_alloced = 1; } /* add new list to cmd iocb or last list */ @@ -1092,7 +1091,7 @@ qla24xx_walk_and_build_sglist(struct qla_hw_data *ha, srb_t *sp, uint32_t *dsd, int qla24xx_walk_and_build_prot_sglist(struct qla_hw_data *ha, srb_t *sp, - uint32_t *dsd, uint16_t tot_dsds, struct qla_tgt_cmd *tc) + uint32_t *dsd, uint16_t tot_dsds, struct qla_tc_param *tc) { void *next_dsd; uint8_t avail_dsds = 0; @@ -1158,7 +1157,7 @@ qla24xx_walk_and_build_prot_sglist(struct qla_hw_data *ha, srb_t *sp, } else { list_add_tail(&dsd_ptr->list, &(tc->ctx->dsd_list)); - tc->ctx_dsd_alloced = 1; + *tc->ctx_dsd_alloced = 1; } /* add new list to cmd iocb or last list */ diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 925d9b858b24..532004981dbd 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -143,6 +143,20 @@ static struct workqueue_struct *qla_tgt_wq; static DEFINE_MUTEX(qla_tgt_mutex); static LIST_HEAD(qla_tgt_glist); +static const char *prot_op_str(u32 prot_op) +{ + switch (prot_op) { + case TARGET_PROT_NORMAL: return "NORMAL"; + case TARGET_PROT_DIN_INSERT: return "DIN_INSERT"; + case TARGET_PROT_DOUT_INSERT: return "DOUT_INSERT"; + case TARGET_PROT_DIN_STRIP: return "DIN_STRIP"; + case TARGET_PROT_DOUT_STRIP: return "DOUT_STRIP"; + case TARGET_PROT_DIN_PASS: return "DIN_PASS"; + case TARGET_PROT_DOUT_PASS: return "DOUT_PASS"; + default: return "UNKNOWN"; + } +} + /* This API intentionally takes dest as a parameter, rather than returning * int value to avoid caller forgetting to issue wmb() after the store */ void qlt_do_generation_tick(struct scsi_qla_host *vha, int *dest) @@ -2022,6 +2036,70 @@ void qlt_free_mcmd(struct qla_tgt_mgmt_cmd *mcmd) } EXPORT_SYMBOL(qlt_free_mcmd); +/* + * ha->hardware_lock supposed to be held on entry. 
Might drop it, then + * reacquire + */ +void qlt_send_resp_ctio(scsi_qla_host_t *vha, struct qla_tgt_cmd *cmd, + uint8_t scsi_status, uint8_t sense_key, uint8_t asc, uint8_t ascq) +{ + struct atio_from_isp *atio = &cmd->atio; + struct ctio7_to_24xx *ctio; + uint16_t temp; + + ql_dbg(ql_dbg_tgt_dif, vha, 0x3066, + "Sending response CTIO7 (vha=%p, atio=%p, scsi_status=%02x, " + "sense_key=%02x, asc=%02x, ascq=%02x", + vha, atio, scsi_status, sense_key, asc, ascq); + + ctio = (struct ctio7_to_24xx *)qla2x00_alloc_iocbs(vha, NULL); + if (!ctio) { + ql_dbg(ql_dbg_async, vha, 0x3067, + "qla2x00t(%ld): %s failed: unable to allocate request packet", + vha->host_no, __func__); + goto out; + } + + ctio->entry_type = CTIO_TYPE7; + ctio->entry_count = 1; + ctio->handle = QLA_TGT_SKIP_HANDLE; + ctio->nport_handle = cmd->sess->loop_id; + ctio->timeout = cpu_to_le16(QLA_TGT_TIMEOUT); + ctio->vp_index = vha->vp_idx; + ctio->initiator_id[0] = atio->u.isp24.fcp_hdr.s_id[2]; + ctio->initiator_id[1] = atio->u.isp24.fcp_hdr.s_id[1]; + ctio->initiator_id[2] = atio->u.isp24.fcp_hdr.s_id[0]; + ctio->exchange_addr = atio->u.isp24.exchange_addr; + ctio->u.status1.flags = (atio->u.isp24.attr << 9) | + cpu_to_le16(CTIO7_FLAGS_STATUS_MODE_1 | CTIO7_FLAGS_SEND_STATUS); + temp = be16_to_cpu(atio->u.isp24.fcp_hdr.ox_id); + ctio->u.status1.ox_id = cpu_to_le16(temp); + ctio->u.status1.scsi_status = + cpu_to_le16(SS_RESPONSE_INFO_LEN_VALID | scsi_status); + ctio->u.status1.response_len = cpu_to_le16(18); + ctio->u.status1.residual = cpu_to_le32(get_datalen_for_atio(atio)); + + if (ctio->u.status1.residual != 0) + ctio->u.status1.scsi_status |= + cpu_to_le16(SS_RESIDUAL_UNDER); + + /* Response code and sense key */ + put_unaligned_le32(((0x70 << 24) | (sense_key << 8)), + (&ctio->u.status1.sense_data)[0]); + /* Additional sense length */ + put_unaligned_le32(0x0a, (&ctio->u.status1.sense_data)[1]); + /* ASC and ASCQ */ + put_unaligned_le32(((asc << 24) | (ascq << 16)), + (&ctio->u.status1.sense_data)[3]); + + /* Memory Barrier */ + wmb(); + + qla2x00_start_iocbs(vha, vha->req); +out: + return; +} + /* callback from target fabric module code */ void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *mcmd) { @@ -2270,7 +2348,7 @@ static int qlt_24xx_build_ctio_pkt(struct qla_tgt_prm *prm, */ return -EAGAIN; } else - ha->tgt.cmds[h-1] = prm->cmd; + ha->tgt.cmds[h - 1] = prm->cmd; pkt->handle = h | CTIO_COMPLETION_HANDLE_MARK; pkt->nport_handle = prm->cmd->loop_id; @@ -2400,6 +2478,50 @@ static inline int qlt_has_data(struct qla_tgt_cmd *cmd) return cmd->bufflen > 0; } +static void qlt_print_dif_err(struct qla_tgt_prm *prm) +{ + struct qla_tgt_cmd *cmd; + struct scsi_qla_host *vha; + + /* asc 0x10=dif error */ + if (prm->sense_buffer && (prm->sense_buffer[12] == 0x10)) { + cmd = prm->cmd; + vha = cmd->vha; + /* ASCQ */ + switch (prm->sense_buffer[13]) { + case 1: + ql_dbg(ql_dbg_tgt_dif, vha, 0xffff, + "BE detected Guard TAG ERR: lba[0x%llx|%lld] len[0x%x] " + "se_cmd=%p tag[%x]", + cmd->lba, cmd->lba, cmd->num_blks, &cmd->se_cmd, + cmd->atio.u.isp24.exchange_addr); + break; + case 2: + ql_dbg(ql_dbg_tgt_dif, vha, 0xffff, + "BE detected APP TAG ERR: lba[0x%llx|%lld] len[0x%x] " + "se_cmd=%p tag[%x]", + cmd->lba, cmd->lba, cmd->num_blks, &cmd->se_cmd, + cmd->atio.u.isp24.exchange_addr); + break; + case 3: + ql_dbg(ql_dbg_tgt_dif, vha, 0xffff, + "BE detected REF TAG ERR: lba[0x%llx|%lld] len[0x%x] " + "se_cmd=%p tag[%x]", + cmd->lba, cmd->lba, cmd->num_blks, &cmd->se_cmd, + cmd->atio.u.isp24.exchange_addr); + break; + default: + 
ql_dbg(ql_dbg_tgt_dif, vha, 0xffff, + "BE detected Dif ERR: lba[%llx|%lld] len[%x] " + "se_cmd=%p tag[%x]", + cmd->lba, cmd->lba, cmd->num_blks, &cmd->se_cmd, + cmd->atio.u.isp24.exchange_addr); + break; + } + ql_dump_buffer(ql_dbg_tgt_dif, vha, 0xffff, cmd->cdb, 16); + } +} + /* * Called without ha->hardware_lock held */ @@ -2521,18 +2643,9 @@ skip_explict_conf: for (i = 0; i < prm->sense_buffer_len/4; i++) ((uint32_t *)ctio->u.status1.sense_data)[i] = cpu_to_be32(((uint32_t *)prm->sense_buffer)[i]); -#if 0 - if (unlikely((prm->sense_buffer_len % 4) != 0)) { - static int q; - if (q < 10) { - ql_dbg(ql_dbg_tgt, vha, 0xe04f, - "qla_target(%d): %d bytes of sense " - "lost", prm->tgt->ha->vp_idx, - prm->sense_buffer_len % 4); - q++; - } - } -#endif + + qlt_print_dif_err(prm); + } else { ctio->u.status1.flags &= ~cpu_to_le16(CTIO7_FLAGS_STATUS_MODE_0); @@ -2546,19 +2659,9 @@ skip_explict_conf: /* Sense with len > 24, is it possible ??? */ } - - -/* diff */ static inline int qlt_hba_err_chk_enabled(struct se_cmd *se_cmd) { - /* - * Uncomment when corresponding SCSI changes are done. - * - if (!sp->cmd->prot_chk) - return 0; - * - */ switch (se_cmd->prot_op) { case TARGET_PROT_DOUT_INSERT: case TARGET_PROT_DIN_STRIP: @@ -2579,16 +2682,38 @@ qlt_hba_err_chk_enabled(struct se_cmd *se_cmd) return 0; } -/* - * qla24xx_set_t10dif_tags_from_cmd - Extract Ref and App tags from SCSI command - * - */ -static inline void -qlt_set_t10dif_tags(struct se_cmd *se_cmd, struct crc_context *ctx) +static inline int +qla_tgt_ref_mask_check(struct se_cmd *se_cmd) { - uint32_t lba = 0xffffffff & se_cmd->t_task_lba; + switch (se_cmd->prot_op) { + case TARGET_PROT_DIN_INSERT: + case TARGET_PROT_DOUT_INSERT: + case TARGET_PROT_DIN_STRIP: + case TARGET_PROT_DOUT_STRIP: + case TARGET_PROT_DIN_PASS: + case TARGET_PROT_DOUT_PASS: + return 1; + default: + return 0; + } + return 0; +} - /* wait til Mode Sense/Select cmd, modepage Ah, subpage 2 +/* + * qla_tgt_set_dif_tags - Extract Ref and App tags from SCSI command + */ +static void +qla_tgt_set_dif_tags(struct qla_tgt_cmd *cmd, struct crc_context *ctx, + uint16_t *pfw_prot_opts) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + uint32_t lba = 0xffffffff & se_cmd->t_task_lba; + scsi_qla_host_t *vha = cmd->tgt->vha; + struct qla_hw_data *ha = vha->hw; + uint32_t t32 = 0; + + /* + * wait till Mode Sense/Select cmd, modepage Ah, subpage 2 * have been immplemented by TCM, before AppTag is avail. * Look for modesense_handlers[] */ @@ -2596,65 +2721,73 @@ qlt_set_t10dif_tags(struct se_cmd *se_cmd, struct crc_context *ctx) ctx->app_tag_mask[0] = 0x0; ctx->app_tag_mask[1] = 0x0; + if (IS_PI_UNINIT_CAPABLE(ha)) { + if ((se_cmd->prot_type == TARGET_DIF_TYPE1_PROT) || + (se_cmd->prot_type == TARGET_DIF_TYPE2_PROT)) + *pfw_prot_opts |= PO_DIS_VALD_APP_ESC; + else if (se_cmd->prot_type == TARGET_DIF_TYPE3_PROT) + *pfw_prot_opts |= PO_DIS_VALD_APP_REF_ESC; + } + + t32 = ha->tgt.tgt_ops->get_dif_tags(cmd, pfw_prot_opts); + switch (se_cmd->prot_type) { case TARGET_DIF_TYPE0_PROT: /* - * No check for ql2xenablehba_err_chk, as it would be an - * I/O error if hba tag generation is not done. + * No check for ql2xenablehba_err_chk, as it + * would be an I/O error if hba tag generation + * is not done. 
*/ ctx->ref_tag = cpu_to_le32(lba); - - if (!qlt_hba_err_chk_enabled(se_cmd)) - break; - /* enable ALL bytes of the ref tag */ ctx->ref_tag_mask[0] = 0xff; ctx->ref_tag_mask[1] = 0xff; ctx->ref_tag_mask[2] = 0xff; ctx->ref_tag_mask[3] = 0xff; break; - /* - * For TYpe 1 protection: 16 bit GUARD tag, 32 bit REF tag, and - * 16 bit app tag. - */ case TARGET_DIF_TYPE1_PROT: - ctx->ref_tag = cpu_to_le32(lba); - - if (!qlt_hba_err_chk_enabled(se_cmd)) - break; - - /* enable ALL bytes of the ref tag */ - ctx->ref_tag_mask[0] = 0xff; - ctx->ref_tag_mask[1] = 0xff; - ctx->ref_tag_mask[2] = 0xff; - ctx->ref_tag_mask[3] = 0xff; - break; - /* - * For TYPE 2 protection: 16 bit GUARD + 32 bit REF tag has to - * match LBA in CDB + N - */ + /* + * For TYPE 1 protection: 16 bit GUARD tag, 32 bit + * REF tag, and 16 bit app tag. + */ + ctx->ref_tag = cpu_to_le32(lba); + if (!qla_tgt_ref_mask_check(se_cmd) || + !(ha->tgt.tgt_ops->chk_dif_tags(t32))) { + *pfw_prot_opts |= PO_DIS_REF_TAG_VALD; + break; + } + /* enable ALL bytes of the ref tag */ + ctx->ref_tag_mask[0] = 0xff; + ctx->ref_tag_mask[1] = 0xff; + ctx->ref_tag_mask[2] = 0xff; + ctx->ref_tag_mask[3] = 0xff; + break; case TARGET_DIF_TYPE2_PROT: - ctx->ref_tag = cpu_to_le32(lba); - - if (!qlt_hba_err_chk_enabled(se_cmd)) - break; - - /* enable ALL bytes of the ref tag */ - ctx->ref_tag_mask[0] = 0xff; - ctx->ref_tag_mask[1] = 0xff; - ctx->ref_tag_mask[2] = 0xff; - ctx->ref_tag_mask[3] = 0xff; - break; - - /* For Type 3 protection: 16 bit GUARD only */ + /* + * For TYPE 2 protection: 16 bit GUARD + 32 bit REF + * tag has to match LBA in CDB + N + */ + ctx->ref_tag = cpu_to_le32(lba); + if (!qla_tgt_ref_mask_check(se_cmd) || + !(ha->tgt.tgt_ops->chk_dif_tags(t32))) { + *pfw_prot_opts |= PO_DIS_REF_TAG_VALD; + break; + } + /* enable ALL bytes of the ref tag */ + ctx->ref_tag_mask[0] = 0xff; + ctx->ref_tag_mask[1] = 0xff; + ctx->ref_tag_mask[2] = 0xff; + ctx->ref_tag_mask[3] = 0xff; + break; case TARGET_DIF_TYPE3_PROT: - ctx->ref_tag_mask[0] = ctx->ref_tag_mask[1] = - ctx->ref_tag_mask[2] = ctx->ref_tag_mask[3] = 0x00; - break; + /* For TYPE 3 protection: 16 bit GUARD only */ + *pfw_prot_opts |= PO_DIS_REF_TAG_VALD; + ctx->ref_tag_mask[0] = ctx->ref_tag_mask[1] = + ctx->ref_tag_mask[2] = ctx->ref_tag_mask[3] = 0x00; + break; } } - static inline int qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) { @@ -2673,6 +2806,7 @@ qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) struct se_cmd *se_cmd = &cmd->se_cmd; uint32_t h; struct atio_from_isp *atio = &prm->cmd->atio; + struct qla_tc_param tc; uint16_t t16; ha = vha->hw; @@ -2698,16 +2832,15 @@ qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) case TARGET_PROT_DIN_INSERT: case TARGET_PROT_DOUT_STRIP: transfer_length = data_bytes; - data_bytes += dif_bytes; + if (cmd->prot_sg_cnt) + data_bytes += dif_bytes; break; - case TARGET_PROT_DIN_STRIP: case TARGET_PROT_DOUT_INSERT: case TARGET_PROT_DIN_PASS: case TARGET_PROT_DOUT_PASS: transfer_length = data_bytes + dif_bytes; break; - default: BUG(); break; @@ -2743,7 +2876,6 @@ qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) break; } - /* ---- PKT ---- */ /* Update entry type to indicate Command Type CRC_2 IOCB */ pkt->entry_type = CTIO_CRC2; @@ -2761,9 +2893,8 @@ qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) } else ha->tgt.cmds[h-1] = prm->cmd; - pkt->handle = h | CTIO_COMPLETION_HANDLE_MARK; - pkt->nport_handle = prm->cmd->loop_id; + pkt->nport_handle = 
cpu_to_le16(prm->cmd->loop_id); pkt->timeout = cpu_to_le16(QLA_TGT_TIMEOUT); pkt->initiator_id[0] = atio->u.isp24.fcp_hdr.s_id[2]; pkt->initiator_id[1] = atio->u.isp24.fcp_hdr.s_id[1]; @@ -2784,12 +2915,10 @@ qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) else if (cmd->dma_data_direction == DMA_FROM_DEVICE) pkt->flags = cpu_to_le16(CTIO7_FLAGS_DATA_OUT); - pkt->dseg_count = prm->tot_dsds; /* Fibre channel byte count */ pkt->transfer_length = cpu_to_le32(transfer_length); - /* ----- CRC context -------- */ /* Allocate CRC context from global pool */ @@ -2809,13 +2938,12 @@ qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) /* Set handle */ crc_ctx_pkt->handle = pkt->handle; - qlt_set_t10dif_tags(se_cmd, crc_ctx_pkt); + qla_tgt_set_dif_tags(cmd, crc_ctx_pkt, &fw_prot_opts); pkt->crc_context_address[0] = cpu_to_le32(LSD(crc_ctx_dma)); pkt->crc_context_address[1] = cpu_to_le32(MSD(crc_ctx_dma)); pkt->crc_context_len = CRC_CONTEXT_LEN_FW; - if (!bundling) { cur_dsd = (uint32_t *) &crc_ctx_pkt->u.nobundling.data_address; } else { @@ -2836,16 +2964,24 @@ qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) crc_ctx_pkt->byte_count = cpu_to_le32(data_bytes); crc_ctx_pkt->guard_seed = cpu_to_le16(0); + memset((uint8_t *)&tc, 0 , sizeof(tc)); + tc.vha = vha; + tc.blk_sz = cmd->blk_sz; + tc.bufflen = cmd->bufflen; + tc.sg = cmd->sg; + tc.prot_sg = cmd->prot_sg; + tc.ctx = crc_ctx_pkt; + tc.ctx_dsd_alloced = &cmd->ctx_dsd_alloced; /* Walks data segments */ pkt->flags |= cpu_to_le16(CTIO7_FLAGS_DSD_PTR); if (!bundling && prm->prot_seg_cnt) { if (qla24xx_walk_and_build_sglist_no_difb(ha, NULL, cur_dsd, - prm->tot_dsds, cmd)) + prm->tot_dsds, &tc)) goto crc_queuing_error; } else if (qla24xx_walk_and_build_sglist(ha, NULL, cur_dsd, - (prm->tot_dsds - prm->prot_seg_cnt), cmd)) + (prm->tot_dsds - prm->prot_seg_cnt), &tc)) goto crc_queuing_error; if (bundling && prm->prot_seg_cnt) { @@ -2854,18 +2990,18 @@ qlt_build_ctio_crc2_pkt(struct qla_tgt_prm *prm, scsi_qla_host_t *vha) cur_dsd = (uint32_t *) &crc_ctx_pkt->u.bundling.dif_address; if (qla24xx_walk_and_build_prot_sglist(ha, NULL, cur_dsd, - prm->prot_seg_cnt, cmd)) + prm->prot_seg_cnt, &tc)) goto crc_queuing_error; } return QLA_SUCCESS; crc_queuing_error: /* Cleanup will be performed by the caller */ + vha->hw->tgt.cmds[h - 1] = NULL; return QLA_FUNCTION_FAILED; } - /* * Callback to setup response of xmit_type of QLA_TGT_XMIT_DATA and * * QLA_TGT_XMIT_STATUS for >= 24xx silicon @@ -3113,139 +3249,113 @@ EXPORT_SYMBOL(qlt_rdy_to_xfer); /* - * Checks the guard or meta-data for the type of error - * detected by the HBA. + * it is assumed either hardware_lock or qpair lock is held. 
*/ -static inline int +static void qlt_handle_dif_error(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd, - struct ctio_crc_from_fw *sts) + struct ctio_crc_from_fw *sts) { uint8_t *ap = &sts->actual_dif[0]; uint8_t *ep = &sts->expected_dif[0]; - uint32_t e_ref_tag, a_ref_tag; - uint16_t e_app_tag, a_app_tag; - uint16_t e_guard, a_guard; uint64_t lba = cmd->se_cmd.t_task_lba; + uint8_t scsi_status, sense_key, asc, ascq; + unsigned long flags; - a_guard = be16_to_cpu(*(uint16_t *)(ap + 0)); - a_app_tag = be16_to_cpu(*(uint16_t *)(ap + 2)); - a_ref_tag = be32_to_cpu(*(uint32_t *)(ap + 4)); + cmd->trc_flags |= TRC_DIF_ERR; - e_guard = be16_to_cpu(*(uint16_t *)(ep + 0)); - e_app_tag = be16_to_cpu(*(uint16_t *)(ep + 2)); - e_ref_tag = be32_to_cpu(*(uint32_t *)(ep + 4)); + cmd->a_guard = be16_to_cpu(*(uint16_t *)(ap + 0)); + cmd->a_app_tag = be16_to_cpu(*(uint16_t *)(ap + 2)); + cmd->a_ref_tag = be32_to_cpu(*(uint32_t *)(ap + 4)); - ql_dbg(ql_dbg_tgt, vha, 0xe075, - "iocb(s) %p Returned STATUS.\n", sts); + cmd->e_guard = be16_to_cpu(*(uint16_t *)(ep + 0)); + cmd->e_app_tag = be16_to_cpu(*(uint16_t *)(ep + 2)); + cmd->e_ref_tag = be32_to_cpu(*(uint32_t *)(ep + 4)); - ql_dbg(ql_dbg_tgt, vha, 0xf075, - "dif check TGT cdb 0x%x lba 0x%llx: [Actual|Expected] Ref Tag[0x%x|0x%x], App Tag [0x%x|0x%x], Guard [0x%x|0x%x]\n", - cmd->atio.u.isp24.fcp_cmnd.cdb[0], lba, - a_ref_tag, e_ref_tag, a_app_tag, e_app_tag, a_guard, e_guard); + ql_dbg(ql_dbg_tgt_dif, vha, 0xf075, + "%s: aborted %d state %d\n", __func__, cmd->aborted, cmd->state); - /* - * Ignore sector if: - * For type 3: ref & app tag is all 'f's - * For type 0,1,2: app tag is all 'f's - */ - if ((a_app_tag == 0xffff) && - ((cmd->se_cmd.prot_type != TARGET_DIF_TYPE3_PROT) || - (a_ref_tag == 0xffffffff))) { - uint32_t blocks_done; + scsi_status = sense_key = asc = ascq = 0; - /* 2TB boundary case covered automatically with this */ - blocks_done = e_ref_tag - (uint32_t)lba + 1; - cmd->se_cmd.bad_sector = e_ref_tag; - cmd->se_cmd.pi_err = 0; - ql_dbg(ql_dbg_tgt, vha, 0xf074, - "need to return scsi good\n"); + /* check appl tag */ + if (cmd->e_app_tag != cmd->a_app_tag) { + ql_dbg(ql_dbg_tgt_dif, vha, 0xffff, + "App Tag ERR: cdb[%x] lba[%llx %llx] blks[%x] [Actual|Expected] " + "Ref[%x|%x], App[%x|%x], " + "Guard [%x|%x] cmd=%p ox_id[%04x]", + cmd->cdb[0], lba, (lba+cmd->num_blks), cmd->num_blks, + cmd->a_ref_tag, cmd->e_ref_tag, + cmd->a_app_tag, cmd->e_app_tag, + cmd->a_guard, cmd->e_guard, + cmd, cmd->atio.u.isp24.fcp_hdr.ox_id); - /* Update protection tag */ - if (cmd->prot_sg_cnt) { - uint32_t i, k = 0, num_ent; - struct scatterlist *sg, *sgl; - - - sgl = cmd->prot_sg; - - /* Patch the corresponding protection tags */ - for_each_sg(sgl, sg, cmd->prot_sg_cnt, i) { - num_ent = sg_dma_len(sg) / 8; - if (k + num_ent < blocks_done) { - k += num_ent; - continue; - } - k = blocks_done; - break; - } - - if (k != blocks_done) { - ql_log(ql_log_warn, vha, 0xf076, - "unexpected tag values tag:lba=%u:%llu)\n", - e_ref_tag, (unsigned long long)lba); - goto out; - } - -#if 0 - struct sd_dif_tuple *spt; - /* TODO: - * This section came from initiator. Is it valid here? - * should ulp be override with actual val??? 
- */ - spt = page_address(sg_page(sg)) + sg->offset; - spt += j; - - spt->app_tag = 0xffff; - if (cmd->se_cmd.prot_type == SCSI_PROT_DIF_TYPE3) - spt->ref_tag = 0xffffffff; -#endif - } - - return 0; - } - - /* check guard */ - if (e_guard != a_guard) { - cmd->se_cmd.pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; - cmd->se_cmd.bad_sector = cmd->se_cmd.t_task_lba; - - ql_log(ql_log_warn, vha, 0xe076, - "Guard ERR: cdb 0x%x lba 0x%llx: [Actual|Expected] Ref Tag[0x%x|0x%x], App Tag [0x%x|0x%x], Guard [0x%x|0x%x] cmd=%p\n", - cmd->atio.u.isp24.fcp_cmnd.cdb[0], lba, - a_ref_tag, e_ref_tag, a_app_tag, e_app_tag, - a_guard, e_guard, cmd); - goto out; + cmd->dif_err_code = DIF_ERR_APP; + scsi_status = SAM_STAT_CHECK_CONDITION; + sense_key = ABORTED_COMMAND; + asc = 0x10; + ascq = 0x2; } /* check ref tag */ - if (e_ref_tag != a_ref_tag) { - cmd->se_cmd.pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; - cmd->se_cmd.bad_sector = e_ref_tag; + if (cmd->e_ref_tag != cmd->a_ref_tag) { + ql_dbg(ql_dbg_tgt_dif, vha, 0xffff, + "Ref Tag ERR: cdb[%x] lba[%llx %llx] blks[%x] [Actual|Expected] " + "Ref[%x|%x], App[%x|%x], " + "Guard[%x|%x] cmd=%p ox_id[%04x] ", + cmd->cdb[0], lba, (lba+cmd->num_blks), cmd->num_blks, + cmd->a_ref_tag, cmd->e_ref_tag, + cmd->a_app_tag, cmd->e_app_tag, + cmd->a_guard, cmd->e_guard, + cmd, cmd->atio.u.isp24.fcp_hdr.ox_id); - ql_log(ql_log_warn, vha, 0xe077, - "Ref Tag ERR: cdb 0x%x lba 0x%llx: [Actual|Expected] Ref Tag[0x%x|0x%x], App Tag [0x%x|0x%x], Guard [0x%x|0x%x] cmd=%p\n", - cmd->atio.u.isp24.fcp_cmnd.cdb[0], lba, - a_ref_tag, e_ref_tag, a_app_tag, e_app_tag, - a_guard, e_guard, cmd); + cmd->dif_err_code = DIF_ERR_REF; + scsi_status = SAM_STAT_CHECK_CONDITION; + sense_key = ABORTED_COMMAND; + asc = 0x10; + ascq = 0x3; goto out; } - /* check appl tag */ - if (e_app_tag != a_app_tag) { - cmd->se_cmd.pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; - cmd->se_cmd.bad_sector = cmd->se_cmd.t_task_lba; - - ql_log(ql_log_warn, vha, 0xe078, - "App Tag ERR: cdb 0x%x lba 0x%llx: [Actual|Expected] Ref Tag[0x%x|0x%x], App Tag [0x%x|0x%x], Guard [0x%x|0x%x] cmd=%p\n", - cmd->atio.u.isp24.fcp_cmnd.cdb[0], lba, - a_ref_tag, e_ref_tag, a_app_tag, e_app_tag, - a_guard, e_guard, cmd); - goto out; + /* check guard */ + if (cmd->e_guard != cmd->a_guard) { + ql_dbg(ql_dbg_tgt_dif, vha, 0xffff, + "Guard ERR: cdb[%x] lba[%llx %llx] blks[%x] [Actual|Expected] " + "Ref[%x|%x], App[%x|%x], " + "Guard [%x|%x] cmd=%p ox_id[%04x]", + cmd->cdb[0], lba, (lba+cmd->num_blks), cmd->num_blks, + cmd->a_ref_tag, cmd->e_ref_tag, + cmd->a_app_tag, cmd->e_app_tag, + cmd->a_guard, cmd->e_guard, + cmd, cmd->atio.u.isp24.fcp_hdr.ox_id); + cmd->dif_err_code = DIF_ERR_GRD; + scsi_status = SAM_STAT_CHECK_CONDITION; + sense_key = ABORTED_COMMAND; + asc = 0x10; + ascq = 0x1; } out: - return 1; -} + switch (cmd->state) { + case QLA_TGT_STATE_NEED_DATA: + /* handle_data will load DIF error code */ + cmd->state = QLA_TGT_STATE_DATA_IN; + vha->hw->tgt.tgt_ops->handle_data(cmd); + break; + default: + spin_lock_irqsave(&cmd->cmd_lock, flags); + if (cmd->aborted) { + spin_unlock_irqrestore(&cmd->cmd_lock, flags); + vha->hw->tgt.tgt_ops->free_cmd(cmd); + break; + } + spin_unlock_irqrestore(&cmd->cmd_lock, flags); + qlt_send_resp_ctio(vha, cmd, scsi_status, sense_key, asc, ascq); + /* assume scsi status gets out on the wire. + * Will not wait for completion. 
+ */ + vha->hw->tgt.tgt_ops->free_cmd(cmd); + break; + } +} /* If hardware_lock held on entry, might drop it, then reaquire */ /* This function sends the appropriate CTIO to ISP 2xxx or 24xx */ @@ -3552,6 +3662,16 @@ static int qlt_term_ctio_exchange(struct scsi_qla_host *vha, void *ctio, { int term = 0; + if (cmd->se_cmd.prot_op) + ql_dbg(ql_dbg_tgt_dif, vha, 0xffff, + "Term DIF cmd: lba[0x%llx|%lld] len[0x%x] " + "se_cmd=%p tag[%x] op %#x/%s", + cmd->lba, cmd->lba, + cmd->num_blks, &cmd->se_cmd, + cmd->atio.u.isp24.exchange_addr, + cmd->se_cmd.prot_op, + prot_op_str(cmd->se_cmd.prot_op)); + if (ctio != NULL) { struct ctio7_from_24xx *c = (struct ctio7_from_24xx *)ctio; term = !(c->flags & @@ -3769,32 +3889,15 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha, uint32_t handle, struct ctio_crc_from_fw *crc = (struct ctio_crc_from_fw *)ctio; ql_dbg(ql_dbg_tgt_mgt, vha, 0xf073, - "qla_target(%d): CTIO with DIF_ERROR status %x received (state %x, se_cmd %p) actual_dif[0x%llx] expect_dif[0x%llx]\n", + "qla_target(%d): CTIO with DIF_ERROR status %x " + "received (state %x, ulp_cmd %p) actual_dif[0x%llx] " + "expect_dif[0x%llx]\n", vha->vp_idx, status, cmd->state, se_cmd, *((u64 *)&crc->actual_dif[0]), *((u64 *)&crc->expected_dif[0])); - if (qlt_handle_dif_error(vha, cmd, ctio)) { - if (cmd->state == QLA_TGT_STATE_NEED_DATA) { - /* scsi Write/xfer rdy complete */ - goto skip_term; - } else { - /* scsi read/xmit respond complete - * call handle dif to send scsi status - * rather than terminate exchange. - */ - cmd->state = QLA_TGT_STATE_PROCESSED; - ha->tgt.tgt_ops->handle_dif_err(cmd); - return; - } - } else { - /* Need to generate a SCSI good completion. - * because FW did not send scsi status. - */ - status = 0; - goto skip_term; - } - break; + qlt_handle_dif_error(vha, cmd, ctio); + return; } default: ql_dbg(ql_dbg_tgt_mgt, vha, 0xf05b, @@ -3817,7 +3920,6 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha, uint32_t handle, return; } } -skip_term: if (cmd->state == QLA_TGT_STATE_PROCESSED) { cmd->trc_flags |= TRC_CTIO_DONE; diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h index a7f90dcaae37..c35f889b94a6 100644 --- a/drivers/scsi/qla2xxx/qla_target.h +++ b/drivers/scsi/qla2xxx/qla_target.h @@ -378,6 +378,14 @@ static inline void adjust_corrupted_atio(struct atio_from_isp *atio) atio->u.isp24.fcp_cmnd.add_cdb_len = 0; } +static inline int get_datalen_for_atio(struct atio_from_isp *atio) +{ + int len = atio->u.isp24.fcp_cmnd.add_cdb_len; + + return (be32_to_cpu(get_unaligned((uint32_t *) + &atio->u.isp24.fcp_cmnd.add_cdb[len * 4]))); +} + #define CTIO_TYPE7 0x12 /* Continue target I/O entry (for 24xx) */ /* @@ -667,7 +675,6 @@ struct qla_tgt_func_tmpl { int (*handle_cmd)(struct scsi_qla_host *, struct qla_tgt_cmd *, unsigned char *, uint32_t, int, int, int); void (*handle_data)(struct qla_tgt_cmd *); - void (*handle_dif_err)(struct qla_tgt_cmd *); int (*handle_tmr)(struct qla_tgt_mgmt_cmd *, uint32_t, uint16_t, uint32_t); void (*free_cmd)(struct qla_tgt_cmd *); @@ -684,6 +691,8 @@ struct qla_tgt_func_tmpl { void (*clear_nacl_from_fcport_map)(struct fc_port *); void (*put_sess)(struct fc_port *); void (*shutdown_sess)(struct fc_port *); + int (*get_dif_tags)(struct qla_tgt_cmd *cmd, uint16_t *pfw_prot_opts); + int (*chk_dif_tags)(uint32_t tag); }; int qla2x00_wait_for_hba_online(struct scsi_qla_host *); @@ -720,8 +729,8 @@ int qla2x00_wait_for_hba_online(struct scsi_qla_host *); #define QLA_TGT_ABORT_ALL 0xFFFE #define 
QLA_TGT_NEXUS_LOSS_SESS 0xFFFD #define QLA_TGT_NEXUS_LOSS 0xFFFC -#define QLA_TGT_ABTS 0xFFFB -#define QLA_TGT_2G_ABORT_TASK 0xFFFA +#define QLA_TGT_ABTS 0xFFFB +#define QLA_TGT_2G_ABORT_TASK 0xFFFA /* Notify Acknowledge flags */ #define NOTIFY_ACK_RES_COUNT BIT_8 @@ -845,6 +854,7 @@ enum trace_flags { TRC_CMD_FREE = BIT_17, TRC_DATA_IN = BIT_18, TRC_ABORT = BIT_19, + TRC_DIF_ERR = BIT_20, }; struct qla_tgt_cmd { @@ -862,7 +872,6 @@ struct qla_tgt_cmd { unsigned int sg_mapped:1; unsigned int free_sg:1; unsigned int write_data_transferred:1; - unsigned int ctx_dsd_alloced:1; unsigned int q_full:1; unsigned int term_exchg:1; unsigned int cmd_sent_to_fw:1; @@ -885,11 +894,25 @@ struct qla_tgt_cmd { struct list_head cmd_list; struct atio_from_isp atio; - /* t10dif */ + + uint8_t ctx_dsd_alloced; + + /* T10-DIF */ +#define DIF_ERR_NONE 0 +#define DIF_ERR_GRD 1 +#define DIF_ERR_REF 2 +#define DIF_ERR_APP 3 + int8_t dif_err_code; struct scatterlist *prot_sg; uint32_t prot_sg_cnt; - uint32_t blk_sz; + uint32_t blk_sz, num_blks; + uint8_t scsi_status, sense_key, asc, ascq; + struct crc_context *ctx; + uint8_t *cdb; + uint64_t lba; + uint16_t a_guard, e_guard, a_app_tag, e_app_tag; + uint32_t a_ref_tag, e_ref_tag; uint64_t jiffies_at_alloc; uint64_t jiffies_at_free; @@ -1053,4 +1076,7 @@ extern int qlt_free_qfull_cmds(struct scsi_qla_host *); extern void qlt_logo_completion_handler(fc_port_t *, int); extern void qlt_do_generation_tick(struct scsi_qla_host *, int *); +void qlt_send_resp_ctio(scsi_qla_host_t *, struct qla_tgt_cmd *, uint8_t, + uint8_t, uint8_t, uint8_t); + #endif /* __QLA_TARGET_H */ diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index 8e8ab0fa9672..7443e4efa3ae 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -531,6 +531,24 @@ static void tcm_qla2xxx_handle_data_work(struct work_struct *work) return; } + switch (cmd->dif_err_code) { + case DIF_ERR_GRD: + cmd->se_cmd.pi_err = + TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case DIF_ERR_REF: + cmd->se_cmd.pi_err = + TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case DIF_ERR_APP: + cmd->se_cmd.pi_err = + TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + case DIF_ERR_NONE: + default: + break; + } + if (cmd->se_cmd.pi_err) transport_generic_request_failure(&cmd->se_cmd, cmd->se_cmd.pi_err); @@ -555,25 +573,23 @@ static void tcm_qla2xxx_handle_data(struct qla_tgt_cmd *cmd) queue_work_on(smp_processor_id(), tcm_qla2xxx_free_wq, &cmd->work); } -static void tcm_qla2xxx_handle_dif_work(struct work_struct *work) +static int tcm_qla2xxx_chk_dif_tags(uint32_t tag) { - struct qla_tgt_cmd *cmd = container_of(work, struct qla_tgt_cmd, work); - - /* take an extra kref to prevent cmd free too early. - * need to wait for SCSI status/check condition to - * finish responding generate by transport_generic_request_failure. 
- */ - kref_get(&cmd->se_cmd.cmd_kref); - transport_generic_request_failure(&cmd->se_cmd, cmd->se_cmd.pi_err); + return 0; } -/* - * Called from qla_target.c:qlt_do_ctio_completion() - */ -static void tcm_qla2xxx_handle_dif_err(struct qla_tgt_cmd *cmd) +static int tcm_qla2xxx_dif_tags(struct qla_tgt_cmd *cmd, + uint16_t *pfw_prot_opts) { - INIT_WORK(&cmd->work, tcm_qla2xxx_handle_dif_work); - queue_work(tcm_qla2xxx_free_wq, &cmd->work); + struct se_cmd *se_cmd = &cmd->se_cmd; + + if (!(se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD)) + *pfw_prot_opts |= PO_DISABLE_GUARD_CHECK; + + if (!(se_cmd->prot_checks & TARGET_DIF_CHECK_APPTAG)) + *pfw_prot_opts |= PO_DIS_APP_TAG_VALD; + + return 0; } /* @@ -1610,7 +1626,6 @@ static void tcm_qla2xxx_update_sess(struct fc_port *sess, port_id_t s_id, static struct qla_tgt_func_tmpl tcm_qla2xxx_template = { .handle_cmd = tcm_qla2xxx_handle_cmd, .handle_data = tcm_qla2xxx_handle_data, - .handle_dif_err = tcm_qla2xxx_handle_dif_err, .handle_tmr = tcm_qla2xxx_handle_tmr, .free_cmd = tcm_qla2xxx_free_cmd, .free_mcmd = tcm_qla2xxx_free_mcmd, @@ -1622,6 +1637,8 @@ static struct qla_tgt_func_tmpl tcm_qla2xxx_template = { .clear_nacl_from_fcport_map = tcm_qla2xxx_clear_nacl_from_fcport_map, .put_sess = tcm_qla2xxx_put_sess, .shutdown_sess = tcm_qla2xxx_shutdown_sess, + .get_dif_tags = tcm_qla2xxx_dif_tags, + .chk_dif_tags = tcm_qla2xxx_chk_dif_tags, }; static int tcm_qla2xxx_init_lport(struct tcm_qla2xxx_lport *lport) From 54b9993c8cf2d77c0f23be828a22e0817f742442 Mon Sep 17 00:00:00 2001 From: Anil Gurumurthy Date: Wed, 15 Mar 2017 09:48:50 -0700 Subject: [PATCH 220/297] qla2xxx: Export DIF stats via debugfs Signed-off-by: Anil Gurumurthy Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_def.h | 12 ++++++++++++ drivers/scsi/qla2xxx/qla_dfs.c | 15 +++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 9d1d3dcf1c87..8228dfac6a31 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -3108,6 +3108,16 @@ struct qla_chip_state_84xx { uint32_t gold_fw_version; }; +struct qla_dif_statistics { + uint64_t dif_input_bytes; + uint64_t dif_output_bytes; + uint64_t dif_input_requests; + uint64_t dif_output_requests; + uint32_t dif_guard_err; + uint32_t dif_ref_tag_err; + uint32_t dif_app_tag_err; +}; + struct qla_statistics { uint32_t total_isp_aborts; uint64_t input_bytes; @@ -3120,6 +3130,8 @@ struct qla_statistics { uint32_t stat_max_pend_cmds; uint32_t stat_max_qfull_cmds_alloc; uint32_t stat_max_qfull_cmds_dropped; + + struct qla_dif_statistics qla_dif_stats; }; struct bidi_statistics { diff --git a/drivers/scsi/qla2xxx/qla_dfs.c b/drivers/scsi/qla2xxx/qla_dfs.c index b48cce696bac..3b35905619b0 100644 --- a/drivers/scsi/qla2xxx/qla_dfs.c +++ b/drivers/scsi/qla2xxx/qla_dfs.c @@ -114,6 +114,21 @@ qla_dfs_tgt_counters_show(struct seq_file *s, void *unused) seq_printf(s, "num Q full sent = %lld\n", vha->tgt_counters.num_q_full_sent); + /* DIF stats */ + seq_printf(s, "DIF Inp Bytes = %lld\n", + vha->qla_stats.qla_dif_stats.dif_input_bytes); + seq_printf(s, "DIF Outp Bytes = %lld\n", + vha->qla_stats.qla_dif_stats.dif_output_bytes); + seq_printf(s, "DIF Inp Req = %lld\n", + vha->qla_stats.qla_dif_stats.dif_input_requests); + seq_printf(s, "DIF Outp Req = %lld\n", + vha->qla_stats.qla_dif_stats.dif_output_requests); + seq_printf(s, "DIF Guard err = %d\n", + vha->qla_stats.qla_dif_stats.dif_guard_err); + seq_printf(s, "DIF Ref 
tag err = %d\n", + vha->qla_stats.qla_dif_stats.dif_ref_tag_err); + seq_printf(s, "DIF App tag err = %d\n", + vha->qla_stats.qla_dif_stats.dif_app_tag_err); return 0; } From f1443eebca7792b3b8b41b27652d67ddc5d31fa2 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:51 -0700 Subject: [PATCH 221/297] qla2xxx: Add async new target notification Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.c | 6 +++--- drivers/scsi/qla2xxx/qla_target.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 532004981dbd..563116188c43 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -6005,13 +6005,13 @@ int qlt_add_target(struct qla_hw_data *ha, struct scsi_qla_host *base_vha) tgt->datasegs_per_cmd = QLA_TGT_DATASEGS_PER_CMD_24XX; tgt->datasegs_per_cont = QLA_TGT_DATASEGS_PER_CONT_24XX; - if (base_vha->fc_vport) - return 0; - mutex_lock(&qla_tgt_mutex); list_add_tail(&tgt->tgt_list_entry, &qla_tgt_glist); mutex_unlock(&qla_tgt_mutex); + if (ha->tgt.tgt_ops && ha->tgt.tgt_ops->add_target) + ha->tgt.tgt_ops->add_target(base_vha); + return 0; } diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h index c35f889b94a6..d64420251194 100644 --- a/drivers/scsi/qla2xxx/qla_target.h +++ b/drivers/scsi/qla2xxx/qla_target.h @@ -693,6 +693,7 @@ struct qla_tgt_func_tmpl { void (*shutdown_sess)(struct fc_port *); int (*get_dif_tags)(struct qla_tgt_cmd *cmd, uint16_t *pfw_prot_opts); int (*chk_dif_tags)(uint32_t tag); + void (*add_target)(struct scsi_qla_host *); }; int qla2x00_wait_for_hba_online(struct scsi_qla_host *); From 15f30a5752287f20c7de428423c34bc51cfbe465 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:52 -0700 Subject: [PATCH 222/297] qla2xxx: Use IOCB interface to submit non-critical MBX. The Mailbox interface is currently oversubscribed. We would like to reserve the Mailbox interface for chip management and link initialization. Any non-essential Mailbox command will be routed through the IOCB interface, which is able to absorb more commands.
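Conceptually, the flow this patch introduces is the classic submit-and-wait pattern: wrap the mailbox registers in an SRB, queue it as an IOCB on the request ring, and sleep on a completion that the interrupt path signals. The sketch below shows just that shape; my_cmd, my_cmd_done and my_submit_to_ring are hypothetical stand-ins, not driver symbols, and only the completion API (init_completion/complete/wait_for_completion) is the real kernel interface.

	#include <linux/completion.h>

	/* Hypothetical stand-in for the driver's per-command bookkeeping. */
	struct my_cmd {
		struct completion comp;	/* signalled by the done callback */
		int rc;			/* completion status for the waiter */
	};

	/* Done callback, typically run from the ISR/response-queue path. */
	static void my_cmd_done(struct my_cmd *cmd, int res)
	{
		cmd->rc = res;
		complete(&cmd->comp);
	}

	/* Submit the IOCB, then sleep until my_cmd_done() fires;
	 * no mailbox register slot is held while we wait.
	 */
	static int my_send_and_wait(struct my_cmd *cmd,
				    int (*my_submit_to_ring)(struct my_cmd *))
	{
		int rval;

		init_completion(&cmd->comp);

		rval = my_submit_to_ring(cmd);	/* queue on the request ring */
		if (rval)
			return rval;

		wait_for_completion(&cmd->comp);
		return cmd->rc;
	}
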
The following commands are now routed through the IOCB interface: - Get ID List (007Ch) - Get Port DB (0064h) - Get Link Priv Stats (006Dh) Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_def.h | 12 +- drivers/scsi/qla2xxx/qla_gbl.h | 10 +- drivers/scsi/qla2xxx/qla_init.c | 46 +---- drivers/scsi/qla2xxx/qla_isr.c | 2 +- drivers/scsi/qla2xxx/qla_mbx.c | 270 ++++++++++++++++++++++++++++-- drivers/scsi/qla2xxx/qla_target.c | 4 +- 6 files changed, 279 insertions(+), 65 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 8228dfac6a31..ae38b7a789b1 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -395,11 +395,15 @@ struct srb_iocb { struct completion comp; } abt; struct ct_arg ctarg; +#define MAX_IOCB_MB_REG 28 +#define SIZEOF_IOCB_MB_REG (MAX_IOCB_MB_REG * sizeof(uint16_t)) struct { - __le16 in_mb[28]; /* fr fw */ - __le16 out_mb[28]; /* to fw */ + __le16 in_mb[MAX_IOCB_MB_REG]; /* from FW */ + __le16 out_mb[MAX_IOCB_MB_REG]; /* to FW */ void *out, *in; dma_addr_t out_dma, in_dma; + struct completion comp; + int rc; } mbx; struct { struct imm_ntfy_from_isp *ntfy; @@ -437,7 +441,7 @@ typedef struct srb { uint32_t handle; uint16_t flags; uint16_t type; - char *name; + const char *name; int iocbs; struct qla_qpair *qpair; u32 gen1; /* scratch */ @@ -3364,6 +3368,8 @@ struct qla_hw_data { uint32_t exlogins_enabled:1; uint32_t exchoffld_enabled:1; /* 35 bits */ + + uint32_t fw_started:1; } flags; /* This spinlock is used to protect "io transactions", you must diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index ca6f122e5865..323b912b47f7 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -193,6 +193,7 @@ extern int qla24xx_post_upd_fcport_work(struct scsi_qla_host *, fc_port_t *); void qla2x00_handle_login_done_event(struct scsi_qla_host *, fc_port_t *, uint16_t *); int qla24xx_post_gnl_work(struct scsi_qla_host *, fc_port_t *); +int qla24xx_async_abort_cmd(srb_t *); /* * Global Functions in qla_mid.c source file. */ @@ -368,7 +369,7 @@ qla2x00_get_link_status(scsi_qla_host_t *, uint16_t, struct link_statistics *, extern int qla24xx_get_isp_stats(scsi_qla_host_t *, struct link_statistics *, - dma_addr_t, uint); + dma_addr_t, uint16_t); extern int qla24xx_abort_command(srb_t *); extern int qla24xx_async_abort_command(srb_t *); @@ -472,6 +473,13 @@ qla2x00_dump_mctp_data(scsi_qla_host_t *, dma_addr_t, uint32_t, uint32_t); extern int qla26xx_dport_diagnostics(scsi_qla_host_t *, void *, uint, uint); +int qla24xx_send_mb_cmd(struct scsi_qla_host *, mbx_cmd_t *); +int qla24xx_gpdb_wait(struct scsi_qla_host *, fc_port_t *, u8); +int qla24xx_gidlist_wait(struct scsi_qla_host *, void *, dma_addr_t, + uint16_t *); +int __qla24xx_parse_gpdb(struct scsi_qla_host *, fc_port_t *, + struct port_database_24xx *); + /* * Global Function Prototypes in qla_isr.c source file.
*/ diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index a7865a5d556d..b1bfa63f7d4e 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -629,7 +629,6 @@ void qla24xx_async_gpdb_sp_done(void *s, int res) struct srb *sp = s; struct scsi_qla_host *vha = sp->vha; struct qla_hw_data *ha = vha->hw; - uint64_t zero = 0; struct port_database_24xx *pd; fc_port_t *fcport = sp->fcport; u16 *mb = sp->u.iocb_cmd.u.mbx.in_mb; @@ -649,48 +648,7 @@ void qla24xx_async_gpdb_sp_done(void *s, int res) pd = (struct port_database_24xx *)sp->u.iocb_cmd.u.mbx.in; - /* Check for logged in state. */ - if (pd->current_login_state != PDS_PRLI_COMPLETE && - pd->last_login_state != PDS_PRLI_COMPLETE) { - ql_dbg(ql_dbg_mbx, vha, 0xffff, - "Unable to verify login-state (%x/%x) for " - "loop_id %x.\n", pd->current_login_state, - pd->last_login_state, fcport->loop_id); - rval = QLA_FUNCTION_FAILED; - goto gpd_error_out; - } - - if (fcport->loop_id == FC_NO_LOOP_ID || - (memcmp(fcport->port_name, (uint8_t *)&zero, 8) && - memcmp(fcport->port_name, pd->port_name, 8))) { - /* We lost the device mid way. */ - rval = QLA_NOT_LOGGED_IN; - goto gpd_error_out; - } - - /* Names are little-endian. */ - memcpy(fcport->node_name, pd->node_name, WWN_SIZE); - - /* Get port_id of device. */ - fcport->d_id.b.domain = pd->port_id[0]; - fcport->d_id.b.area = pd->port_id[1]; - fcport->d_id.b.al_pa = pd->port_id[2]; - fcport->d_id.b.rsvd_1 = 0; - - /* If not target must be initiator or unknown type. */ - if ((pd->prli_svc_param_word_3[0] & BIT_4) == 0) - fcport->port_type = FCT_INITIATOR; - else - fcport->port_type = FCT_TARGET; - - /* Passback COS information. */ - fcport->supported_classes = (pd->flags & PDF_CLASS_2) ? - FC_COS_CLASS2 : FC_COS_CLASS3; - - if (pd->prli_svc_param_word_3[0] & BIT_7) { - fcport->flags |= FCF_CONF_COMP_SUPPORTED; - fcport->conf_compl_supported = 1; - } + rval = __qla24xx_parse_gpdb(vha, fcport, pd); gpd_error_out: memset(&ea, 0, sizeof(ea)); @@ -1266,7 +1224,7 @@ qla24xx_abort_sp_done(void *ptr, int res) complete(&abt->u.abt.comp); } -static int +int qla24xx_async_abort_cmd(srb_t *cmd_sp) { scsi_qla_host_t *vha = cmd_sp->vha; diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index b2c6da752edd..3953c8d6af69 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -2692,7 +2692,7 @@ qla24xx_abort_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, return; abt = &sp->u.iocb_cmd; - abt->u.abt.comp_status = le32_to_cpu(pkt->nport_handle); + abt->u.abt.comp_status = le16_to_cpu(pkt->nport_handle); sp->done(sp, 0); } diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 35079f417417..e40ed570d3c1 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -10,6 +10,28 @@ #include #include +static struct mb_cmd_name { + uint16_t cmd; + const char *str; +} mb_str[] = { + {MBC_GET_PORT_DATABASE, "GPDB"}, + {MBC_GET_ID_LIST, "GIDList"}, + {MBC_GET_LINK_PRIV_STATS, "Stats"}, +}; + +static const char *mb_to_str(uint16_t cmd) +{ + int i; + struct mb_cmd_name *e; + + for (i = 0; i < ARRAY_SIZE(mb_str); i++) { + e = mb_str + i; + if (cmd == e->cmd) + return e->str; + } + return "unknown"; +} + static struct rom_cmd { uint16_t cmd; } rom_cmds[] = { @@ -2818,7 +2840,7 @@ qla2x00_get_link_status(scsi_qla_host_t *vha, uint16_t loop_id, int qla24xx_get_isp_stats(scsi_qla_host_t *vha, struct link_statistics *stats, - dma_addr_t stats_dma, uint options) + 
dma_addr_t stats_dma, uint16_t options) { int rval; mbx_cmd_t mc; @@ -2828,19 +2850,17 @@ qla24xx_get_isp_stats(scsi_qla_host_t *vha, struct link_statistics *stats, ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x1088, "Entered %s.\n", __func__); - mcp->mb[0] = MBC_GET_LINK_PRIV_STATS; - mcp->mb[2] = MSW(stats_dma); - mcp->mb[3] = LSW(stats_dma); - mcp->mb[6] = MSW(MSD(stats_dma)); - mcp->mb[7] = LSW(MSD(stats_dma)); - mcp->mb[8] = sizeof(struct link_statistics) / 4; - mcp->mb[9] = vha->vp_idx; - mcp->mb[10] = options; - mcp->out_mb = MBX_10|MBX_9|MBX_8|MBX_7|MBX_6|MBX_3|MBX_2|MBX_0; - mcp->in_mb = MBX_2|MBX_1|MBX_0; - mcp->tov = MBX_TOV_SECONDS; - mcp->flags = IOCTL_CMD; - rval = qla2x00_mailbox_command(vha, mcp); + memset(&mc, 0, sizeof(mc)); + mc.mb[0] = MBC_GET_LINK_PRIV_STATS; + mc.mb[2] = MSW(stats_dma); + mc.mb[3] = LSW(stats_dma); + mc.mb[6] = MSW(MSD(stats_dma)); + mc.mb[7] = LSW(MSD(stats_dma)); + mc.mb[8] = sizeof(struct link_statistics) / 4; + mc.mb[9] = cpu_to_le16(vha->vp_idx); + mc.mb[10] = cpu_to_le16(options); + + rval = qla24xx_send_mb_cmd(vha, &mc); if (rval == QLA_SUCCESS) { if (mcp->mb[0] != MBS_COMMAND_COMPLETE) { @@ -5827,3 +5847,225 @@ qla26xx_dport_diagnostics(scsi_qla_host_t *vha, return rval; } + +static void qla2x00_async_mb_sp_done(void *s, int res) +{ + struct srb *sp = s; + + sp->u.iocb_cmd.u.mbx.rc = res; + + complete(&sp->u.iocb_cmd.u.mbx.comp); + /* don't free sp here. Let the caller do the free */ +} + +/* + * This mailbox uses the iocb interface to send MB command. + * This allows non-critial (non chip setup) command to go + * out in parrallel. + */ +int qla24xx_send_mb_cmd(struct scsi_qla_host *vha, mbx_cmd_t *mcp) +{ + int rval = QLA_FUNCTION_FAILED; + srb_t *sp; + struct srb_iocb *c; + + if (!vha->hw->flags.fw_started) + goto done; + + sp = qla2x00_get_sp(vha, NULL, GFP_KERNEL); + if (!sp) + goto done; + + sp->type = SRB_MB_IOCB; + sp->name = mb_to_str(mcp->mb[0]); + + qla2x00_init_timer(sp, qla2x00_get_async_timeout(vha) + 2); + + memcpy(sp->u.iocb_cmd.u.mbx.out_mb, mcp->mb, SIZEOF_IOCB_MB_REG); + + c = &sp->u.iocb_cmd; + c->timeout = qla2x00_async_iocb_timeout; + init_completion(&c->u.mbx.comp); + + sp->done = qla2x00_async_mb_sp_done; + + rval = qla2x00_start_sp(sp); + if (rval != QLA_SUCCESS) { + ql_dbg(ql_dbg_mbx, vha, 0xffff, + "%s: %s Failed submission. %x.\n", + __func__, sp->name, rval); + goto done_free_sp; + } + + ql_dbg(ql_dbg_mbx, vha, 0xffff, "MB:%s hndl %x submitted\n", + sp->name, sp->handle); + + wait_for_completion(&c->u.mbx.comp); + memcpy(mcp->mb, sp->u.iocb_cmd.u.mbx.in_mb, SIZEOF_IOCB_MB_REG); + + rval = c->u.mbx.rc; + switch (rval) { + case QLA_FUNCTION_TIMEOUT: + ql_dbg(ql_dbg_mbx, vha, 0xffff, "%s: %s Timeout. %x.\n", + __func__, sp->name, rval); + break; + case QLA_SUCCESS: + ql_dbg(ql_dbg_mbx, vha, 0xffff, "%s: %s done.\n", + __func__, sp->name); + sp->free(sp); + break; + default: + ql_dbg(ql_dbg_mbx, vha, 0xffff, "%s: %s Failed. 
%x.\n", + __func__, sp->name, rval); + sp->free(sp); + break; + } + + return rval; + +done_free_sp: + sp->free(sp); +done: + return rval; +} + +/* + * qla24xx_gpdb_wait + * NOTE: Do not call this routine from DPC thread + */ +int qla24xx_gpdb_wait(struct scsi_qla_host *vha, fc_port_t *fcport, u8 opt) +{ + int rval = QLA_FUNCTION_FAILED; + dma_addr_t pd_dma; + struct port_database_24xx *pd; + struct qla_hw_data *ha = vha->hw; + mbx_cmd_t mc; + + if (!vha->hw->flags.fw_started) + goto done; + + pd = dma_pool_alloc(ha->s_dma_pool, GFP_KERNEL, &pd_dma); + if (pd == NULL) { + ql_log(ql_log_warn, vha, 0xffff, + "Failed to allocate port database structure.\n"); + goto done_free_sp; + } + memset(pd, 0, max(PORT_DATABASE_SIZE, PORT_DATABASE_24XX_SIZE)); + + memset(&mc, 0, sizeof(mc)); + mc.mb[0] = MBC_GET_PORT_DATABASE; + mc.mb[1] = cpu_to_le16(fcport->loop_id); + mc.mb[2] = MSW(pd_dma); + mc.mb[3] = LSW(pd_dma); + mc.mb[6] = MSW(MSD(pd_dma)); + mc.mb[7] = LSW(MSD(pd_dma)); + mc.mb[9] = cpu_to_le16(vha->vp_idx); + mc.mb[10] = cpu_to_le16((uint16_t)opt); + + rval = qla24xx_send_mb_cmd(vha, &mc); + if (rval != QLA_SUCCESS) { + ql_dbg(ql_dbg_mbx, vha, 0xffff, + "%s: %8phC fail\n", __func__, fcport->port_name); + goto done_free_sp; + } + + rval = __qla24xx_parse_gpdb(vha, fcport, pd); + + ql_dbg(ql_dbg_mbx, vha, 0xffff, "%s: %8phC done\n", + __func__, fcport->port_name); + +done_free_sp: + if (pd) + dma_pool_free(ha->s_dma_pool, pd, pd_dma); +done: + return rval; +} + +int __qla24xx_parse_gpdb(struct scsi_qla_host *vha, fc_port_t *fcport, + struct port_database_24xx *pd) +{ + int rval = QLA_SUCCESS; + uint64_t zero = 0; + + /* Check for logged in state. */ + if (pd->current_login_state != PDS_PRLI_COMPLETE && + pd->last_login_state != PDS_PRLI_COMPLETE) { + ql_dbg(ql_dbg_mbx, vha, 0xffff, + "Unable to verify login-state (%x/%x) for " + "loop_id %x.\n", pd->current_login_state, + pd->last_login_state, fcport->loop_id); + rval = QLA_FUNCTION_FAILED; + goto gpd_error_out; + } + + if (fcport->loop_id == FC_NO_LOOP_ID || + (memcmp(fcport->port_name, (uint8_t *)&zero, 8) && + memcmp(fcport->port_name, pd->port_name, 8))) { + /* We lost the device mid way. */ + rval = QLA_NOT_LOGGED_IN; + goto gpd_error_out; + } + + /* Names are little-endian. */ + memcpy(fcport->node_name, pd->node_name, WWN_SIZE); + memcpy(fcport->port_name, pd->port_name, WWN_SIZE); + + /* Get port_id of device. */ + fcport->d_id.b.domain = pd->port_id[0]; + fcport->d_id.b.area = pd->port_id[1]; + fcport->d_id.b.al_pa = pd->port_id[2]; + fcport->d_id.b.rsvd_1 = 0; + + /* If not target must be initiator or unknown type. */ + if ((pd->prli_svc_param_word_3[0] & BIT_4) == 0) + fcport->port_type = FCT_INITIATOR; + else + fcport->port_type = FCT_TARGET; + + /* Passback COS information. */ + fcport->supported_classes = (pd->flags & PDF_CLASS_2) ? + FC_COS_CLASS2 : FC_COS_CLASS3; + + if (pd->prli_svc_param_word_3[0] & BIT_7) { + fcport->flags |= FCF_CONF_COMP_SUPPORTED; + fcport->conf_compl_supported = 1; + } + +gpd_error_out: + return rval; +} + +/* + * qla24xx_gidlist__wait + * NOTE: don't call this routine from DPC thread. 
+ */ +int qla24xx_gidlist_wait(struct scsi_qla_host *vha, + void *id_list, dma_addr_t id_list_dma, uint16_t *entries) +{ + int rval = QLA_FUNCTION_FAILED; + mbx_cmd_t mc; + + if (!vha->hw->flags.fw_started) + goto done; + + memset(&mc, 0, sizeof(mc)); + mc.mb[0] = MBC_GET_ID_LIST; + mc.mb[2] = MSW(id_list_dma); + mc.mb[3] = LSW(id_list_dma); + mc.mb[6] = MSW(MSD(id_list_dma)); + mc.mb[7] = LSW(MSD(id_list_dma)); + mc.mb[8] = 0; + mc.mb[9] = cpu_to_le16(vha->vp_idx); + + rval = qla24xx_send_mb_cmd(vha, &mc); + if (rval != QLA_SUCCESS) { + ql_dbg(ql_dbg_mbx, vha, 0xffff, + "%s: fail\n", __func__); + } else { + *entries = mc.mb[1]; + ql_dbg(ql_dbg_mbx, vha, 0xffff, + "%s: done\n", __func__); + } +done: + return rval; +} diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 563116188c43..38bdcd325fa4 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -1238,7 +1238,7 @@ static int qla24xx_get_loop_id(struct scsi_qla_host *vha, const uint8_t *s_id, } /* Get list of logged in devices */ - rc = qla2x00_get_id_list(vha, gid_list, gid_list_dma, &entries); + rc = qla24xx_gidlist_wait(vha, gid_list, gid_list_dma, &entries); if (rc != QLA_SUCCESS) { ql_dbg(ql_dbg_tgt_mgt, vha, 0xf045, "qla_target(%d): get_id_list() failed: %x\n", @@ -5648,7 +5648,7 @@ static fc_port_t *qlt_get_port_database(struct scsi_qla_host *vha, fcport->loop_id = loop_id; - rc = qla2x00_get_port_database(vha, fcport, 0); + rc = qla24xx_gpdb_wait(vha, fcport, 0); if (rc != QLA_SUCCESS) { ql_dbg(ql_dbg_tgt_mgt, vha, 0xf070, "qla_target(%d): Failed to retrieve fcport " From c423437e3ff41b8ca551ab6621baf11538dbfe9d Mon Sep 17 00:00:00 2001 From: Himanshu Madhani Date: Wed, 15 Mar 2017 09:48:53 -0700 Subject: [PATCH 223/297] qla2xxx: Add DebugFS node to display Port Database Signed-off-by: Himanshu Madhani Signed-off-by: Giridhar Malavali Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_def.h | 2 + drivers/scsi/qla2xxx/qla_dfs.c | 92 ++++++++++++++++++++++++++++++++-- 2 files changed, 90 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index ae38b7a789b1..089480280558 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -3300,6 +3300,8 @@ struct qlt_hw_data { uint8_t tgt_node_name[WWN_SIZE]; struct dentry *dfs_tgt_sess; + struct dentry *dfs_tgt_port_database; + struct list_head q_full_list; uint32_t num_pend_cmds; uint32_t num_qfull_cmds_alloc; diff --git a/drivers/scsi/qla2xxx/qla_dfs.c b/drivers/scsi/qla2xxx/qla_dfs.c index 3b35905619b0..989e17b0758c 100644 --- a/drivers/scsi/qla2xxx/qla_dfs.c +++ b/drivers/scsi/qla2xxx/qla_dfs.c @@ -19,11 +19,11 @@ qla2x00_dfs_tgt_sess_show(struct seq_file *s, void *unused) struct qla_hw_data *ha = vha->hw; unsigned long flags; struct fc_port *sess = NULL; - struct qla_tgt *tgt= vha->vha_tgt.qla_tgt; + struct qla_tgt *tgt = vha->vha_tgt.qla_tgt; - seq_printf(s, "%s\n",vha->host_str); + seq_printf(s, "%s\n", vha->host_str); if (tgt) { - seq_printf(s, "Port ID Port Name Handle\n"); + seq_puts(s, "Port ID Port Name Handle\n"); spin_lock_irqsave(&ha->tgt.sess_lock, flags); list_for_each_entry(sess, &vha->vp_fcports, list) @@ -44,7 +44,6 @@ qla2x00_dfs_tgt_sess_open(struct inode *inode, struct file *file) return single_open(file, qla2x00_dfs_tgt_sess_show, vha); } - static const struct file_operations dfs_tgt_sess_ops = { .open = qla2x00_dfs_tgt_sess_open, .read = seq_read, @@ -52,6 +51,78 @@ static const struct file_operations 
dfs_tgt_sess_ops = { .release = single_release, }; +static int +qla2x00_dfs_tgt_port_database_show(struct seq_file *s, void *unused) +{ + scsi_qla_host_t *vha = s->private; + struct qla_hw_data *ha = vha->hw; + struct gid_list_info *gid_list; + dma_addr_t gid_list_dma; + fc_port_t fc_port; + char *id_iter; + int rc, i; + uint16_t entries, loop_id; + struct qla_tgt *tgt = vha->vha_tgt.qla_tgt; + + seq_printf(s, "%s\n", vha->host_str); + if (tgt) { + gid_list = dma_alloc_coherent(&ha->pdev->dev, + qla2x00_gid_list_size(ha), + &gid_list_dma, GFP_KERNEL); + if (!gid_list) { + ql_dbg(ql_dbg_user, vha, 0x705c, + "DMA allocation failed for %u\n", + qla2x00_gid_list_size(ha)); + return 0; + } + + rc = qla24xx_gidlist_wait(vha, gid_list, gid_list_dma, + &entries); + if (rc != QLA_SUCCESS) + goto out_free_id_list; + + id_iter = (char *)gid_list; + + seq_puts(s, "Port Name Port ID Loop ID\n"); + + for (i = 0; i < entries; i++) { + struct gid_list_info *gid = + (struct gid_list_info *)id_iter; + loop_id = le16_to_cpu(gid->loop_id); + memset(&fc_port, 0, sizeof(fc_port_t)); + + fc_port.loop_id = loop_id; + + rc = qla24xx_gpdb_wait(vha, &fc_port, 0); + seq_printf(s, "%8phC %02x%02x%02x %d\n", + fc_port.port_name, fc_port.d_id.b.domain, + fc_port.d_id.b.area, fc_port.d_id.b.al_pa, + fc_port.loop_id); + id_iter += ha->gid_list_info_size; + } +out_free_id_list: + dma_free_coherent(&ha->pdev->dev, qla2x00_gid_list_size(ha), + gid_list, gid_list_dma); + } + + return 0; +} + +static int +qla2x00_dfs_tgt_port_database_open(struct inode *inode, struct file *file) +{ + scsi_qla_host_t *vha = inode->i_private; + + return single_open(file, qla2x00_dfs_tgt_port_database_show, vha); +} + +static const struct file_operations dfs_tgt_port_database_ops = { + .open = qla2x00_dfs_tgt_port_database_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int qla_dfs_fw_resource_cnt_show(struct seq_file *s, void *unused) { @@ -296,6 +367,14 @@ create_nodes: goto out; } + ha->tgt.dfs_tgt_port_database = debugfs_create_file("tgt_port_database", + S_IRUSR, ha->dfs_dir, vha, &dfs_tgt_port_database_ops); + if (!ha->tgt.dfs_tgt_port_database) { + ql_log(ql_log_warn, vha, 0xffff, + "Unable to create debugFS tgt_port_database node.\n"); + goto out; + } + ha->dfs_fce = debugfs_create_file("fce", S_IRUSR, ha->dfs_dir, vha, &dfs_fce_ops); if (!ha->dfs_fce) { @@ -326,6 +405,11 @@ qla2x00_dfs_remove(scsi_qla_host_t *vha) ha->tgt.dfs_tgt_sess = NULL; } + if (ha->tgt.dfs_tgt_port_database) { + debugfs_remove(ha->tgt.dfs_tgt_port_database); + ha->tgt.dfs_tgt_port_database = NULL; + } + if (ha->dfs_fw_resource_cnt) { debugfs_remove(ha->dfs_fw_resource_cnt); ha->dfs_fw_resource_cnt = NULL; From 482c9dc79204bb83c3433a59680c787a0b98c000 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:54 -0700 Subject: [PATCH 224/297] qla2xxx: Change scsi host lookup method. In target mode, when a new SCSI command arrives, the driver first performs a lookup of the SCSI host. The current lookup method is based on the ALPA portion of the NPort ID. For Cisco switches, the ALPA cannot be used as the index. Instead, the new search method is based on the full value of the NPort ID, via the btree library.
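As a quick illustration of the new lookup scheme (a sketch with a hypothetical wrapper; the patch builds the key inline in qlt_find_host_by_d_id() and populates the map from qlt_update_vp_map()): the three bytes of the NPort ID are packed into a single 32-bit key, so the whole 24-bit address is searchable rather than only the 8-bit ALPA.

	#include <linux/btree.h>

	/*
	 * Sketch only: d_id[0..2] hold domain, area and al_pa. The return
	 * value is whatever pointer the insert side stored under that key
	 * (here, the scsi_qla_host).
	 */
	static void *lookup_host_by_nport_id(struct btree_head32 *map,
					     const u8 *d_id)
	{
		u32 key = ((u32)d_id[0] << 16) | ((u32)d_id[1] << 8) | d_id[2];

		return btree_lookup32(map, key);
	}
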
Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/Kconfig | 1 + drivers/scsi/qla2xxx/qla_def.h | 2 + drivers/scsi/qla2xxx/qla_gbl.h | 2 + drivers/scsi/qla2xxx/qla_init.c | 14 ++--- drivers/scsi/qla2xxx/qla_mbx.c | 28 +++------- drivers/scsi/qla2xxx/qla_os.c | 1 + drivers/scsi/qla2xxx/qla_target.c | 92 ++++++++++++++++++++++++++----- 7 files changed, 100 insertions(+), 40 deletions(-) diff --git a/drivers/scsi/qla2xxx/Kconfig b/drivers/scsi/qla2xxx/Kconfig index 67c0d5aa3212..de952935b5d2 100644 --- a/drivers/scsi/qla2xxx/Kconfig +++ b/drivers/scsi/qla2xxx/Kconfig @@ -3,6 +3,7 @@ config SCSI_QLA_FC depends on PCI && SCSI depends on SCSI_FC_ATTRS select FW_LOADER + select BTREE ---help--- This qla2xxx driver supports all QLogic Fibre Channel PCI and PCIe host adapters. diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 089480280558..9251918773b1 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -3311,6 +3312,7 @@ struct qlt_hw_data { spinlock_t sess_lock; int rspq_vector_cpuid; spinlock_t atio_lock ____cacheline_aligned; + struct btree_head32 host_map; }; #define MAX_QFULL_CMDS_ALLOC 8192 diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index 323b912b47f7..5b2451745e9f 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -854,5 +854,7 @@ extern struct fc_port *qlt_find_sess_invalidate_other(scsi_qla_host_t *, uint64_t wwn, port_id_t port_id, uint16_t loop_id, struct fc_port **); void qla24xx_delete_sess_fn(struct work_struct *); void qlt_unknown_atio_work_fn(struct work_struct *); +void qlt_update_host_map(struct scsi_qla_host *, port_id_t); +void qlt_remove_target_resources(struct qla_hw_data *); #endif /* _QLA_GBL_H */ diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index b1bfa63f7d4e..b0f6ad3020d3 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -3340,8 +3340,8 @@ qla2x00_configure_hba(scsi_qla_host_t *vha) uint8_t domain; char connect_type[22]; struct qla_hw_data *ha = vha->hw; - unsigned long flags; scsi_qla_host_t *base_vha = pci_get_drvdata(ha->pdev); + port_id_t id; /* Get host addresses. */ rval = qla2x00_get_adapter_id(vha, @@ -3419,13 +3419,11 @@ qla2x00_configure_hba(scsi_qla_host_t *vha) /* Save Host port and loop ID. 
*/ /* byte order - Big Endian */ - vha->d_id.b.domain = domain; - vha->d_id.b.area = area; - vha->d_id.b.al_pa = al_pa; - - spin_lock_irqsave(&ha->vport_slock, flags); - qlt_update_vp_map(vha, SET_AL_PA); - spin_unlock_irqrestore(&ha->vport_slock, flags); + id.b.domain = domain; + id.b.area = area; + id.b.al_pa = al_pa; + id.b.rsvd_1 = 0; + qlt_update_host_map(vha, id); if (!vha->flags.init_done) ql_log(ql_log_info, vha, 0x2010, diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index e40ed570d3c1..53d9579acc74 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -3623,6 +3623,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, scsi_qla_host_t *vp = NULL; unsigned long flags; int found; + port_id_t id; ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x10b6, "Entered %s.\n", __func__); @@ -3630,6 +3631,11 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, if (rptid_entry->entry_status != 0) return; + id.b.domain = rptid_entry->port_id[2]; + id.b.area = rptid_entry->port_id[1]; + id.b.al_pa = rptid_entry->port_id[0]; + id.b.rsvd_1 = 0; + if (rptid_entry->format == 0) { /* loop */ ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x10b7, @@ -3641,13 +3647,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, rptid_entry->port_id[2], rptid_entry->port_id[1], rptid_entry->port_id[0]); - vha->d_id.b.domain = rptid_entry->port_id[2]; - vha->d_id.b.area = rptid_entry->port_id[1]; - vha->d_id.b.al_pa = rptid_entry->port_id[0]; - - spin_lock_irqsave(&ha->vport_slock, flags); - qlt_update_vp_map(vha, SET_AL_PA); - spin_unlock_irqrestore(&ha->vport_slock, flags); + qlt_update_host_map(vha, id); } else if (rptid_entry->format == 1) { /* fabric */ @@ -3673,12 +3673,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, WWN_SIZE); } - vha->d_id.b.domain = rptid_entry->port_id[2]; - vha->d_id.b.area = rptid_entry->port_id[1]; - vha->d_id.b.al_pa = rptid_entry->port_id[0]; - spin_lock_irqsave(&ha->vport_slock, flags); - qlt_update_vp_map(vha, SET_AL_PA); - spin_unlock_irqrestore(&ha->vport_slock, flags); + qlt_update_host_map(vha, id); } fc_host_port_name(vha->host) = @@ -3714,12 +3709,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, if (!found) return; - vp->d_id.b.domain = rptid_entry->port_id[2]; - vp->d_id.b.area = rptid_entry->port_id[1]; - vp->d_id.b.al_pa = rptid_entry->port_id[0]; - spin_lock_irqsave(&ha->vport_slock, flags); - qlt_update_vp_map(vp, SET_AL_PA); - spin_unlock_irqrestore(&ha->vport_slock, flags); + qlt_update_host_map(vp, id); /* * Cannot configure here as we are still sitting on the diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 54d4e802bde0..344faf59783d 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -3469,6 +3469,7 @@ qla2x00_remove_one(struct pci_dev *pdev) qla2x00_free_sysfs_attr(base_vha, true); fc_remove_host(base_vha->host); + qlt_remove_target_resources(ha); scsi_remove_host(base_vha->host); diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 38bdcd325fa4..7278e046bf87 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -187,21 +187,23 @@ static inline struct scsi_qla_host *qlt_find_host_by_d_id(struct scsi_qla_host *vha, uint8_t *d_id) { - struct qla_hw_data *ha = vha->hw; - uint8_t vp_idx; + struct scsi_qla_host *host; + uint32_t key = 0; - if ((vha->d_id.b.area != d_id[1]) || (vha->d_id.b.domain != d_id[0])) - return NULL; - - if (vha->d_id.b.al_pa == d_id[2]) + if 
((vha->d_id.b.area == d_id[1]) && (vha->d_id.b.domain == d_id[0]) && + (vha->d_id.b.al_pa == d_id[2])) return vha; - BUG_ON(ha->tgt.tgt_vp_map == NULL); - vp_idx = ha->tgt.tgt_vp_map[d_id[2]].idx; - if (likely(test_bit(vp_idx, ha->vp_idx_map))) - return ha->tgt.tgt_vp_map[vp_idx].vha; + key = (uint32_t)d_id[0] << 16; + key |= (uint32_t)d_id[1] << 8; + key |= (uint32_t)d_id[2]; - return NULL; + host = btree_lookup32(&vha->hw->tgt.host_map, key); + if (!host) + ql_dbg(ql_dbg_tgt_mgt, vha, 0xffff, + "Unable to find host %06x\n", key); + + return host; } static inline @@ -6040,6 +6042,17 @@ int qlt_remove_target(struct qla_hw_data *ha, struct scsi_qla_host *vha) return 0; } +void qlt_remove_target_resources(struct qla_hw_data *ha) +{ + struct scsi_qla_host *node; + u32 key = 0; + + btree_for_each_safe32(&ha->tgt.host_map, key, node) + btree_remove32(&ha->tgt.host_map, key); + + btree_destroy32(&ha->tgt.host_map); +} + static void qlt_lport_dump(struct scsi_qla_host *vha, u64 wwpn, unsigned char *b) { @@ -6693,6 +6706,8 @@ qlt_modify_vp_config(struct scsi_qla_host *vha, void qlt_probe_one_stage1(struct scsi_qla_host *base_vha, struct qla_hw_data *ha) { + int rc; + if (!QLA_TGT_MODE_ENABLED()) return; @@ -6712,6 +6727,13 @@ qlt_probe_one_stage1(struct scsi_qla_host *base_vha, struct qla_hw_data *ha) qlt_unknown_atio_work_fn); qlt_clear_mode(base_vha); + + rc = btree_init32(&ha->tgt.host_map); + if (rc) + ql_log(ql_log_info, base_vha, 0xffff, + "Unable to initialize ha->host_map btree\n"); + + qlt_update_vp_map(base_vha, SET_VP_IDX); } irqreturn_t @@ -6820,25 +6842,69 @@ qlt_mem_free(struct qla_hw_data *ha) void qlt_update_vp_map(struct scsi_qla_host *vha, int cmd) { + void *slot; + u32 key; + int rc; + if (!QLA_TGT_MODE_ENABLED()) return; + key = vha->d_id.b24; + switch (cmd) { case SET_VP_IDX: vha->hw->tgt.tgt_vp_map[vha->vp_idx].vha = vha; break; case SET_AL_PA: - vha->hw->tgt.tgt_vp_map[vha->d_id.b.al_pa].idx = vha->vp_idx; + slot = btree_lookup32(&vha->hw->tgt.host_map, key); + if (!slot) { + ql_dbg(ql_dbg_tgt_mgt, vha, 0xffff, + "Save vha in host_map %p %06x\n", vha, key); + rc = btree_insert32(&vha->hw->tgt.host_map, + key, vha, GFP_ATOMIC); + if (rc) + ql_log(ql_log_info, vha, 0xffff, + "Unable to insert s_id into host_map: %06x\n", + key); + return; + } + ql_dbg(ql_dbg_tgt_mgt, vha, 0xffff, + "replace existing vha in host_map %p %06x\n", vha, key); + btree_update32(&vha->hw->tgt.host_map, key, vha); break; case RESET_VP_IDX: vha->hw->tgt.tgt_vp_map[vha->vp_idx].vha = NULL; break; case RESET_AL_PA: - vha->hw->tgt.tgt_vp_map[vha->d_id.b.al_pa].idx = 0; + ql_dbg(ql_dbg_tgt_mgt, vha, 0xffff, + "clear vha in host_map %p %06x\n", vha, key); + slot = btree_lookup32(&vha->hw->tgt.host_map, key); + if (slot) + btree_remove32(&vha->hw->tgt.host_map, key); + vha->d_id.b24 = 0; break; } } +void qlt_update_host_map(struct scsi_qla_host *vha, port_id_t id) +{ + unsigned long flags; + struct qla_hw_data *ha = vha->hw; + + if (!vha->d_id.b24) { + spin_lock_irqsave(&ha->vport_slock, flags); + vha->d_id = id; + qlt_update_vp_map(vha, SET_AL_PA); + spin_unlock_irqrestore(&ha->vport_slock, flags); + } else if (vha->d_id.b24 != id.b24) { + spin_lock_irqsave(&ha->vport_slock, flags); + qlt_update_vp_map(vha, RESET_AL_PA); + vha->d_id = id; + qlt_update_vp_map(vha, SET_AL_PA); + spin_unlock_irqrestore(&ha->vport_slock, flags); + } +} + static int __init qlt_parse_ini_mode(void) { if (strcasecmp(qlini_mode, QLA2XXX_INI_MODE_STR_EXCLUSIVE) == 0) From ec7193e26055112bc824929fd943035f9a30b06f Mon Sep 17 00:00:00 
2001 From: Quinn Tran Date: Wed, 15 Mar 2017 09:48:55 -0700 Subject: [PATCH 225/297] qla2xxx: Fix delayed response to command for loop mode/direct connect. Currently the driver waits for the FW to reach the ready state before processing incoming commands. For Arbitrated Loop or Point-to-Point (not switch), the FW Ready state can take a while: the FW transitions to ready only after all Nports have been logged in. In the meantime, certain initiators may already have completed login and started I/O. The driver needs to start processing all queues if the FW has already started. Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_def.h | 10 ++++++-- drivers/scsi/qla2xxx/qla_init.c | 12 ++++++++++ drivers/scsi/qla2xxx/qla_isr.c | 14 +++++++++++- drivers/scsi/qla2xxx/qla_mbx.c | 6 ++--- drivers/scsi/qla2xxx/qla_os.c | 21 ++++++++++++++++- drivers/scsi/qla2xxx/qla_target.c | 38 ++++++++++++++++++++----------- 6 files changed, 81 insertions(+), 20 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 9251918773b1..ae119018dfaa 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -3322,6 +3322,10 @@ struct qlt_hw_data { #define LEAK_EXCHG_THRESH_HOLD_PERCENT 75 /* 75 percent */ +#define QLA_EARLY_LINKUP(_ha) \ + ((_ha->flags.n2n_ae || _ha->flags.lip_ae) && \ + _ha->flags.fw_started && !_ha->flags.fw_init_done) + /* * Qlogic host adapter specific data structure. */ @@ -3371,9 +3375,11 @@ struct qla_hw_data { uint32_t fawwpn_enabled:1; uint32_t exlogins_enabled:1; uint32_t exchoffld_enabled:1; - /* 35 bits */ + uint32_t lip_ae:1; + uint32_t n2n_ae:1; uint32_t fw_started:1; + uint32_t fw_init_done:1; } flags; /* This spinlock is used to protect "io transactions", you must @@ -3466,7 +3472,6 @@ struct qla_hw_data { #define P2P_LOOP 3 uint8_t interrupts_on; uint32_t isp_abort_cnt; - #define PCI_DEVICE_ID_QLOGIC_ISP2532 0x2532 #define PCI_DEVICE_ID_QLOGIC_ISP8432 0x8432 #define PCI_DEVICE_ID_QLOGIC_ISP8001 0x8001 @@ -3947,6 +3952,7 @@ typedef struct scsi_qla_host { struct list_head vp_fcports; /* list of fcports */ struct list_head work_list; spinlock_t work_lock; + struct work_struct iocb_work; /* Commonly used flags and state information.
*/ struct Scsi_Host *host; diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index b0f6ad3020d3..f9d2fe7b1ade 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -3178,6 +3178,7 @@ next_check: } else { ql_dbg(ql_dbg_init, vha, 0x00d3, "Init Firmware -- success.\n"); + ha->flags.fw_started = 1; } return (rval); @@ -4000,6 +4001,7 @@ qla2x00_configure_loop(scsi_qla_host_t *vha) atomic_set(&vha->loop_state, LOOP_READY); ql_dbg(ql_dbg_disc, vha, 0x2069, "LOOP READY.\n"); + ha->flags.fw_init_done = 1; /* * Process any ATIO queue entries that came in @@ -5491,6 +5493,11 @@ qla2x00_abort_isp_cleanup(scsi_qla_host_t *vha) if (!(IS_P3P_TYPE(ha))) ha->isp_ops->reset_chip(vha); + ha->flags.n2n_ae = 0; + ha->flags.lip_ae = 0; + ha->current_topology = 0; + ha->flags.fw_started = 0; + ha->flags.fw_init_done = 0; ha->chip_reset++; atomic_set(&vha->loop_down_timer, LOOP_DOWN_TIME); @@ -6767,6 +6774,8 @@ qla2x00_try_to_stop_firmware(scsi_qla_host_t *vha) return; if (!ha->fw_major_version) return; + if (!ha->flags.fw_started) + return; ret = qla2x00_stop_firmware(vha); for (retries = 5; ret != QLA_SUCCESS && ret != QLA_FUNCTION_TIMEOUT && @@ -6780,6 +6789,9 @@ qla2x00_try_to_stop_firmware(scsi_qla_host_t *vha) "Attempting retry of stop-firmware command.\n"); ret = qla2x00_stop_firmware(vha); } + + ha->flags.fw_started = 0; + ha->flags.fw_init_done = 0; } int diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index 3953c8d6af69..3203367a4f42 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -708,6 +708,8 @@ skip_rio: "mbx7=%xh.\n", mb[1], mb[2], mb[3], mbx); ha->isp_ops->fw_dump(vha, 1); + ha->flags.fw_init_done = 0; + ha->flags.fw_started = 0; if (IS_FWI2_CAPABLE(ha)) { if (mb[1] == 0 && mb[2] == 0) { @@ -761,6 +763,9 @@ skip_rio: break; case MBA_LIP_OCCURRED: /* Loop Initialization Procedure */ + ha->flags.lip_ae = 1; + ha->flags.n2n_ae = 0; + ql_dbg(ql_dbg_async, vha, 0x5009, "LIP occurred (%x).\n", mb[1]); @@ -797,6 +802,10 @@ skip_rio: break; case MBA_LOOP_DOWN: /* Loop Down Event */ + ha->flags.n2n_ae = 0; + ha->flags.lip_ae = 0; + ha->current_topology = 0; + mbx = (IS_QLA81XX(ha) || IS_QLA8031(ha)) ? RD_REG_WORD(®24->mailbox4) : 0; mbx = (IS_P3P_TYPE(ha)) ? 
RD_REG_WORD(®82->mailbox_out[4]) @@ -866,6 +875,9 @@ skip_rio: /* case MBA_DCBX_COMPLETE: */ case MBA_POINT_TO_POINT: /* Point-to-Point */ + ha->flags.lip_ae = 0; + ha->flags.n2n_ae = 1; + if (IS_QLA2100(ha)) break; @@ -2706,7 +2718,7 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, struct sts_entry_24xx *pkt; struct qla_hw_data *ha = vha->hw; - if (!vha->flags.online) + if (!ha->flags.fw_started) return; while (rsp->ring_ptr->signature != RESPONSE_PROCESSED) { diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 53d9579acc74..a113ab3592a7 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -3638,11 +3638,11 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, if (rptid_entry->format == 0) { /* loop */ - ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x10b7, + ql_dbg(ql_dbg_async, vha, 0x10b7, "Format 0 : Number of VPs setup %d, number of " "VPs acquired %d.\n", rptid_entry->vp_setup, rptid_entry->vp_acquired); - ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x10b8, + ql_dbg(ql_dbg_async, vha, 0x10b8, "Primary port id %02x%02x%02x.\n", rptid_entry->port_id[2], rptid_entry->port_id[1], rptid_entry->port_id[0]); @@ -3651,7 +3651,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, } else if (rptid_entry->format == 1) { /* fabric */ - ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x10b9, + ql_dbg(ql_dbg_async, vha, 0x10b9, "Format 1: VP[%d] enabled - status %d - with " "port id %02x%02x%02x.\n", rptid_entry->vp_idx, rptid_entry->vp_status, diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 344faf59783d..41d5b09f7326 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -2560,6 +2560,20 @@ qla2xxx_scan_finished(struct Scsi_Host *shost, unsigned long time) return atomic_read(&vha->loop_state) == LOOP_READY; } +static void qla2x00_iocb_work_fn(struct work_struct *work) +{ + struct scsi_qla_host *vha = container_of(work, + struct scsi_qla_host, iocb_work); + int cnt = 0; + + while (!list_empty(&vha->work_list)) { + qla2x00_do_work(vha); + cnt++; + if (cnt > 10) + break; + } +} + /* * PCI driver interface */ @@ -3078,6 +3092,7 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) */ qla2xxx_wake_dpc(base_vha); + INIT_WORK(&base_vha->iocb_work, qla2x00_iocb_work_fn); INIT_WORK(&ha->board_disable, qla2x00_disable_board_on_pci_error); if (IS_QLA8031(ha) || IS_MCTP_CAPABLE(ha)) { @@ -4321,7 +4336,11 @@ qla2x00_post_work(struct scsi_qla_host *vha, struct qla_work_evt *e) spin_lock_irqsave(&vha->work_lock, flags); list_add_tail(&e->list, &vha->work_list); spin_unlock_irqrestore(&vha->work_lock, flags); - qla2xxx_wake_dpc(vha); + + if (QLA_EARLY_LINKUP(vha->hw)) + schedule_work(&vha->iocb_work); + else + qla2xxx_wake_dpc(vha); return QLA_SUCCESS; } diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 7278e046bf87..0e03ca2ab3e5 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -638,6 +638,7 @@ int qla24xx_async_notify_ack(scsi_qla_host_t *vha, fc_port_t *fcport, break; case SRB_NACK_PRLI: fcport->fw_login_state = DSC_LS_PRLI_PEND; + fcport->deleted = 0; c = "PRLI"; break; case SRB_NACK_LOGO: @@ -1576,6 +1577,9 @@ static void qlt_send_notify_ack(struct scsi_qla_host *vha, request_t *pkt; struct nack_to_isp *nack; + if (!ha->flags.fw_started) + return; + ql_dbg(ql_dbg_tgt, vha, 0xe004, "Sending NOTIFY_ACK (ha=%p)\n", ha); /* Send marker if required */ @@ -3053,7 +3057,7 @@ int 
qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, else vha->tgt_counters.core_qla_que_buf++; - if (!vha->flags.online || cmd->reset_count != ha->chip_reset) { + if (!ha->flags.fw_started || cmd->reset_count != ha->chip_reset) { /* * Either the port is not online or this request was from * previous life, just abort the processing. @@ -3194,7 +3198,7 @@ int qlt_rdy_to_xfer(struct qla_tgt_cmd *cmd) spin_lock_irqsave(&ha->hardware_lock, flags); - if (!vha->flags.online || (cmd->reset_count != ha->chip_reset) || + if (!ha->flags.fw_started || (cmd->reset_count != ha->chip_reset) || (cmd->sess && cmd->sess->deleted)) { /* * Either the port is not online or this request was from @@ -3372,7 +3376,7 @@ static int __qlt_send_term_imm_notif(struct scsi_qla_host *vha, ql_dbg(ql_dbg_tgt_tmr, vha, 0xe01c, "Sending TERM ELS CTIO (ha=%p)\n", ha); - pkt = (request_t *)qla2x00_alloc_iocbs_ready(vha, NULL); + pkt = (request_t *)qla2x00_alloc_iocbs(vha, NULL); if (pkt == NULL) { ql_dbg(ql_dbg_tgt, vha, 0xe080, "qla_target(%d): %s failed: unable to allocate " @@ -4697,7 +4701,8 @@ static int qlt_24xx_handle_els(struct scsi_qla_host *vha, } if (sess != NULL) { - if (sess->fw_login_state == DSC_LS_PLOGI_PEND) { + if (sess->fw_login_state != DSC_LS_PLOGI_PEND && + sess->fw_login_state != DSC_LS_PLOGI_COMP) { /* * Impatient initiator sent PRLI before last * PLOGI could finish. Will force him to re-try, @@ -4736,15 +4741,23 @@ static int qlt_24xx_handle_els(struct scsi_qla_host *vha, /* Make session global (not used in fabric mode) */ if (ha->current_topology != ISP_CFG_F) { - set_bit(LOOP_RESYNC_NEEDED, &vha->dpc_flags); - set_bit(LOCAL_LOOP_UPDATE, &vha->dpc_flags); - qla2xxx_wake_dpc(vha); + if (sess) { + ql_dbg(ql_dbg_disc, vha, 0xffff, + "%s %d %8phC post nack\n", + __func__, __LINE__, sess->port_name); + qla24xx_post_nack_work(vha, sess, iocb, + SRB_NACK_PRLI); + res = 0; + } else { + set_bit(LOOP_RESYNC_NEEDED, &vha->dpc_flags); + set_bit(LOCAL_LOOP_UPDATE, &vha->dpc_flags); + qla2xxx_wake_dpc(vha); + } } else { if (sess) { ql_dbg(ql_dbg_disc, vha, 0xffff, - "%s %d %8phC post nack\n", - __func__, __LINE__, sess->port_name); - + "%s %d %8phC post nack\n", + __func__, __LINE__, sess->port_name); qla24xx_post_nack_work(vha, sess, iocb, SRB_NACK_PRLI); res = 0; @@ -4752,7 +4765,6 @@ static int qlt_24xx_handle_els(struct scsi_qla_host *vha, } break; - case ELS_TPRLO: if (le16_to_cpu(iocb->u.isp24.flags) & NOTIFY24XX_FLAGS_GLOBAL_TPRLO) { @@ -5222,7 +5234,7 @@ static void qlt_24xx_atio_pkt(struct scsi_qla_host *vha, unsigned long flags; if (unlikely(tgt == NULL)) { - ql_dbg(ql_dbg_io, vha, 0x3064, + ql_dbg(ql_dbg_tgt, vha, 0x3064, "ATIO pkt, but no tgt (ha %p)", ha); return; } @@ -6359,7 +6371,7 @@ qlt_24xx_process_atio_queue(struct scsi_qla_host *vha, uint8_t ha_locked) struct atio_from_isp *pkt; int cnt, i; - if (!vha->flags.online) + if (!ha->flags.fw_started) return; while ((ha->tgt.atio_ring_ptr->signature != ATIO_PROCESSED) || From 6c611d18f386d37cce3afbd921568e2a895bd86e Mon Sep 17 00:00:00 2001 From: Himanshu Madhani Date: Wed, 15 Mar 2017 09:48:56 -0700 Subject: [PATCH 226/297] qla2xxx: Update driver version to 9.00.00.00-k Signed-off-by: Himanshu Madhani signed-off-by: Giridhar Malavali Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_version.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_version.h b/drivers/scsi/qla2xxx/qla_version.h index 3cb1964b7786..45bc84e8e3bf 100644 --- a/drivers/scsi/qla2xxx/qla_version.h +++ 
b/drivers/scsi/qla2xxx/qla_version.h @@ -7,9 +7,9 @@ /* * Driver version */ -#define QLA2XXX_VERSION "8.07.00.38-k" +#define QLA2XXX_VERSION "9.00.00.00-k" -#define QLA_DRIVER_MAJOR_VER 8 -#define QLA_DRIVER_MINOR_VER 7 +#define QLA_DRIVER_MAJOR_VER 9 +#define QLA_DRIVER_MINOR_VER 0 #define QLA_DRIVER_PATCH_VER 0 #define QLA_DRIVER_BETA_VER 0 From 452b94b8c8c7eb7dd0d0fa9a9776e0d02cd73b97 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Mar 2017 19:00:47 -0700 Subject: [PATCH 227/297] mm/swap: don't BUG_ON() due to uninitialized swap slot cache This BUG_ON() triggered for me once at shutdown, and I don't see a reason for the check. The code correctly checks whether the swap slot cache is usable or not, so an uninitialized swap slot cache is not actually problematic afaik. I've temporarily just switched the BUG_ON() to a WARN_ON_ONCE(), since I'm not sure why that seemingly pointless check was there. I suspect the real fix is to just remove it entirely, but for now we'll warn about it but not bring the machine down. Cc: "Huang, Ying" Cc: Tim Chen Cc: Michal Hocko Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_slots.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 9b5bc86f96ad..7ebb23836f68 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -267,7 +267,7 @@ int free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; - BUG_ON(!swap_slot_cache_initialized); + WARN_ON_ONCE(!swap_slot_cache_initialized); cache = &get_cpu_var(swp_slots); if (use_swap_slot_cache && cache->slots_ret) { From 97da3854c526d3a6ee05c849c96e48d21527606c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Mar 2017 19:09:39 -0700 Subject: [PATCH 228/297] Linux 4.11-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b841fb36beb2..b2faa9319372 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Fearless Coyote # *DOCUMENTATION* From f363a06642f28caaa78cb6446bbad90c73fe183c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 20 Mar 2017 10:08:19 +0100 Subject: [PATCH 229/297] ALSA: ctxfi: Fix the incorrect check of dma_set_mask() call In the commit [15c75b09f8d1: ALSA: ctxfi: Fallback DMA mask to 32bit], I forgot to put "!" in the dma_set_mask() call check in cthw20k1.c (while cthw20k2.c is OK). This patch fixes that obvious bug. (As a side note: although the original commit was completely wrong, it's still working for most machines, as it sets the 32bit DMA mask in the end. So the bug severity is low.)
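For reference, the corrected idiom as a standalone sketch (set_dma_masks is a hypothetical helper, not driver code): dma_set_mask() returns 0 on success, so the 32-bit fallback must run only on a non-zero return.

	#include <linux/dma-mapping.h>

	static int set_dma_masks(struct device *dev, unsigned int dma_bits)
	{
		/* 0 == success: the wide mask was accepted by the platform */
		if (!dma_set_mask(dev, DMA_BIT_MASK(dma_bits)))
			return dma_set_coherent_mask(dev, DMA_BIT_MASK(dma_bits));

		/* otherwise fall back to the always-supported 32-bit mask */
		dma_set_mask(dev, DMA_BIT_MASK(32));
		return dma_set_coherent_mask(dev, DMA_BIT_MASK(32));
	}
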
Fixes: 15c75b09f8d1 ("ALSA: ctxfi: Fallback DMA mask to 32bit") Cc: Signed-off-by: Takashi Iwai --- sound/pci/ctxfi/cthw20k1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/ctxfi/cthw20k1.c b/sound/pci/ctxfi/cthw20k1.c index ab4cdab5cfa5..79edd88d5cd0 100644 --- a/sound/pci/ctxfi/cthw20k1.c +++ b/sound/pci/ctxfi/cthw20k1.c @@ -1905,7 +1905,7 @@ static int hw_card_start(struct hw *hw) return err; /* Set DMA transfer mask */ - if (dma_set_mask(&pci->dev, DMA_BIT_MASK(dma_bits))) { + if (!dma_set_mask(&pci->dev, DMA_BIT_MASK(dma_bits))) { dma_set_coherent_mask(&pci->dev, DMA_BIT_MASK(dma_bits)); } else { dma_set_mask(&pci->dev, DMA_BIT_MASK(32)); From 04d5466a976b096364a39a63ac264c1b3a5f8fa1 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Thu, 9 Mar 2017 13:29:13 +0100 Subject: [PATCH 230/297] ALSA: hda - add support for docking station for HP 820 G2 This tested patch adds missing initialization for Line-In/Out PINs for the docking station for HP 820 G2. Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4e112221d825..8d6b3703d0a2 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4847,6 +4847,7 @@ enum { ALC286_FIXUP_HP_GPIO_LED, ALC280_FIXUP_HP_GPIO2_MIC_HOTKEY, ALC280_FIXUP_HP_DOCK_PINS, + ALC269_FIXUP_HP_DOCK_GPIO_MIC1_LED, ALC280_FIXUP_HP_9480M, ALC288_FIXUP_DELL_HEADSET_MODE, ALC288_FIXUP_DELL1_MIC_NO_PRESENCE, @@ -5388,6 +5389,16 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC280_FIXUP_HP_GPIO4 }, + [ALC269_FIXUP_HP_DOCK_GPIO_MIC1_LED] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x1b, 0x21011020 }, /* line-out */ + { 0x18, 0x2181103f }, /* line-in */ + { }, + }, + .chained = true, + .chain_id = ALC269_FIXUP_HP_GPIO_MIC1_LED + }, [ALC280_FIXUP_HP_9480M] = { .type = HDA_FIXUP_FUNC, .v.func = alc280_fixup_hp_9480m, @@ -5647,7 +5658,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x2256, "HP", ALC269_FIXUP_HP_GPIO_MIC1_LED), SND_PCI_QUIRK(0x103c, 0x2257, "HP", ALC269_FIXUP_HP_GPIO_MIC1_LED), SND_PCI_QUIRK(0x103c, 0x2259, "HP", ALC269_FIXUP_HP_GPIO_MIC1_LED), - SND_PCI_QUIRK(0x103c, 0x225a, "HP", ALC269_FIXUP_HP_GPIO_MIC1_LED), + SND_PCI_QUIRK(0x103c, 0x225a, "HP", ALC269_FIXUP_HP_DOCK_GPIO_MIC1_LED), SND_PCI_QUIRK(0x103c, 0x2260, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1), SND_PCI_QUIRK(0x103c, 0x2263, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1), SND_PCI_QUIRK(0x103c, 0x2264, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1), @@ -5816,6 +5827,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC, .name = "headset-mode-no-hp-mic"}, {.id = ALC269_FIXUP_LENOVO_DOCK, .name = "lenovo-dock"}, {.id = ALC269_FIXUP_HP_GPIO_LED, .name = "hp-gpio-led"}, + {.id = ALC269_FIXUP_HP_DOCK_GPIO_MIC1_LED, .name = "hp-dock-gpio-mic1-led"}, {.id = ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, .name = "dell-headset-multi"}, {.id = ALC269_FIXUP_DELL2_MIC_NO_PRESENCE, .name = "dell-headset-dock"}, {.id = ALC283_FIXUP_CHROME_BOOK, .name = "alc283-dac-wcaps"}, From cc3a47a248d7791ef0d2c81a35c46769e55e4c6c Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Thu, 9 Mar 2017 13:30:09 +0100 Subject: [PATCH 231/297] ALSA: hda - add support for docking station for HP 840 G3 This tested patch adds missing initialization for Line-In/Out PINs for 
the docking station for HP 840 G3. Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index c15c51bea26d..69266b8ea2ad 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -261,6 +261,7 @@ enum { CXT_FIXUP_HP_530, CXT_FIXUP_CAP_MIX_AMP_5047, CXT_FIXUP_MUTE_LED_EAPD, + CXT_FIXUP_HP_DOCK, CXT_FIXUP_HP_SPECTRE, CXT_FIXUP_HP_GATE_MIC, }; @@ -778,6 +779,14 @@ static const struct hda_fixup cxt_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = cxt_fixup_mute_led_eapd, }, + [CXT_FIXUP_HP_DOCK] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x16, 0x21011020 }, /* line-out */ + { 0x18, 0x2181103f }, /* line-in */ + { } + } + }, [CXT_FIXUP_HP_SPECTRE] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -839,6 +848,7 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x1025, 0x0543, "Acer Aspire One 522", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1025, 0x054c, "Acer Aspire 3830TG", CXT_FIXUP_ASPIRE_DMIC), SND_PCI_QUIRK(0x1025, 0x054f, "Acer Aspire 4830T", CXT_FIXUP_ASPIRE_DMIC), + SND_PCI_QUIRK(0x103c, 0x8079, "HP EliteBook 840 G3", CXT_FIXUP_HP_DOCK), SND_PCI_QUIRK(0x103c, 0x8174, "HP Spectre x360", CXT_FIXUP_HP_SPECTRE), SND_PCI_QUIRK(0x103c, 0x8115, "HP Z1 Gen3", CXT_FIXUP_HP_GATE_MIC), SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN), @@ -871,6 +881,7 @@ static const struct hda_model_fixup cxt5066_fixup_models[] = { { .id = CXT_PINCFG_LEMOTE_A1205, .name = "lemote-a1205" }, { .id = CXT_FIXUP_OLPC_XO, .name = "olpc-xo" }, { .id = CXT_FIXUP_MUTE_LED_EAPD, .name = "mute-led-eapd" }, + { .id = CXT_FIXUP_HP_DOCK, .name = "hp-dock" }, {} }; From 720037f939fa50fc3531035ae61b4cf4b0ff35e5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 6 Mar 2017 11:59:56 -0800 Subject: [PATCH 232/297] f2fs: don't overwrite node block by SSR This patch fixes that SSR can overwrite previous warm node block consisting of a node chain since the last checkpoint. Fixes: 5b6c6be2d878 ("f2fs: use SSR for warm node as well") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4bd7a8b19332..29ef7088c558 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1163,6 +1163,12 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; + + /* don't overwrite by SSR to keep node chain */ + if (se->type == CURSEG_WARM_NODE) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; + } } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { #ifdef CONFIG_F2FS_CHECK_FS From 9f7e4a2c49fd166f17cf4125766a68dce8716764 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 10 Mar 2017 09:39:57 -0800 Subject: [PATCH 233/297] f2fs: declare static functions This is to avoid build warning reported by kbuild test robot. 
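For context, the warning class involved here is almost certainly GCC's -Wmissing-prototypes, which the kbuild test robot enables in its W=1 builds: a function with external linkage defined without a prior prototype. A minimal sketch of the pattern (hypothetical file, not taken from f2fs):

/* foo.c, compiled with gcc -Wmissing-prototypes */
int helper(int x)		/* warning: no previous prototype for 'helper' */
{
	return x * 2;
}

static int internal(int x)	/* internal linkage: no prototype expected */
{
	return x + 1;
}

int api_entry(int x);		/* declared first, so its definition is clean */
int api_entry(int x)
{
	return internal(x);
}

Marking file-local functions static, as the patch below does, silences the warning and also gives the compiler more freedom to inline.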
Signed-off-by: Fengguang Wu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 94967171dee8..a0a060c2979b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1823,7 +1823,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -2383,7 +2384,7 @@ add_out: list_add_tail(&nes->set_list, head); } -void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2638,7 +2639,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } -int init_free_nid_cache(struct f2fs_sb_info *sbi) +static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi);
From 23380b8568b85cd4b7a056891f4dbf131f7b871d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 14:11:06 -0800 Subject: [PATCH 234/297] f2fs: use __set{__clear}_bit_le This patch uses __set{__clear}_bit_le for higher speed. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/node.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4650c9b85de7..8d5c62b07b28 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -750,7 +750,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a0a060c2979b..8c81ff614d1a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -339,7 +339,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, __set_nat_cache_dirty(nm_i, e); if (enabled_nat_bits(sbi, NULL) && new_blkaddr == NEW_ADDR) - clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits); + __clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits); /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) @@ -1834,9 +1834,9 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, return; if (set) - set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); else - clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1848,7 +1848,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; - set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; @@ -2403,16 +2403,16 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, valid++; } if (valid == 0) { - set_bit_le(nat_index, nm_i->empty_nat_bits); - clear_bit_le(nat_index, nm_i->full_nat_bits); +
__clear_bit_le(nat_index, nm_i->full_nat_bits); return; } - clear_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->empty_nat_bits); if (valid == NAT_ENTRY_PER_BLOCK) - set_bit_le(nat_index, nm_i->full_nat_bits); + __set_bit_le(nat_index, nm_i->full_nat_bits); else - clear_bit_le(nat_index, nm_i->full_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); } static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
From 586d1492f301982e349797cfb05d9f343002ffa2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Mar 2017 17:09:07 +0800 Subject: [PATCH 235/297] f2fs: skip scanning free nid bitmap of full NAT blocks This patch adds accounting of free nids for each NAT block, so that while scanning the whole free nid bitmap we can check the count and skip lookups in full NAT blocks. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 2 ++ fs/f2fs/node.c | 33 +++++++++++++++++++++++++++------ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a77df377e2e8..ee2d0a485fc3 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -196,6 +196,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); get_cache: si->cache_mem = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e849f83d6114..0a6e115562f6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -561,6 +561,8 @@ struct f2fs_nm_info { struct mutex build_lock; /* lock for build free nids */ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ + spinlock_t free_nid_lock; /* protect updating of nid count */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8c81ff614d1a..87a2b1f740cc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1824,7 +1824,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set) + bool set, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1837,6 +1837,13 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + + spin_lock(&nm_i->free_nid_lock); + if (set) + nm_i->free_nid_count[nat_ofs]++; + else if (!build) + nm_i->free_nid_count[nat_ofs]--; + spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1848,6 +1855,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; @@ -1862,7 +1872,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed); + update_free_nid_bitmap(sbi, start_nid, freed, true); } } @@ -1878,6 +1888,8 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) { if
(!test_bit_le(i, nm_i->nat_block_bitmap)) continue; + if (!nm_i->free_nid_count[i]) + continue; for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { nid_t nid; @@ -2082,7 +2094,7 @@ retry: __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false); + update_free_nid_bitmap(sbi, *nid, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2138,7 +2150,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&nm_i->nid_list_lock); @@ -2468,11 +2480,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2652,6 +2664,14 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; + + nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + sizeof(unsigned short), GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + + spin_lock_init(&nm_i->free_nid_lock); + return 0; } @@ -2731,6 +2751,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kvfree(nm_i->nat_block_bitmap); kvfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_count); kfree(nm_i->nat_bitmap); kfree(nm_i->nat_bits);
From 7041d5d286fb54635f540c1bb3b43980ed65513a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Mar 2017 20:07:49 +0800 Subject: [PATCH 236/297] f2fs: combine nat_bits and free_nid_bitmap cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both the nat_bits cache and the free_nid_bitmap cache provide the same functionality as an intermediate cache between the free nid cache and disk, but with different granularity for indicating free nid ranges and different persistence policies. The nat_bits cache provides better persistence, and free_nid_bitmap provides better granularity. In this patch we combine the advantages of both caches, so the final policy of the intermediate cache is: - init: load free nid status from nat_bits into free_nid_bitmap - lookup: scan free_nid_bitmap before loading NAT blocks - update: update free_nid_bitmap in real-time - persistence: update and persist nat_bits at checkpoint This patch also resolves a performance regression reported by the lkp-robot.
commit:
  4ac912427c4214d8031d9ad6fbc3bc75e71512df ("f2fs: introduce free nid bitmap")
  d00030cf9cd0bb96fdccc41e33d3c91dcbb672ba ("f2fs: use __set{__clear}_bit_le")
  1382c0f3f9d3f936c8bc42ed1591cf7a593ef9f7 ("f2fs: combine nat_bits and free_nid_bitmap cache")

        4ac912427c4214d8   d00030cf9cd0bb96fdccc41e33   1382c0f3f9d3f936c8bc42ed15
        ----------------   --------------------------   --------------------------
           %stddev  %change   %stddev  %change   %stddev
   77863 ± 0%   +2.1%    79485 ± 1%  +50.8%   117404 ± 0%  aim7.jobs-per-min
  231.63 ± 0%   -2.0%   227.01 ± 1%  -33.6%   153.80 ± 0%  aim7.time.elapsed_time
  231.63 ± 0%   -2.0%   227.01 ± 1%  -33.6%   153.80 ± 0%  aim7.time.elapsed_time.max
  896604 ± 0%   -0.8%   889221 ± 3%  -20.2%   715260 ± 1%  aim7.time.involuntary_context_switches
    2394 ± 1%   +4.6%     2503 ± 1%   +3.7%     2481 ± 2%  aim7.time.maximum_resident_set_size
    6240 ± 0%   -1.5%     6145 ± 1%  -14.1%     5360 ± 1%  aim7.time.system_time
 1111357 ± 3%   +1.9%  1132509 ± 2%   -6.2%  1041932 ± 2%  aim7.time.voluntary_context_switches
...
Signed-off-by: Chao Yu Tested-by: Xiaolong Ye Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 125 +++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 87a2b1f740cc..481aa8dc79f4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -338,9 +338,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); - if (enabled_nat_bits(sbi, NULL) && new_blkaddr == NEW_ADDR) - __clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits); - /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) e = __lookup_nat_cache(nm_i, ni->ino); @@ -1824,7 +1821,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build) + bool set, bool build, bool locked) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1838,12 +1835,14 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - spin_lock(&nm_i->free_nid_lock); + if (!locked) + spin_lock(&nm_i->free_nid_lock); if (set) nm_i->free_nid_count[nat_ofs]++; else if (!build) nm_i->free_nid_count[nat_ofs]--; - spin_unlock(&nm_i->free_nid_lock); + if (!locked) + spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1872,7 +1871,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed, true); + update_free_nid_bitmap(sbi, start_nid, freed, true, false); } } @@ -1920,58 +1919,6 @@ out: up_read(&nm_i->nat_tree_lock); } -static int scan_nat_bits(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct page *page; - unsigned int i = 0; - nid_t nid; - - if (!enabled_nat_bits(sbi, NULL)) - return -EAGAIN; - - down_read(&nm_i->nat_tree_lock); -check_empty: - i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); - if (i >= nm_i->nat_blocks) { - i = 0; - goto check_partial; - } - - for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK; - nid++) { - if (unlikely(nid >= nm_i->max_nid)) - break; - add_free_nid(sbi, nid, true); - } - - if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) - goto out; - i++; - goto check_empty; -
-check_partial: - i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); - if (i >= nm_i->nat_blocks) { - disable_nat_bits(sbi, true); - up_read(&nm_i->nat_tree_lock); - return -EINVAL; - } - - nid = i * NAT_ENTRY_PER_BLOCK; - page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); - - if (nm_i->nid_cnt[FREE_NID_LIST] < MAX_FREE_NIDS) { - i++; - goto check_partial; - } -out: - up_read(&nm_i->nat_tree_lock); - return 0; -} - static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1993,21 +1940,6 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) if (nm_i->nid_cnt[FREE_NID_LIST]) return; - - /* try to find free nids with nat_bits */ - if (!scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) - return; - } - - /* find next valid candidate */ - if (enabled_nat_bits(sbi, NULL)) { - int idx = find_next_zero_bit_le(nm_i->full_nat_bits, - nm_i->nat_blocks, 0); - - if (idx >= nm_i->nat_blocks) - set_sbi_flag(sbi, SBI_NEED_FSCK); - else - nid = idx * NAT_ENTRY_PER_BLOCK; } /* readahead nat pages to be scanned */ @@ -2094,7 +2026,7 @@ retry: __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false, false); + update_free_nid_bitmap(sbi, *nid, false, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2150,7 +2082,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); + update_free_nid_bitmap(sbi, nid, true, false, false); spin_unlock(&nm_i->nid_list_lock); @@ -2480,11 +2412,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); + update_free_nid_bitmap(sbi, nid, true, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false, false); + update_free_nid_bitmap(sbi, nid, false, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2590,6 +2522,40 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) return 0; } +inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + + spin_lock(&nm_i->free_nid_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true, true); + spin_unlock(&nm_i->free_nid_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } +} + static int init_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); @@ -2691,6 +2657,9 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + build_free_nids(sbi, true, true); return 0; } From 6be3b6cce1e225f189b68b4e84fc711d19b4277b Mon Sep 17 00:00:00 2001 From: Ryan Hsu Date: 
Mon, 13 Mar 2017 15:49:03 -0700 Subject: [PATCH 237/297] ath10k: fix incorrect wlan_mac_base in qca6174_regs Commit ebee76f7fa46 ("ath10k: allow setting coverage class") inherited the design and the address offset from ath9k, but that address is not applicable to QCA6174. This leads to a random crash during the resume() operation, since set_coverage_class.ops is called from ieee80211_reconfig() on resume() (if WoW is not configured). Fix the incorrect address offset here to avoid the random crash. Verified on QCA6174/hw3.0 with firmware WLAN.RM.4.4-00022-QCARMSWPZ-2. kvalo: this also seems to fix a regression with firmware restart. Fixes: ebee76f7fa46 ("ath10k: allow setting coverage class") Cc: # v4.10 Signed-off-by: Ryan Hsu Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/hw.c b/drivers/net/wireless/ath/ath10k/hw.c index 33fb26833cd0..d9f37ee4bfdd 100644 --- a/drivers/net/wireless/ath/ath10k/hw.c +++ b/drivers/net/wireless/ath/ath10k/hw.c @@ -51,7 +51,7 @@ const struct ath10k_hw_regs qca6174_regs = { .rtc_soc_base_address = 0x00000800, .rtc_wmac_base_address = 0x00001000, .soc_core_base_address = 0x0003a000, - .wlan_mac_base_address = 0x00020000, + .wlan_mac_base_address = 0x00010000, .ce_wrapper_base_address = 0x00034000, .ce0_base_address = 0x00034400, .ce1_base_address = 0x00034800,
From c3104aae5d8cc443556f8613466e16737326e215 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Mar 2017 17:36:25 +0100 Subject: [PATCH 238/297] remoteproc: qcom: fix QCOM_SMD dependencies qcom_smd_register_edge() is provided by either QCOM_SMD or RPMSG_QCOM_SMD, and if both of them are disabled, it does nothing. The check for the PIL drivers however only checks for QCOM_SMD, so it breaks with QCOM_SMD=n && RPMSG_QCOM_SMD=m: drivers/remoteproc/built-in.o: In function `smd_subdev_remove': qcom_wcnss_iris.c:(.text+0x231c): undefined reference to `qcom_smd_unregister_edge' drivers/remoteproc/built-in.o: In function `smd_subdev_probe': qcom_wcnss_iris.c:(.text+0x2344): undefined reference to `qcom_smd_register_edge' drivers/remoteproc/built-in.o: In function `smd_subdev_probe': qcom_q6v5_pil.c:(.text+0x3538): undefined reference to `qcom_smd_register_edge' qcom_q6v5_pil.c:(.text+0x3538): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `qcom_smd_register_edge' This clarifies the Kconfig dependency.
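To see why the old dependency was insufficient, consider the usual conditional-stub header pattern (a sketch assuming the SMD header follows the common IS_ENABLED() convention; this is not the literal Qualcomm header):

struct device;
struct device_node;
struct qcom_smd_edge;

#if IS_ENABLED(CONFIG_QCOM_SMD) || IS_ENABLED(CONFIG_RPMSG_QCOM_SMD)
struct qcom_smd_edge *qcom_smd_register_edge(struct device *parent,
					     struct device_node *node);
#else
static inline struct qcom_smd_edge *
qcom_smd_register_edge(struct device *parent, struct device_node *node)
{
	return NULL;	/* no-op stub when no SMD implementation is built */
}
#endif

With QCOM_SMD=n and RPMSG_QCOM_SMD=m, IS_ENABLED() is true, so a built-in PIL driver compiles against the real prototype but cannot link: the symbol lives in a module while the caller is linked into the kernel image, producing exactly the undefined references quoted above.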
Fixes: 4b48921a8f74 ("remoteproc: qcom: Use common SMD edge handler") Signed-off-by: Arnd Bergmann Signed-off-by: Bjorn Andersson --- drivers/remoteproc/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index 65f86bc24c07..1dc43fc5f65f 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -76,7 +76,7 @@ config QCOM_ADSP_PIL depends on OF && ARCH_QCOM depends on REMOTEPROC depends on QCOM_SMEM - depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + depends on RPMSG_QCOM_SMD || QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n && RPMSG_QCOM_SMD=n) select MFD_SYSCON select QCOM_MDT_LOADER select QCOM_RPROC_COMMON @@ -93,7 +93,7 @@ config QCOM_Q6V5_PIL depends on OF && ARCH_QCOM depends on QCOM_SMEM depends on REMOTEPROC - depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + depends on RPMSG_QCOM_SMD || QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n && RPMSG_QCOM_SMD=n) select MFD_SYSCON select QCOM_RPROC_COMMON select QCOM_SCM @@ -104,7 +104,7 @@ config QCOM_Q6V5_PIL config QCOM_WCNSS_PIL tristate "Qualcomm WCNSS Peripheral Image Loader" depends on OF && ARCH_QCOM - depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + depends on RPMSG_QCOM_SMD || QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n && RPMSG_QCOM_SMD=n) depends on QCOM_SMEM depends on REMOTEPROC select QCOM_MDT_LOADER
From c6736a94d0e527ddc0d1eb99dbc59886a9ecf471 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 21 Mar 2017 13:26:02 +0100 Subject: [PATCH 239/297] ALSA: x86: Make CONFIG_SND_X86 bool CONFIG_SND_X86 is a menu config to filter only for x86-specific drivers in its sub-menu, and it doesn't have to be a tristate but rather should be a bool. Also, like other sub-menu configs, it's more user-friendly to make it default=y; it's merely a menu config and the actual drivers are configured in the sub-menu, after all. Fixes: 287599cf2d77 ("ALSA: add Intel HDMI LPE audio driver for BYT/CHT-T") Signed-off-by: Takashi Iwai --- sound/x86/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/x86/Kconfig b/sound/x86/Kconfig index 84c8f8fc597c..8adf4d1bd46e 100644 --- a/sound/x86/Kconfig +++ b/sound/x86/Kconfig @@ -1,6 +1,7 @@ menuconfig SND_X86 - tristate "X86 sound devices" + bool "X86 sound devices" depends on X86 + default y ---help--- X86 sound devices that don't fall under SoC or PCI categories
From c520ff3d03f0b5db7146d9beed6373ad5d2a5e0e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 21 Mar 2017 13:56:04 +0100 Subject: [PATCH 240/297] ALSA: seq: Fix racy cell insertions during snd_seq_pool_done() When snd_seq_pool_done() is called, it sets the closing flag to refuse further cell insertions. But snd_seq_pool_done() itself doesn't clear the cells; it just waits until all cells are cleared by the caller side. That is racy, and it leads to an endless stall, as syzkaller spotted. This patch addresses the race by splitting the setup of the pool->closing flag out of snd_seq_pool_done(), and calling it properly before snd_seq_pool_done().
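The fixed ordering at the ioctl call site boils down to the following sequence (taken from the first hunk below):

snd_seq_pool_mark_closing(client->pool);		/* 1. refuse new cell insertions */
snd_seq_queue_client_leave_cells(client->number);	/* 2. remove all existing cells */
snd_seq_pool_done(client->pool);			/* 3. the wait can now actually finish */

Without step 1 happening first, a concurrent writer can keep inserting cells while step 2 runs, so snd_seq_pool_done() may wait forever for a pool that never drains.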
BugLink: http://lkml.kernel.org/r/CACT4Y+aqqy8bZA1fFieifNxR2fAfFQQABcBHj801+u5ePV0URw@mail.gmail.com Reported-and-tested-by: Dmitry Vyukov Cc: Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 1 + sound/core/seq/seq_fifo.c | 3 +++ sound/core/seq/seq_memory.c | 17 +++++++++++++---- sound/core/seq/seq_memory.h | 1 + 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 4c935202ce23..f3b1d7f50b81 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1832,6 +1832,7 @@ static int snd_seq_ioctl_set_client_pool(struct snd_seq_client *client, info->output_pool != client->pool->size)) { if (snd_seq_write_pool_allocated(client)) { /* remove all existing cells */ + snd_seq_pool_mark_closing(client->pool); snd_seq_queue_client_leave_cells(client->number); snd_seq_pool_done(client->pool); } diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c index 448efd4e980e..33980d1c8037 100644 --- a/sound/core/seq/seq_fifo.c +++ b/sound/core/seq/seq_fifo.c @@ -72,6 +72,9 @@ void snd_seq_fifo_delete(struct snd_seq_fifo **fifo) return; *fifo = NULL; + if (f->pool) + snd_seq_pool_mark_closing(f->pool); + snd_seq_fifo_clear(f); /* wake up clients if any */ diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c index 1a1acf3ddda4..d4c61ec9be13 100644 --- a/sound/core/seq/seq_memory.c +++ b/sound/core/seq/seq_memory.c @@ -415,6 +415,18 @@ int snd_seq_pool_init(struct snd_seq_pool *pool) return 0; } +/* refuse the further insertion to the pool */ +void snd_seq_pool_mark_closing(struct snd_seq_pool *pool) +{ + unsigned long flags; + + if (snd_BUG_ON(!pool)) + return; + spin_lock_irqsave(&pool->lock, flags); + pool->closing = 1; + spin_unlock_irqrestore(&pool->lock, flags); +} + /* remove events */ int snd_seq_pool_done(struct snd_seq_pool *pool) { @@ -425,10 +437,6 @@ int snd_seq_pool_done(struct snd_seq_pool *pool) return -EINVAL; /* wait for closing all threads */ - spin_lock_irqsave(&pool->lock, flags); - pool->closing = 1; - spin_unlock_irqrestore(&pool->lock, flags); - if (waitqueue_active(&pool->output_sleep)) wake_up(&pool->output_sleep); @@ -485,6 +493,7 @@ int snd_seq_pool_delete(struct snd_seq_pool **ppool) *ppool = NULL; if (pool == NULL) return 0; + snd_seq_pool_mark_closing(pool); snd_seq_pool_done(pool); kfree(pool); return 0; diff --git a/sound/core/seq/seq_memory.h b/sound/core/seq/seq_memory.h index 4a2ec779b8a7..32f959c17786 100644 --- a/sound/core/seq/seq_memory.h +++ b/sound/core/seq/seq_memory.h @@ -84,6 +84,7 @@ static inline int snd_seq_total_cells(struct snd_seq_pool *pool) int snd_seq_pool_init(struct snd_seq_pool *pool); /* done pool - free events */ +void snd_seq_pool_mark_closing(struct snd_seq_pool *pool); int snd_seq_pool_done(struct snd_seq_pool *pool); /* create pool */ From 49cc4c217c0dbb7d09c402472d1f85a81c093f9f Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Mon, 6 Mar 2017 10:54:57 -0800 Subject: [PATCH 241/297] HID: wacom: Correct Intuos Pro 2 resolution The features struct for the second gen Intuos Pro uses the wrong constant for the resolution. This fix is for commit 4922cd2. 
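A quick arithmetic check shows why the constant matters, assuming the usual wacom_wac.h values of WACOM_INTUOS_RES == 100 and WACOM_INTUOS3_RES == 200 (logical units per millimeter):

#include <stdio.h>

int main(void)
{
	int x_max = 44800;	/* Intuos Pro M logical extent, from the table below */

	printf("at 100 units/mm: %d mm wide\n", x_max / 100);	/* 448 mm: implausible */
	printf("at 200 units/mm: %d mm wide\n", x_max / 200);	/* 224 mm: matches the M-size active area */
	return 0;
}

With the wrong (smaller) resolution constant, userspace computes a physical size twice the real one, which throws off pressure-to-distance and pointer acceleration math.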
Fixes: 4922cd2 ("HID: wacom: Support 2nd-gen Intuos Pro's Bluetooth classic interface") Signed-off-by: Aaron Armstrong Skomra Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 4aa3de9f1163..3d864466d473 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -4197,10 +4197,10 @@ static const struct wacom_features wacom_features_0x343 = WACOM_DTU_OFFSET, WACOM_DTU_OFFSET }; static const struct wacom_features wacom_features_0x360 = { "Wacom Intuos Pro M", 44800, 29600, 8191, 63, - INTUOSP2_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 9, .touch_max = 10 }; + INTUOSP2_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, .touch_max = 10 }; static const struct wacom_features wacom_features_0x361 = { "Wacom Intuos Pro L", 62200, 43200, 8191, 63, - INTUOSP2_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 9, .touch_max = 10 }; + INTUOSP2_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 9, .touch_max = 10 }; static const struct wacom_features wacom_features_HID_ANY_ID = { "Wacom HID", .type = HID_GENERIC, .oVid = HID_ANY_ID, .oPid = HID_ANY_ID }; From b6b1f19b06b7d4dcc261a88d74c5fb0a53988b4e Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Mon, 6 Mar 2017 10:54:58 -0800 Subject: [PATCH 242/297] HID: wacom: don't manually release resources for the EKR Commit 5b779fc introduces the manual release of resources in wacom_remove() as an addition to the driver's use of devm. The EKR resources can only be released through wacom_remote_destroy_one() so we skip the manual release for it. Fixes: 5b779fc ("HID: wacom: release the resources before leaving despite devm") Signed-off-by: Aaron Armstrong Skomra Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index be8f7e2a026f..994bddc55b82 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2579,7 +2579,9 @@ static void wacom_remove(struct hid_device *hdev) /* make sure we don't trigger the LEDs */ wacom_led_groups_release(wacom); - wacom_release_resources(wacom); + + if (wacom->wacom_wac.features.type != REMOTE) + wacom_release_resources(wacom); hid_set_drvdata(hdev, NULL); } From deaba636997557fce46ca7bcb509bff5ea1b0558 Mon Sep 17 00:00:00 2001 From: Oscar Campos Date: Fri, 10 Feb 2017 18:23:00 +0000 Subject: [PATCH 243/297] HID: corsair: support for K65-K70 Rapidfire and Scimitar Pro RGB Add quirks for several corsair gaming devices to avoid long delays on report initialization Supported devices: - Corsair K65RGB Rapidfire Gaming Keyboard - Corsair K70RGB Rapidfire Gaming Keyboard - Corsair Scimitar Pro RGB Gaming Mouse Signed-off-by: Oscar Campos Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 3 +++ drivers/hid/usbhid/hid-quirks.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index b3df60da0297..0e2e7c571d22 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -278,6 +278,9 @@ #define USB_DEVICE_ID_CORSAIR_K70RGB 0x1b13 #define USB_DEVICE_ID_CORSAIR_STRAFE 0x1b15 #define USB_DEVICE_ID_CORSAIR_K65RGB 0x1b17 +#define USB_DEVICE_ID_CORSAIR_K70RGB_RAPIDFIRE 0x1b38 +#define USB_DEVICE_ID_CORSAIR_K65RGB_RAPIDFIRE 0x1b39 +#define USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB 0x1b3e #define USB_VENDOR_ID_CREATIVELABS 0x041e #define 
USB_DEVICE_ID_CREATIVE_SB_OMNI_SURROUND_51 0x322c diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index d6847a664446..a69a3c88ab29 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -80,6 +80,9 @@ static const struct hid_blacklist { { USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K70RGB, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K65RGB, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_STRAFE, HID_QUIRK_NO_INIT_REPORTS | HID_QUIRK_ALWAYS_POLL }, + { USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K70RGB_RAPIDFIRE, HID_QUIRK_NO_INIT_REPORTS | HID_QUIRK_ALWAYS_POLL }, + { USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K65RGB_RAPIDFIRE, HID_QUIRK_NO_INIT_REPORTS | HID_QUIRK_ALWAYS_POLL }, + { USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB, HID_QUIRK_NO_INIT_REPORTS | HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_CREATIVE_SB_OMNI_SURROUND_51, HID_QUIRK_NOGET }, { USB_VENDOR_ID_DMI, USB_DEVICE_ID_DMI_ENC, HID_QUIRK_NOGET }, { USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_WIIU, HID_QUIRK_MULTI_INPUT }, From 01adc47e885f1127b29d76d0dfb21d8262f9d6b4 Mon Sep 17 00:00:00 2001 From: Oscar Campos Date: Mon, 6 Mar 2017 21:02:39 +0000 Subject: [PATCH 244/297] HID: corsair: Add driver Scimitar Pro RGB gaming mouse 1b1c:1b3e support to hid-corsair This mouse sold by Corsair as Scimitar PRO RGB defines two consecutive Logical Minimum items in its Application (Consumer.0001) report making it non parseable. This patch fixes the report descriptor overriding byte 77 in rdesc from 0x16 (Logical Minimum with 16 bits value) to 0x26 (Logical Maximum with 16 bits value). Signed-off-by: Oscar Campos Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 1 + drivers/hid/hid-core.c | 1 + drivers/hid/hid-corsair.c | 47 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 8eab3200ac9a..8c54cb8f5d6d 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -190,6 +190,7 @@ config HID_CORSAIR Supported devices: - Vengeance K90 + - Scimitar PRO RGB config HID_PRODIKEYS tristate "Prodikeys PC-MIDI Keyboard support" diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index ae01ae601d74..3ceb4a2af381 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1870,6 +1870,7 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_AK1D) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_ACER_SWITCH12) }, { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K90) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB) }, { HID_USB_DEVICE(USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_PRODIKEYS_PCMIDI) }, { HID_USB_DEVICE(USB_VENDOR_ID_CYGNAL, USB_DEVICE_ID_CYGNAL_CP2112) }, { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_1) }, diff --git a/drivers/hid/hid-corsair.c b/drivers/hid/hid-corsair.c index c0303f61c26a..9ba5d98a1180 100644 --- a/drivers/hid/hid-corsair.c +++ b/drivers/hid/hid-corsair.c @@ -3,8 +3,10 @@ * * Supported devices: * - Vengeance K90 Keyboard + * - Scimitar PRO RGB Gaming Mouse * * Copyright (c) 2015 Clement Vuchener + * Copyright (c) 2017 Oscar Campos */ /* @@ -670,10 +672,51 @@ static int corsair_input_mapping(struct hid_device *dev, return 0; } +/* + * The report descriptor of Corsair Scimitar RGB Pro gaming 
mouse is + non-parseable, as it defines two consecutive Logical Minimum items for + the Usage Page (Consumer) in rdesc bytes 75 and 77, byte 77 being 0x16 + when it should obviously be 0x26, a Logical Maximum with a 16-bit + value. This prevents proper parsing of the report descriptor due to + the Logical Minimum being larger than the Logical Maximum. + * + * This driver fixes the report descriptor for: + * - USB ID 1b1c:1b3e, sold as Scimitar RGB Pro Gaming mouse + */ + +static __u8 *corsair_mouse_report_fixup(struct hid_device *hdev, __u8 *rdesc, + unsigned int *rsize) +{ + struct usb_interface *intf = to_usb_interface(hdev->dev.parent); + + if (intf->cur_altsetting->desc.bInterfaceNumber == 1) { + /* + * The Corsair Scimitar RGB Pro report descriptor is broken: it + * defines two different Logical Minimum items for the Consumer + * Application. Byte 77 should be 0x26, declaring a 16-bit + * integer for the Logical Maximum, but it is 0x16 instead + * (Logical Minimum). + */ + switch (hdev->product) { + case USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB: + if (*rsize >= 172 && rdesc[75] == 0x15 && rdesc[77] == 0x16 + && rdesc[78] == 0xff && rdesc[79] == 0x0f) { + hid_info(hdev, "Fixing up report descriptor\n"); + rdesc[77] = 0x26; + } + break; + } + + } + return rdesc; +} + static const struct hid_device_id corsair_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K90), .driver_data = CORSAIR_USE_K90_MACRO | CORSAIR_USE_K90_BACKLIGHT }, + { HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, + USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB) }, {} }; @@ -686,10 +729,14 @@ static struct hid_driver corsair_driver = { .event = corsair_event, .remove = corsair_remove, .input_mapping = corsair_input_mapping, + .report_fixup = corsair_mouse_report_fixup, }; module_hid_driver(corsair_driver); MODULE_LICENSE("GPL"); +/* Original K90 driver author */ MODULE_AUTHOR("Clement Vuchener"); +/* Scimitar PRO RGB driver author */ +MODULE_AUTHOR("Oscar Campos") MODULE_DESCRIPTION("HID driver for Corsair devices");
From 6e5364f5f472d4ea66d37459e299071b0190362c Mon Sep 17 00:00:00 2001 From: Ping Cheng Date: Tue, 14 Mar 2017 17:08:16 -0700 Subject: [PATCH 245/297] HID: wacom: generic: Wacom mouse is only provided for opaque tablets Commit f85c9dc ("Support tool ID and additional tool types") introduced mouse and lens cursor tools to the generic codepath, which covers both display (direct) and opaque tablets (indirect devices). However, mouse and lens cursor tools are only provided for opaque tablets. This patch ignores mouse and lens cursor tools if the device is a display tablet.
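The direct/indirect distinction mirrors how input devices describe themselves to userspace; a sketch of that convention (illustrative only — the actual gating in the diff below keys off features->device_type):

/* display tablets: the pen touches the surface it draws on */
if (features->device_type & WACOM_DEVICETYPE_DIRECT)
	__set_bit(INPUT_PROP_DIRECT, input->propbit);
else	/* opaque desk tablets: cursor-style tools make sense */
	__set_bit(INPUT_PROP_POINTER, input->propbit);

A mouse or lens cursor only exists on opaque tablets, where the tool drives an on-screen cursor; on a display tablet the pen location already is the screen location, so BTN_TOOL_MOUSE and BTN_TOOL_LENS are skipped for direct devices.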
Signed-off-by: Ping Cheng Reviewed-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 3d864466d473..94250c293be2 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1959,8 +1959,10 @@ static void wacom_wac_pen_usage_mapping(struct hid_device *hdev, input_set_capability(input, EV_KEY, BTN_TOOL_BRUSH); input_set_capability(input, EV_KEY, BTN_TOOL_PENCIL); input_set_capability(input, EV_KEY, BTN_TOOL_AIRBRUSH); - input_set_capability(input, EV_KEY, BTN_TOOL_MOUSE); - input_set_capability(input, EV_KEY, BTN_TOOL_LENS); + if (!(features->device_type & WACOM_DEVICETYPE_DIRECT)) { + input_set_capability(input, EV_KEY, BTN_TOOL_MOUSE); + input_set_capability(input, EV_KEY, BTN_TOOL_LENS); + } break; case WACOM_HID_WD_FINGERWHEEL: wacom_map_usage(input, usage, field, EV_ABS, ABS_WHEEL, 0); From 093b995e3b55a0ae0670226ddfcb05bfbf0099ae Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 20 Mar 2017 14:26:42 +0800 Subject: [PATCH 246/297] mm, swap: Remove WARN_ON_ONCE() in free_swap_slot() Before commit 452b94b8c8c7 ("mm/swap: don't BUG_ON() due to uninitialized swap slot cache"), the following bug is reported, ------------[ cut here ]------------ kernel BUG at mm/swap_slots.c:270! invalid opcode: 0000 [#1] SMP CPU: 5 PID: 1745 Comm: (sd-pam) Not tainted 4.11.0-rc1-00243-g24c534bb161b #1 Hardware name: System manufacturer System Product Name/Z170-K, BIOS 1803 05/06/2016 RIP: 0010:free_swap_slot+0xba/0xd0 Call Trace: swap_free+0x36/0x40 do_swap_page+0x360/0x6d0 __handle_mm_fault+0x880/0x1080 handle_mm_fault+0xd0/0x240 __do_page_fault+0x232/0x4d0 do_page_fault+0x20/0x70 page_fault+0x22/0x30 ---[ end trace aefc9ede53e0ab21 ]--- This is raised by the BUG_ON(!swap_slot_cache_initialized) in free_swap_slot(). This is incorrect, because even if the swap slots cache fails to be initialized, the swap should operate properly without the swap slots cache. And the use_swap_slot_cache check later in the function will protect the uninitialized swap slots cache case. In commit 452b94b8c8c7, the BUG_ON() is replaced by WARN_ON_ONCE(). In the patch, the WARN_ON_ONCE() is removed too. Reported-by: Linus Torvalds Acked-by: Tim Chen Cc: Michal Hocko Signed-off-by: "Huang, Ying" Signed-off-by: Linus Torvalds --- mm/swap_slots.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 7ebb23836f68..b1ccb58ad397 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -267,8 +267,6 @@ int free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; - WARN_ON_ONCE(!swap_slot_cache_initialized); - cache = &get_cpu_var(swp_slots); if (use_swap_slot_cache && cache->slots_ret) { spin_lock_irq(&cache->free_lock); From 98d068ab52b4b11d403995ed14154660797e7136 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 14 Mar 2017 14:15:20 +0800 Subject: [PATCH 247/297] r8152: fix the list rx_done may be used without initialization The list rx_done would be initialized when the linking on occurs. Therefore, if a napi is scheduled without any linking on before, the following kernel panic would happen. BUG: unable to handle kernel NULL pointer dereference at 000000000000008 IP: [] r8152_poll+0xe1e/0x1210 [r8152] PGD 0 Oops: 0002 [#1] SMP Signed-off-by: Hayes Wang Signed-off-by: David S. 
Miller --- drivers/net/usb/r8152.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 986243c932cc..bb3eedd07fbe 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1362,6 +1362,7 @@ static int alloc_all_mem(struct r8152 *tp) spin_lock_init(&tp->rx_lock); spin_lock_init(&tp->tx_lock); INIT_LIST_HEAD(&tp->tx_free); + INIT_LIST_HEAD(&tp->rx_done); skb_queue_head_init(&tp->tx_queue); skb_queue_head_init(&tp->rx_queue);
From 8a0f5ccfb33b0b8b51de65b7b3bf342ba10b4fb6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 14 Mar 2017 18:25:57 +0800 Subject: [PATCH 248/297] crypto: deadlock between crypto_alg_sem/rtnl_mutex/genl_mutex On Tue, Mar 14, 2017 at 10:44:10AM +0100, Dmitry Vyukov wrote: > > Yes, please. > Disregarding some reports is not a good way long term. Please try this patch. ---8<--- Subject: netlink: Annotate nlk cb_mutex by protocol Currently all occurrences of nlk->cb_mutex are annotated by lockdep as a single class. This causes a false lockdep cycle involving genl and crypto_user. This patch fixes it by dividing cb_mutex into individual classes based on the netlink protocol. As genl and crypto_user do not use the same netlink protocol, this breaks the false dependency loop. Reported-by: Dmitry Vyukov Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7b73c7c161a9..596eaff66649 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -96,6 +96,44 @@ EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); +static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; + +static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { + "nlk_cb_mutex-ROUTE", + "nlk_cb_mutex-1", + "nlk_cb_mutex-USERSOCK", + "nlk_cb_mutex-FIREWALL", + "nlk_cb_mutex-SOCK_DIAG", + "nlk_cb_mutex-NFLOG", + "nlk_cb_mutex-XFRM", + "nlk_cb_mutex-SELINUX", + "nlk_cb_mutex-ISCSI", + "nlk_cb_mutex-AUDIT", + "nlk_cb_mutex-FIB_LOOKUP", + "nlk_cb_mutex-CONNECTOR", + "nlk_cb_mutex-NETFILTER", + "nlk_cb_mutex-IP6_FW", + "nlk_cb_mutex-DNRTMSG", + "nlk_cb_mutex-KOBJECT_UEVENT", + "nlk_cb_mutex-GENERIC", + "nlk_cb_mutex-17", + "nlk_cb_mutex-SCSITRANSPORT", + "nlk_cb_mutex-ECRYPTFS", + "nlk_cb_mutex-RDMA", + "nlk_cb_mutex-CRYPTO", + "nlk_cb_mutex-SMC", + "nlk_cb_mutex-23", + "nlk_cb_mutex-24", + "nlk_cb_mutex-25", + "nlk_cb_mutex-26", + "nlk_cb_mutex-27", + "nlk_cb_mutex-28", + "nlk_cb_mutex-29", + "nlk_cb_mutex-30", + "nlk_cb_mutex-31", + "nlk_cb_mutex-MAX_LINKS" +}; + static int netlink_dump(struct sock *sk); static void netlink_skb_destructor(struct sk_buff *skb); @@ -585,6 +623,9 @@ static int __netlink_create(struct net *net, struct socket *sock, } else { nlk->cb_mutex = &nlk->cb_def_mutex; mutex_init(nlk->cb_mutex); + lockdep_set_class_and_name(nlk->cb_mutex, + nlk_cb_mutex_keys + protocol, + nlk_cb_mutex_key_strings[protocol]); } init_waitqueue_head(&nlk->wait);
From 36d277bac8080202684e67162ebb157f16631581 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:14 +0800 Subject: [PATCH 249/297] vsock: track pkt owner vsock So that we can cancel a queued pkt later if necessary. Signed-off-by: Peng Tao Signed-off-by: David S.
Miller --- include/linux/virtio_vsock.h | 3 +++ net/vmw_vsock/virtio_transport_common.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 9638bfeb0d1f..584f9a647ad4 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -48,6 +48,8 @@ struct virtio_vsock_pkt { struct virtio_vsock_hdr hdr; struct work_struct work; struct list_head list; + /* socket refcnt not held, only use for cancellation */ + struct vsock_sock *vsk; void *buf; u32 len; u32 off; @@ -56,6 +58,7 @@ struct virtio_vsock_pkt { struct virtio_vsock_pkt_info { u32 remote_cid, remote_port; + struct vsock_sock *vsk; struct msghdr *msg; u32 pkt_len; u16 type; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 8d592a45b597..af087b44ceea 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -58,6 +58,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, pkt->len = len; pkt->hdr.len = cpu_to_le32(len); pkt->reply = info->reply; + pkt->vsk = info->vsk; if (info->msg && len > 0) { pkt->buf = kmalloc(len, GFP_KERNEL); @@ -180,6 +181,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, .type = type, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -519,6 +521,7 @@ int virtio_transport_connect(struct vsock_sock *vsk) struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, .type = VIRTIO_VSOCK_TYPE_STREAM, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -534,6 +537,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | (mode & SEND_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -560,6 +564,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk, .type = VIRTIO_VSOCK_TYPE_STREAM, .msg = msg, .pkt_len = len, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -581,6 +586,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk, .op = VIRTIO_VSOCK_OP_RST, .type = VIRTIO_VSOCK_TYPE_STREAM, .reply = !!pkt, + .vsk = vsk, }; /* Send RST only if the original pkt is not a RST pkt */ @@ -826,6 +832,7 @@ virtio_transport_send_response(struct vsock_sock *vsk, .remote_cid = le64_to_cpu(pkt->hdr.src_cid), .remote_port = le32_to_cpu(pkt->hdr.src_port), .reply = true, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); From 16320f363ae128d9b9c70e60f00f2a572f57c23d Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:15 +0800 Subject: [PATCH 250/297] vhost-vsock: add pkt cancel capability To allow canceling all packets of a connection. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Peng Tao Signed-off-by: David S. 
Miller --- drivers/vhost/vsock.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/net/af_vsock.h | 3 +++ 2 files changed, 44 insertions(+) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index ce5e63d2c66a..44eed8eb0725 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -223,6 +223,46 @@ vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) return len; } +static int +vhost_transport_cancel_pkt(struct vsock_sock *vsk) +{ + struct vhost_vsock *vsock; + struct virtio_vsock_pkt *pkt, *n; + int cnt = 0; + LIST_HEAD(freeme); + + /* Find the vhost_vsock according to guest context id */ + vsock = vhost_vsock_get(vsk->remote_addr.svm_cid); + if (!vsock) + return -ENODEV; + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + if (pkt->reply) + cnt++; + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + if (cnt) { + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; + int new_cnt; + + new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); + if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num) + vhost_poll_queue(&tx_vq->poll); + } + + return 0; +} + static struct virtio_vsock_pkt * vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, unsigned int out, unsigned int in) @@ -675,6 +715,7 @@ static struct virtio_transport vhost_transport = { .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, + .cancel_pkt = vhost_transport_cancel_pkt, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_dequeue = virtio_transport_dgram_dequeue, diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f2758964ce6f..f32ed9ac181a 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -100,6 +100,9 @@ struct vsock_transport { void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); + /* Cancel all pending packets sent on vsock. */ + int (*cancel_pkt)(struct vsock_sock *vsk); + /* Connections. */ int (*connect)(struct vsock_sock *); From 073b4f2c50fe67c7c66a059a4d6db52bb1465490 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:16 +0800 Subject: [PATCH 251/297] vsock: add pkt cancel capability Reviewed-by: Stefan Hajnoczi Signed-off-by: Peng Tao Signed-off-by: David S. 
Miller --- net/vmw_vsock/virtio_transport.c | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 9d24c0e958b1..68675a151f22 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -213,6 +213,47 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) return len; } +static int +virtio_transport_cancel_pkt(struct vsock_sock *vsk) +{ + struct virtio_vsock *vsock; + struct virtio_vsock_pkt *pkt, *n; + int cnt = 0; + LIST_HEAD(freeme); + + vsock = virtio_vsock_get(); + if (!vsock) { + return -ENODEV; + } + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + if (pkt->reply) + cnt++; + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + if (cnt) { + struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; + int new_cnt; + + new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); + if (new_cnt + cnt >= virtqueue_get_vring_size(rx_vq) && + new_cnt < virtqueue_get_vring_size(rx_vq)) + queue_work(virtio_vsock_workqueue, &vsock->rx_work); + } + + return 0; +} + static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) { int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; @@ -462,6 +503,7 @@ static struct virtio_transport virtio_transport = { .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, + .cancel_pkt = virtio_transport_cancel_pkt, .dgram_bind = virtio_transport_dgram_bind, .dgram_dequeue = virtio_transport_dgram_dequeue, From 380feae0def7e6a115124a3219c3ec9b654dca32 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:17 +0800 Subject: [PATCH 252/297] vsock: cancel packets when failing to connect Otherwise we'll leave the packets queued until releasing vsock device. E.g., if guest is slow to start up, resulting ETIMEDOUT on connect, guest will get the connect requests from failed host sockets. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Peng Tao Signed-off-by: David S. 
Miller --- net/vmw_vsock/af_vsock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9f770f33c100..6f7f6757ceef 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1102,10 +1102,19 @@ static const struct proto_ops vsock_dgram_ops = { .sendpage = sock_no_sendpage, }; +static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) +{ + if (!transport->cancel_pkt) + return -EOPNOTSUPP; + + return transport->cancel_pkt(vsk); +} + static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; + int cancel = 0; vsk = container_of(work, struct vsock_sock, dwork.work); sk = sk_vsock(vsk); @@ -1116,8 +1125,11 @@ static void vsock_connect_timeout(struct work_struct *work) sk->sk_state = SS_UNCONNECTED; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); + cancel = 1; } release_sock(sk); + if (cancel) + vsock_transport_cancel_pkt(vsk); sock_put(sk); } @@ -1224,11 +1236,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, err = sock_intr_errno(timeout); sk->sk_state = SS_UNCONNECTED; sock->state = SS_UNCONNECTED; + vsock_transport_cancel_pkt(vsk); goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; sk->sk_state = SS_UNCONNECTED; sock->state = SS_UNCONNECTED; + vsock_transport_cancel_pkt(vsk); goto out_wait; }
From 7df9c24625b9981779afb8fcdbe2bb4765e61147 Mon Sep 17 00:00:00 2001 From: Andrey Ulanov Date: Tue, 14 Mar 2017 20:16:42 -0700 Subject: [PATCH 253/297] net: unix: properly re-increment inflight counter of GC discarded candidates Dmitry has reported that a BUG_ON() condition in unix_notinflight() may be triggered by simple code that forwards a unix socket in an SCM_RIGHTS message. That is caused by an incorrect unix socket GC implementation in unix_gc(). The GC first collects a list of candidates, then (a) decrements their "children's" inflight counters, (b) checks which inflight counters are now 0, and then (c) increments all inflight counters back. (a) and (c) are done by calling scan_children() with dec_inflight or inc_inflight as the second argument. Commit 6209344f5a37 ("net: unix: fix inflight counting bug in garbage collector") changed scan_children() such that it no longer considers sockets that do not have the UNIX_GC_CANDIDATE flag. It also added a block of code that unsets this flag _before_ invoking scan_children(, dec_inflight, ). This may lead to incorrect inflight counters for some sockets. This change fixes this bug by changing the order of operations: UNIX_GC_CANDIDATE is now unset only after all inflight counters are restored to the original state. kernel BUG at net/unix/garbage.c:149!
RIP: 0010:[] [] unix_notinflight+0x3b4/0x490 net/unix/garbage.c:149 Call Trace: [] unix_detach_fds.isra.19+0xff/0x170 net/unix/af_unix.c:1487 [] unix_destruct_scm+0xf9/0x210 net/unix/af_unix.c:1496 [] skb_release_head_state+0x101/0x200 net/core/skbuff.c:655 [] skb_release_all+0x1a/0x60 net/core/skbuff.c:668 [] __kfree_skb+0x1a/0x30 net/core/skbuff.c:684 [] kfree_skb+0x184/0x570 net/core/skbuff.c:705 [] unix_release_sock+0x5b5/0xbd0 net/unix/af_unix.c:559 [] unix_release+0x49/0x90 net/unix/af_unix.c:836 [] sock_release+0x92/0x1f0 net/socket.c:570 [] sock_close+0x1b/0x20 net/socket.c:1017 [] __fput+0x34e/0x910 fs/file_table.c:208 [] ____fput+0x1a/0x20 fs/file_table.c:244 [] task_work_run+0x1a0/0x280 kernel/task_work.c:116 [< inline >] exit_task_work include/linux/task_work.h:21 [] do_exit+0x183a/0x2640 kernel/exit.c:828 [] do_group_exit+0x14e/0x420 kernel/exit.c:931 [] get_signal+0x663/0x1880 kernel/signal.c:2307 [] do_signal+0xc5/0x2190 arch/x86/kernel/signal.c:807 [] exit_to_usermode_loop+0x1ea/0x2d0 arch/x86/entry/common.c:156 [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190 [] syscall_return_slowpath+0x4d3/0x570 arch/x86/entry/common.c:259 [] entry_SYSCALL_64_fastpath+0xc4/0xc6 Link: https://lkml.org/lkml/2017/3/6/252 Signed-off-by: Andrey Ulanov Reported-by: Dmitry Vyukov Fixes: 6209344 ("net: unix: fix inflight counting bug in garbage collector") Signed-off-by: David S. Miller --- net/unix/garbage.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6a0d48525fcf..c36757e72844 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -146,6 +146,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp) if (s) { struct unix_sock *u = unix_sk(s); + BUG_ON(!atomic_long_read(&u->inflight)); BUG_ON(list_empty(&u->link)); if (atomic_long_dec_and_test(&u->inflight)) @@ -341,6 +342,14 @@ void unix_gc(void) } list_del(&cursor); + /* Now gc_candidates contains only garbage. Restore original + * inflight counters for these as well, and remove the skbuffs + * which are creating the cycle(s). + */ + skb_queue_head_init(&hitlist); + list_for_each_entry(u, &gc_candidates, link) + scan_children(&u->sk, inc_inflight, &hitlist); + /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ @@ -350,14 +359,6 @@ void unix_gc(void) list_move_tail(&u->link, &gc_inflight_list); } - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, inc_inflight, &hitlist); - spin_unlock(&unix_gc_lock); /* Here we are. Hitlist is filled. Die. */
From 09050957fae896e001498af1aa35c446a11cb47d Mon Sep 17 00:00:00 2001 From: Yaroslav Isakov Date: Thu, 16 Mar 2017 22:44:10 +0300 Subject: [PATCH 254/297] tun: fix inability to set offloads after disabling them via ethtool Add the missing logic in the tun driver; without it, apps could not set offloads via the tun ioctl if offloads had previously been disabled via ethtool. Signed-off-by: Yaroslav Isakov Signed-off-by: David S.
Miller --- drivers/net/tun.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 34cc3c590aa5..cc88cd7856f5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1931,6 +1931,8 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) return -EINVAL; tun->set_features = features; + tun->dev->wanted_features &= ~TUN_USER_FEATURES; + tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; From aea92fb2e09e29653b023d4254ac9fbf94221538 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2017 08:05:28 -0700 Subject: [PATCH 255/297] sch_dsmark: fix invalid skb_cow() usage skb_cow(skb, sizeof(ip header)) is not very helpful in this context. First we need to use pskb_may_pull() to make sure the ip header is in skb linear part, then use skb_try_make_writable() to address clones issues. Fixes: 4c30719f4f55 ("[PKT_SCHED] dsmark: handle cloned and non-linear skb's") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 802ac7c2e5e8..5334e309f17f 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -201,9 +201,13 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); if (p->set_tc_index) { + int wlen = skb_network_offset(skb); + switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): - if (skb_cow_head(skb, sizeof(struct iphdr))) + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) goto drop; skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) @@ -211,7 +215,9 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; case htons(ETH_P_IPV6): - if (skb_cow_head(skb, sizeof(struct ipv6hdr))) + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) goto drop; skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) From 6bd845d1cf98b45c634baacb8381436dad3c2dd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 17 Mar 2017 17:20:48 +0100 Subject: [PATCH 256/297] qmi_wwan: add Dell DW5811e MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a Dell branded Sierra Wireless EM7455. It is operating in MBIM mode by default, but can be configured to provide two QMI/RMNET functions. Signed-off-by: Bjørn Mork Signed-off-by: David S. 
Miller --- drivers/net/usb/qmi_wwan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 805674550683..f8d55aa058ec 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -925,6 +925,8 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x413c, 0x81a9, 8)}, /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b1, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b3, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ + {QMI_FIXED_INTF(0x413c, 0x81b6, 8)}, /* Dell Wireless 5811e */ + {QMI_FIXED_INTF(0x413c, 0x81b6, 10)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ {QMI_FIXED_INTF(0x1e0e, 0x9001, 5)}, /* SIMCom 7230E */ From 13e2d5187f6b965ba3556caedb914baf81b98ed2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 17 Mar 2017 23:52:35 +0300 Subject: [PATCH 257/297] bna: integer overflow bug in debugfs We could allocate less memory than intended because we do: bnad->regdata = kzalloc(len << 2, GFP_KERNEL); The shift can overflow leading to a crash. This is debugfs code so the impact is very small. Fixes: 7afc5dbde091 ("bna: Add debugfs interface.") Signed-off-by: Dan Carpenter Acked-by: Rasesh Mody Signed-off-by: David S. Miller --- drivers/net/ethernet/brocade/bna/bnad_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad_debugfs.c b/drivers/net/ethernet/brocade/bna/bnad_debugfs.c index 05c1c1dd7751..cebfe3bd086e 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_debugfs.c +++ b/drivers/net/ethernet/brocade/bna/bnad_debugfs.c @@ -325,7 +325,7 @@ bnad_debugfs_write_regrd(struct file *file, const char __user *buf, return PTR_ERR(kern_buf); rc = sscanf(kern_buf, "%x:%x", &addr, &len); - if (rc < 2) { + if (rc < 2 || len > UINT_MAX >> 2) { netdev_warn(bnad->netdev, "failed to read user buffer\n"); kfree(kern_buf); return -EINVAL; From 3dc857f0e8fc22610a59cbb346ba62c6e921863f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 17 Mar 2017 16:07:11 -0700 Subject: [PATCH 258/297] net: vrf: Reset rt6i_idev in local dst after put The VRF driver takes a reference to the inet6_dev on the VRF device for its rt6_local dst when handling local traffic through the VRF device as a loopback. When the device is deleted the driver does a put on the idev but does not reset rt6i_idev in the rt6_info struct. When the dst is destroyed, dst_destroy calls ip6_dst_destroy which does a second put for what is essentially the same reference causing it to be prematurely freed. Reset rt6i_idev after the put in the vrf driver. Fixes: b4869aa2f881e ("net: vrf: ipv6 support for local traffic to local addresses") Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/vrf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index fea687f35b5a..d6988db1930d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -462,8 +462,10 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) } if (rt6_local) { - if (rt6_local->rt6i_idev) + if (rt6_local->rt6i_idev) { in6_dev_put(rt6_local->rt6i_idev); + rt6_local->rt6i_idev = NULL; + } dst = &rt6_local->dst; dev_put(dst->dev); From 486a43db2e26b87125b5629e1ade516f90833934 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 18 Mar 2017 19:12:22 +0800 Subject: [PATCH 259/297] sctp: remove temporary variable confirm from sctp_packet_transmit Commit c86a773c7802 ("sctp: add dst_pending_confirm flag") introduced a temporary variable "confirm" in sctp_packet_transmit. But it broke the rule that longer lines should be above shorter ones. Besides, this variable is not necessary, so this patch is to just remove it and use tp->dst_pending_confirm directly. Fixes: c86a773c7802 ("sctp: add dst_pending_confirm flag") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/output.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sctp/output.c b/net/sctp/output.c index 71ce6b945dcb..1224421036b3 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -546,7 +546,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; - int confirm; struct dst_entry *dst; struct sk_buff *head; struct sctphdr *sh; @@ -625,13 +624,13 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; - confirm = tp->dst_pending_confirm; - if (confirm) + if (tp->dst_pending_confirm) skb_set_dst_pending_confirm(head, 1); /* neighbour should be confirmed on successful transmission or * positive error */ - if (tp->af_specific->sctp_xmit(head, tp) >= 0 && confirm) + if (tp->af_specific->sctp_xmit(head, tp) >= 0 && + tp->dst_pending_confirm) tp->dst_pending_confirm = 0; out: From 1f904495b79003cd3d881de8731377d48fcbc7e3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 18 Mar 2017 19:27:23 +0800 Subject: [PATCH 260/297] sctp: define dst_pending_confirm as a bit in sctp_transport As tp->dst_pending_confirm's value can only be set to 0 or 1, this patch changes it to be defined as a bit instead of a __u32. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 07a0b128625a..4f645198e9bd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -753,6 +753,8 @@ struct sctp_transport { /* Is the Path MTU update pending on this tranport */ pmtu_pending:1, + dst_pending_confirm:1, /* need to confirm neighbour */ + /* Has this transport moved the ctsn since we last sacked */ sack_generation:1; u32 dst_cookie; @@ -806,8 +808,6 @@ struct sctp_transport { __u32 burst_limited; /* Holds old cwnd when max.burst is applied */ - __u32 dst_pending_confirm; /* need to confirm neighbour */ - /* Destination */ struct dst_entry *dst; /* Source address.
*/ From 23bb09cfbe04076ef647da3889a5a5ab6cbe6f15 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 18 Mar 2017 20:03:59 +0800 Subject: [PATCH 261/297] sctp: out_qlen should be updated when pruning unsent queue This patch is to fix the issue that sctp_prsctp_prune_unsent forgot to update q->out_qlen when removing a chunk from the unsent queue. Fixes: 8dbdf1f5b09c ("sctp: implement prsctp PRIO policy") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index db352e5d61f8..025ccff67072 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -382,17 +382,18 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, } static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, - struct sctp_sndrcvinfo *sinfo, - struct list_head *queue, int msg_len) + struct sctp_sndrcvinfo *sinfo, int msg_len) { + struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; - list_for_each_entry_safe(chk, temp, queue, list) { + list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; list_del_init(&chk->list); + q->out_qlen -= chk->skb->len; asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; @@ -431,9 +432,7 @@ void sctp_prsctp_prune(struct sctp_association *asoc, return; } - sctp_prsctp_prune_unsent(asoc, sinfo, - &asoc->outqueue.out_chunk_list, - msg_len); + sctp_prsctp_prune_unsent(asoc, sinfo, msg_len); } /* Mark all the eligible packets on a transport for retransmission. */ From 8605330aac5a5785630aec8f64378a54891937cc Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 18 Mar 2017 17:02:59 -0400 Subject: [PATCH 262/297] tcp: fix SCM_TIMESTAMPING_OPT_STATS for normal skbs __sock_recv_timestamp can be called for both normal skbs (for receive timestamps) and for skbs on the error queue (for transmit timestamps). Commit 1c885808e456 (tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING) assumes any skb passed to __sock_recv_timestamp is from the error queue, containing OPT_STATS in the content of the skb. This results in accessing invalid memory or generating junk data. To fix this, set skb->pkt_type to PACKET_OUTGOING for packets on the error queue. This is safe because on the receive path on local sockets skb->pkt_type is never set to PACKET_OUTGOING. With that, copy OPT_STATS from a packet only if its pkt_type is PACKET_OUTGOING. Fixes: 1c885808e456 ("tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING") Reported-by: JongHwan Kim Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 ++++++++++ net/socket.c | 13 ++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cd4ba8c6b609..b1fbd1958eb6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3694,6 +3694,15 @@ static void sock_rmem_free(struct sk_buff *skb) atomic_sub(skb->truesize, &sk->sk_rmem_alloc); } +static void skb_set_err_queue(struct sk_buff *skb) +{ + /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. + * So, it is safe to (mis)use it to mark skbs on the error queue.
+ */ + skb->pkt_type = PACKET_OUTGOING; + BUILD_BUG_ON(PACKET_OUTGOING == 0); +} + /* * Note: We dont mem charge error packets (no sk_forward_alloc changes) */ @@ -3707,6 +3716,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); + skb_set_err_queue(skb); /* before exiting rcu section, make sure dst is refcounted */ skb_dst_force(skb); diff --git a/net/socket.c b/net/socket.c index e034fe4164be..692d6989d2c2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -652,6 +652,16 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL(kernel_sendmsg); +static bool skb_is_err_queue(const struct sk_buff *skb) +{ + /* pkt_type of skbs enqueued on the error queue are set to + * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do + * in recvmsg, since skbs received on a local socket will never + * have a pkt_type of PACKET_OUTGOING. + */ + return skb->pkt_type == PACKET_OUTGOING; +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ @@ -695,7 +705,8 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); - if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) + if (skb_is_err_queue(skb) && skb->len && + (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } From 4ef1b2869447411ad3ef91ad7d4891a83c1a509a Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 18 Mar 2017 17:03:00 -0400 Subject: [PATCH 263/297] tcp: mark skbs with SCM_TIMESTAMPING_OPT_STATS SOF_TIMESTAMPING_OPT_STATS can be enabled and disabled while packets are collected on the error queue. So, checking SOF_TIMESTAMPING_OPT_STATS in sk->sk_tsflags is not enough to safely assume that the skb contains OPT_STATS data. Add a bit in sock_exterr_skb to indicate whether the skb contains opt_stats data. Fixes: 1c885808e456 ("tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING") Reported-by: JongHwan Kim Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/linux/errqueue.h | 2 ++ net/core/skbuff.c | 17 +++++++++++------ net/socket.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/errqueue.h b/include/linux/errqueue.h index 9ca23fcfb5d7..6fdfc884fdeb 100644 --- a/include/linux/errqueue.h +++ b/include/linux/errqueue.h @@ -20,6 +20,8 @@ struct sock_exterr_skb { struct sock_extended_err ee; u16 addr_offset; __be16 port; + u8 opt_stats:1, + unused:7; }; #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b1fbd1958eb6..9f781092fda9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3793,16 +3793,20 @@ EXPORT_SYMBOL(skb_clone_sk); static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, - int tstype) + int tstype, + bool opt_stats) { struct sock_exterr_skb *serr; int err; + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; + serr->opt_stats = opt_stats; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk->sk_protocol == IPPROTO_TCP && @@ -3843,7 +3847,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, */ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); } } @@ -3854,7 +3858,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; - bool tsonly; + bool tsonly, opt_stats = false; if (!sk) return; @@ -3867,9 +3871,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, #ifdef CONFIG_INET if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) + sk->sk_type == SOCK_STREAM) { skb = tcp_get_timestamping_opt_stats(sk); - else + opt_stats = true; + } else #endif skb = alloc_skb(0, GFP_ATOMIC); } else { @@ -3888,7 +3893,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, else skb->tstamp = ktime_get_real(); - __skb_complete_tx_timestamp(skb, sk, tstype); + __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); diff --git a/net/socket.c b/net/socket.c index 692d6989d2c2..985ef06792d6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -706,7 +706,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, SCM_TIMESTAMPING, sizeof(tss), &tss); if (skb_is_err_queue(skb) && skb->len && - (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) + SKB_EXT_ERR(skb)->opt_stats) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } From e8f1f34a344d060eaf1918089369c4c1172a153b Mon Sep 17 00:00:00 2001 From: Zi Shen Lim Date: Sun, 19 Mar 2017 23:03:14 -0700 Subject: [PATCH 264/297] selftests/bpf: fix broken build, take 2 Merge of 'linux-kselftest-4.11-rc1': 1. Partially removed use of 'test_objs' target, breaking force rebuild of BPFOBJ, introduced in commit d498f8719a09 ("bpf: Rebuild bpf.o for any dependency update"). Update target so dependency on BPFOBJ is restored. 2. Introduced commit 2047f1d8ba28 ("selftests: Fix the .c linking rule") which fixes order of LDLIBS. Commit d02d8986a768 ("bpf: Always test unprivileged programs") added libcap dependency into CFLAGS. Use LDLIBS instead to fix linking of test_verifier. 3. Introduced commit d83c3ba0b926 ("selftests: Fix selftests build to just build, not run tests"). 
Reordering the Makefile allows us to remove the 'all' target. Tested both: selftests/bpf$ make and selftests$ make TARGETS=bpf on Ubuntu 16.04.2. Signed-off-by: Zi Shen Lim Acked-by: Daniel Borkmann Tested-by: Daniel Borkmann Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Acked-by: Shuah Khan Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/Makefile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 67531f47781b..6a1ad58cb66f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,22 +1,23 @@ LIBDIR := ../../../lib -BPFOBJ := $(LIBDIR)/bpf/bpf.o +BPFDIR := $(LIBDIR)/bpf -CFLAGS += -Wall -O2 -lcap -I../../../include/uapi -I$(LIBDIR) $(BPFOBJ) +CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR) +LDLIBS += -lcap TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map TEST_PROGS := test_kmod.sh -all: $(TEST_GEN_PROGS) +include ../lib.mk -.PHONY: all clean force +BPFOBJ := $(OUTPUT)/bpf.o + +$(TEST_GEN_PROGS): $(BPFOBJ) + +.PHONY: force # force a rebuild of BPFOBJ when its dependencies are updated force: $(BPFOBJ): force - $(MAKE) -C $(dir $(BPFOBJ)) - -$(test_objs): $(BPFOBJ) - -include ../lib.mk + $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ From 4071898bf0f4d79ff353db327af2a15123272548 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Sun, 19 Mar 2017 09:19:57 -0700 Subject: [PATCH 265/297] net: qmi_wwan: Add USB IDs for MDM6600 modem on Motorola Droid 4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gets qmicli working with the MDM6600 modem. Cc: Bjørn Mork Reviewed-by: Sebastian Reichel Tested-by: Sebastian Reichel Signed-off-by: Tony Lindgren Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index f8d55aa058ec..156f7f85e486 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -580,6 +580,10 @@ static const struct usb_device_id products[] = { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 0x01, 0x69), .driver_info = (unsigned long)&qmi_wwan_info, }, + { /* Motorola Mapphone devices with MDM6600 */ + USB_VENDOR_AND_INTERFACE_INFO(0x22b8, USB_CLASS_VENDOR_SPEC, 0xfb, 0xff), + .driver_info = (unsigned long)&qmi_wwan_info, + }, /* 2. Combined interface devices matching on class+protocol */ { /* Huawei E367 and possibly others in "Windows mode" */ From a05d4fd9176003e0c1f9c3d083f4dac19fd346ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Mar 2017 19:25:56 -0400 Subject: [PATCH 266/297] cgroup, net_cls: iterate the fds of only the tasks which are being migrated The net_cls controller controls the classid field of each socket which is associated with the cgroup. Because the classid is a per-socket attribute, when a task migrates to another cgroup or the configured classid of the cgroup changes, the controller needs to walk all sockets and update the classid value, which was implemented by 3b13758f51de ("cgroups: Allow dynamically changing net_classid"). While the approach is not scalable, migrating tasks which have a lot of fds attached to them is rare and the cost is borne by the ones initiating the operations.
However, for simplicity, both the migration and classid config change paths call update_classid(), which scans all fds of all tasks in the target css. This is overkill for the migration path, which only needs to cover the much smaller subset of tasks which are actually getting migrated in. On cgroup v1, this can lead to unexpected scalability issues when one tries to migrate a task or process into a net_cls cgroup which already contains a lot of fds. Even if the migration target doesn't have many fds to be scanned, update_classid() ends up scanning all fds in the target cgroup, which can be extremely numerous. Unfortunately, on cgroup v2, which doesn't use net_cls, the problem is even worse. Before bfc2cf6f61fc ("cgroup: call subsys->*attach() only for subsystems which are actually affected by migration"), cgroup core would call the ->css_attach callback even for controllers which don't see actual migration to a different css. As net_cls is always disabled but still mounted on cgroup v2, whenever a process is migrated on the cgroup v2 hierarchy, net_cls sees identity migration from root to root and cgroup core used to call the ->css_attach callback for those. The net_cls ->css_attach ends up calling update_classid() on the root net_cls css, to which all processes on the system belong, as the controller isn't used. This makes any cgroup v2 migration O(total_number_of_fds_on_the_system), which is horrible and easily leads to noticeable stalls triggering RCU stall warnings and so on. The worst symptom is already fixed in upstream by bfc2cf6f61fc ("cgroup: call subsys->*attach() only for subsystems which are actually affected by migration"); however, backporting that commit is too invasive and we want to avoid other cases too. This patch updates net_cls's cgrp_attach() to iterate the fds of only the processes which are actually getting migrated. This removes the surprising migration cost, which is dependent on the total number of fds in the target cgroup. As this leaves write_classid() the only user of update_classid(), open-code the helper into write_classid(). Reported-by: David Goode Fixes: 3b13758f51de ("cgroups: Allow dynamically changing net_classid") Cc: stable@vger.kernel.org # v4.4+ Cc: Nina Schiff Cc: David S. Miller Signed-off-by: Tejun Heo Signed-off-by: David S.
Miller --- net/core/netclassid_cgroup.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6ae56037bb13..029a61ac6cdd 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -71,27 +71,17 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) return 0; } -static void update_classid(struct cgroup_subsys_state *css, void *v) -{ - struct css_task_iter it; - struct task_struct *p; - - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { - task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, v); - task_unlock(p); - } - css_task_iter_end(&it); -} - static void cgrp_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; + struct task_struct *p; - cgroup_taskset_first(tset, &css); - update_classid(css, - (void *)(unsigned long)css_cls_state(css)->classid); + cgroup_taskset_for_each(p, css, tset) { + task_lock(p); + iterate_fd(p->files, 0, update_classid_sock, + (void *)(unsigned long)css_cls_state(css)->classid); + task_unlock(p); + } } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) @@ -103,12 +93,22 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { struct cgroup_cls_state *cs = css_cls_state(css); + struct css_task_iter it; + struct task_struct *p; cgroup_sk_alloc_disable(); cs->classid = (u32)value; - update_classid(css, (void *)(unsigned long)cs->classid); + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { + task_lock(p); + iterate_fd(p->files, 0, update_classid_sock, + (void *)(unsigned long)cs->classid); + task_unlock(p); + } + css_task_iter_end(&it); + return 0; } From 210c4f70b4c630b27f0840c8043c138c955edc9e Mon Sep 17 00:00:00 2001 From: hayeswang Date: Mon, 20 Mar 2017 16:13:44 +0800 Subject: [PATCH 267/297] r8152: set the RMS of RTL8153 according to the mtu Set the received maximum size (RMS) according to the mtu size. It is unnecessary to receive a packet which is more than the size we could transmit. Besides, this could let the rx buffer be used effectively. Signed-off-by: Hayes Wang Signed-off-by: David S. 
Miller --- drivers/net/usb/r8152.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index bb3eedd07fbe..525c25817013 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2899,7 +2899,8 @@ static void r8153_first_init(struct r8152 *tp) rtl_rx_vlan_en(tp, tp->netdev->features & NETIF_F_HW_VLAN_CTAG_RX); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, RTL8153_RMS); + ocp_data = tp->netdev->mtu + VLAN_ETH_HLEN + CRC_SIZE; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, ocp_data); ocp_write_byte(tp, MCU_TYPE_PLA, PLA_MTPS, MTPS_JUMBO); ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_TCR0); @@ -2951,7 +2952,8 @@ static void r8153_enter_oob(struct r8152 *tp) usleep_range(1000, 2000); } - ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, RTL8153_RMS); + ocp_data = tp->netdev->mtu + VLAN_ETH_HLEN + CRC_SIZE; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, ocp_data); ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_TEREDO_CFG); ocp_data &= ~TEREDO_WAKE_MASK; @@ -4201,8 +4203,14 @@ static int rtl8152_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; - if (netif_running(dev) && netif_carrier_ok(dev)) - r8153_set_rx_early_size(tp); + if (netif_running(dev)) { + u32 rms = new_mtu + VLAN_ETH_HLEN + CRC_SIZE; + + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, rms); + + if (netif_carrier_ok(dev)) + r8153_set_rx_early_size(tp); + } mutex_unlock(&tp->control); From b20cb60e2b865638459e6ec82ad3536d3734e555 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Mon, 20 Mar 2017 16:13:45 +0800 Subject: [PATCH 268/297] r8152: fix the rx early size of RTL8153 revert commit a59e6d815226 ("r8152: correct the rx early size") and fix the rx early size as (rx buffer size - rx packet size - rx desc size - alignment) / 4 Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 525c25817013..0b1b9188625d 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -32,7 +32,7 @@ #define NETNEXT_VERSION "08" /* Information for net */ -#define NET_VERSION "8" +#define NET_VERSION "9" #define DRIVER_VERSION "v1." NETNEXT_VERSION "." NET_VERSION #define DRIVER_AUTHOR "Realtek linux nic maintainers " @@ -501,6 +501,8 @@ enum rtl_register_content { #define RTL8153_RMS RTL8153_MAX_PACKET #define RTL8152_TX_TIMEOUT (5 * HZ) #define RTL8152_NAPI_WEIGHT 64 +#define rx_reserved_size(x) ((x) + VLAN_ETH_HLEN + CRC_SIZE + \ + sizeof(struct rx_desc) + RX_ALIGN) /* rtl8152 flags */ enum rtl8152_flags { @@ -2253,8 +2255,7 @@ static void r8153_set_rx_early_timeout(struct r8152 *tp) static void r8153_set_rx_early_size(struct r8152 *tp) { - u32 mtu = tp->netdev->mtu; - u32 ocp_data = (agg_buf_sz - mtu - VLAN_ETH_HLEN - VLAN_HLEN) / 8; + u32 ocp_data = (agg_buf_sz - rx_reserved_size(tp->netdev->mtu)) / 4; ocp_write_word(tp, MCU_TYPE_USB, USB_RX_EARLY_SIZE, ocp_data); } From be9ca0d33c850192198c22518eeb1f41401268e8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Mar 2017 09:52:50 +0100 Subject: [PATCH 269/297] cpsw/netcp: work around reverse cpts dependency The dependency is reversed: cpsw and netcp call into cpts, but cpts depends on the other two in Kconfig. 
This can lead to cpts being a loadable module and its callers built-in: drivers/net/ethernet/ti/cpsw.o: In function `cpsw_remove': cpsw.c:(.text.cpsw_remove+0xd0): undefined reference to `cpts_release' drivers/net/ethernet/ti/cpsw.o: In function `cpsw_rx_handler': cpsw.c:(.text.cpsw_rx_handler+0x2dc): undefined reference to `cpts_rx_timestamp' drivers/net/ethernet/ti/cpsw.o: In function `cpsw_tx_handler': cpsw.c:(.text.cpsw_tx_handler+0x7c): undefined reference to `cpts_tx_timestamp' drivers/net/ethernet/ti/cpsw.o: In function `cpsw_ndo_stop': As a workaround, I'm introducing another Kconfig symbol to control the compilation of cpts, while making the actual module controlled by a silent symbol that is =y when necessary. Fixes: 6246168b4a38 ("net: ethernet: ti: netcp: add support of cpts") Signed-off-by: Arnd Bergmann Reviewed-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/Kconfig | 8 +++++++- drivers/net/ethernet/ti/Makefile | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index 296c8efd0038..d923890a9fda 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -74,7 +74,7 @@ config TI_CPSW will be called cpsw. config TI_CPTS - tristate "TI Common Platform Time Sync (CPTS) Support" + bool "TI Common Platform Time Sync (CPTS) Support" depends on TI_CPSW || TI_KEYSTONE_NETCP imply PTP_1588_CLOCK ---help--- @@ -83,6 +83,12 @@ config TI_CPTS The unit can time stamp PTP UDP/IPv4 and Layer 2 packets, and the driver offers a PTP Hardware Clock. +config TI_CPTS_MOD + tristate + depends on TI_CPTS + default y if TI_CPSW=y || TI_KEYSTONE_NETCP=y + default m + config TI_KEYSTONE_NETCP tristate "TI Keystone NETCP Core Support" select TI_CPSW_ALE diff --git a/drivers/net/ethernet/ti/Makefile b/drivers/net/ethernet/ti/Makefile index 1e7c10bf8713..10e6b0ce51ba 100644 --- a/drivers/net/ethernet/ti/Makefile +++ b/drivers/net/ethernet/ti/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_TI_DAVINCI_MDIO) += davinci_mdio.o obj-$(CONFIG_TI_DAVINCI_CPDMA) += davinci_cpdma.o obj-$(CONFIG_TI_CPSW_PHY_SEL) += cpsw-phy-sel.o obj-$(CONFIG_TI_CPSW_ALE) += cpsw_ale.o -obj-$(CONFIG_TI_CPTS) += cpts.o +obj-$(CONFIG_TI_CPTS_MOD) += cpts.o obj-$(CONFIG_TI_CPSW) += ti_cpsw.o ti_cpsw-y := cpsw.o From 07fef3623407444e51c12ea57cd91df38c1069e0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Mar 2017 09:58:33 +0100 Subject: [PATCH 270/297] cpsw/netcp: cpts depends on posix_timers With posix timers having become optional, we get a build error with the cpts time sync option of the CPSW driver: drivers/net/ethernet/ti/cpts.c: In function 'cpts_find_ts': drivers/net/ethernet/ti/cpts.c:291:23: error: implicit declaration of function 'ptp_classify_raw'; did you mean 'ptp_classifier_init'? [-Werror=implicit-function-declaration] This adds a hard dependency on PTP_1588_CLOCK to avoid the problem, as building it without PTP support makes no sense anyway. Fixes: baa73d9e478f ("posix-timers: Make them configurable") Cc: Nicolas Pitre Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann Acked-by: Nicolas Pitre Signed-off-by: David S.
Miller --- drivers/net/ethernet/ti/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index d923890a9fda..9e631952b86f 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -76,7 +76,7 @@ config TI_CPSW config TI_CPTS bool "TI Common Platform Time Sync (CPTS) Support" depends on TI_CPSW || TI_KEYSTONE_NETCP - imply PTP_1588_CLOCK + depends on PTP_1588_CLOCK ---help--- This driver supports the Common Platform Time Sync unit of the CPSW Ethernet Switch and Keystone 2 1g/10g Switch Subsystem. From 1511949c61ec63e4b646c34d602ac6990b38ce30 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 20 Mar 2017 17:46:27 +0800 Subject: [PATCH 271/297] sctp: declare struct sctp_stream before using it sctp_stream_free uses struct sctp_stream as a param, but struct sctp_stream is defined after its declaration. This patch is to declare struct sctp_stream before sctp_stream_free. Fixes: a83863174a61 ("sctp: prepare asoc stream for stream reconf") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 4f645198e9bd..592decebac75 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -83,6 +83,7 @@ struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; +struct sctp_stream; #include From 581947787eaf1ad801959d00b42b9d0131aacb6a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 20 Mar 2017 18:00:28 +0800 Subject: [PATCH 272/297] sctp: remove useless err from sctp_association_init This patch is to remove the unnecessary temporary variable 'err' from sctp_association_init. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/associola.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 2a6835b4562b..0439a1a68367 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -71,9 +71,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a { struct net *net = sock_net(sk); struct sctp_sock *sp; - int i; sctp_paramhdr_t *p; - int err; + int i; /* Retrieve the SCTP per socket area. */ sp = sctp_sk((struct sock *)sk); @@ -264,8 +263,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* AUTH related initializations */ INIT_LIST_HEAD(&asoc->endpoint_shared_keys); - err = sctp_auth_asoc_copy_shkeys(ep, asoc, gfp); - if (err) + if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp)) goto fail_init; asoc->active_key_id = ep->active_key_id; From 557d054c01da0337ca81de9e9d9206d57245b57e Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 21 Mar 2017 10:47:49 +0100 Subject: [PATCH 273/297] tipc: fix nametbl deadlock at tipc_nametbl_unsubscribe Until now, tipc_nametbl_unsubscribe() is called during subscription reference count cleanup. Usually the subscription cleanup is called at subscription timeout, at subscription cancel, or at subscriber delete. We have ignored the possibility of this being called from other locations, which causes deadlock as we try to grab the tn->nametbl_lock while holding it already.
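The pattern is easy to reproduce in isolation. Below is a minimal, self-contained C sketch (illustrative names only, with an error-checking pthread mutex standing in for the spinlock; this is not the tipc code): the release callback re-acquires a lock its caller already holds, which on a kernel spinlock simply spins forever.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t nametbl_lock; /* stands in for tn->nametbl_lock */

/* Models tipc_nametbl_unsubscribe(), which takes the name table lock. */
static void unsubscribe(void)
{
	if (pthread_mutex_lock(&nametbl_lock) == EDEADLK) {
		/* A kernel spinlock has no error return; the CPU would
		 * spin here forever instead. */
		printf("recursive acquisition detected (EDEADLK)\n");
		return;
	}
	pthread_mutex_unlock(&nametbl_lock);
}

/* Models the refcount release path, which runs with the lock held. */
static void kref_release(void)
{
	pthread_mutex_lock(&nametbl_lock);
	unsubscribe(); /* second acquisition of the same lock */
	pthread_mutex_unlock(&nametbl_lock);
}

int main(void)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
	pthread_mutex_init(&nametbl_lock, &attr);
	kref_release();
	return 0;
}

The two interleavings reported against the kernel: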
CPU1: CPU2: ---------- ---------------- tipc_nametbl_publish spin_lock_bh(&tn->nametbl_lock) tipc_nametbl_insert_publ tipc_nameseq_insert_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <<deadlock>> CPU1: CPU2: ---------- ---------------- tipc_nametbl_stop spin_lock_bh(&tn->nametbl_lock) tipc_purge_publications tipc_nameseq_remove_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <<deadlock>> In this commit, we advance the calling of tipc_nametbl_unsubscribe() from the refcount cleanup to the intended callers. Fixes: d094c4d5f5c7 ("tipc: add subscription refcount to avoid invalid delete") Reported-by: John Thompson Acked-by: Jon Maloy Signed-off-by: Ying Xue Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/subscr.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 9d94e65d0894..271cd66e4b3b 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -141,6 +141,11 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, static void tipc_subscrp_timeout(unsigned long data) { struct tipc_subscription *sub = (struct tipc_subscription *)data; + struct tipc_subscriber *subscriber = sub->subscriber; + + spin_lock_bh(&subscriber->lock); + tipc_nametbl_unsubscribe(sub); + spin_unlock_bh(&subscriber->lock); /* Notify subscriber of timeout */ tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, @@ -173,7 +178,6 @@ static void tipc_subscrp_kref_release(struct kref *kref) struct tipc_subscriber *subscriber = sub->subscriber; spin_lock_bh(&subscriber->lock); - tipc_nametbl_unsubscribe(sub); list_del(&sub->subscrp_list); atomic_dec(&tn->subscription_count); spin_unlock_bh(&subscriber->lock); @@ -205,6 +209,7 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) continue; + tipc_nametbl_unsubscribe(sub); tipc_subscrp_get(sub); spin_unlock_bh(&subscriber->lock); tipc_subscrp_delete(sub); From 1f30a86c58093046dc3e49c23d2618894e098f7a Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 21 Mar 2017 15:59:12 +0200 Subject: [PATCH 274/297] net/mlx5: Add missing entries for set/query rate limit commands The switch cases for the rate limit set and query commands were missing, which could lead to wrong behavior under FW error or driver reset flows; fix that. Fixes: 1466cc5b23d1 ('net/mlx5: Rate limit tables support') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index caa837e5e2b9..a380353a78c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -361,6 +361,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_QUERY_VPORT_COUNTER: case MLX5_CMD_OP_ALLOC_Q_COUNTER: case MLX5_CMD_OP_QUERY_Q_COUNTER: + case MLX5_CMD_OP_SET_RATE_LIMIT: + case MLX5_CMD_OP_QUERY_RATE_LIMIT: case MLX5_CMD_OP_ALLOC_PD: case MLX5_CMD_OP_ALLOC_UAR: case MLX5_CMD_OP_CONFIG_INT_MODERATION: @@ -497,6 +499,8 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER); MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER); MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER); + MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT); MLX5_COMMAND_STR_CASE(ALLOC_PD); MLX5_COMMAND_STR_CASE(DEALLOC_PD); MLX5_COMMAND_STR_CASE(ALLOC_UAR); From d85cdccbb3fe9a632ec9d0f4e4526c8c84fc3523 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 21 Mar 2017 15:59:13 +0200 Subject: [PATCH 275/297] net/mlx5e: Change the TC offload rule add/del code path to be per NIC or E-Switch Refactor the code to deal with add/del TC rules to have a handler per NIC/E-Switch offloading use case, and push the latter into the e-switch code. This provides better separation and is to be used in a downstream patch for applying a fix. Fixes: bffaa916588e ("net/mlx5: E-Switch, Add control for inline mode") Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 59 ++++++++++++------- .../net/ethernet/mellanox/mlx5/core/eswitch.h | 5 ++ .../mellanox/mlx5/core/eswitch_offloads.c | 14 +++++ 3 files changed, 58 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 79481f4cf264..2825b5665456 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -133,6 +133,23 @@ err_create_ft: return rule; } +static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_fc *counter = NULL; + + if (!IS_ERR(flow->rule)) { + counter = mlx5_flow_rule_counter(flow->rule); + mlx5_del_flow_rules(flow->rule); + mlx5_fc_destroy(priv->mdev, counter); + } + + if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { + mlx5_destroy_flow_table(priv->fs.tc.t); + priv->fs.tc.t = NULL; + } +} + static struct mlx5_flow_handle * mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec, @@ -149,7 +166,24 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, } static void mlx5e_detach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow) { + struct mlx5e_tc_flow *flow); + +static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + mlx5_eswitch_del_offloaded_rule(esw, flow->rule, flow->attr); + + mlx5_eswitch_del_vlan_action(esw, flow->attr); + + if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) + mlx5e_detach_encap(priv, flow); +} + +static void mlx5e_detach_encap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ struct list_head *next = flow->encap.next; list_del(&flow->encap); @@ -173,25 +207,10 @@ static void
mlx5e_detach_encap(struct mlx5e_priv *priv, static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) { - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5_fc *counter = NULL; - - if (!IS_ERR(flow->rule)) { - counter = mlx5_flow_rule_counter(flow->rule); - mlx5_del_flow_rules(flow->rule); - mlx5_fc_destroy(priv->mdev, counter); - } - - if (flow->flags & MLX5E_TC_FLOW_ESWITCH) { - mlx5_eswitch_del_vlan_action(esw, flow->attr); - if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) - mlx5e_detach_encap(priv, flow); - } - - if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { - mlx5_destroy_flow_table(priv->fs.tc.t); - priv->fs.tc.t = NULL; - } + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) + mlx5e_tc_del_fdb_flow(priv, flow); + else + mlx5e_tc_del_nic_flow(priv, flow); } static void parse_vxlan_attr(struct mlx5_flow_spec *spec, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 5b78883d5654..9227a83a97e3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -271,6 +271,11 @@ struct mlx5_flow_handle * mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, struct mlx5_flow_spec *spec, struct mlx5_esw_flow_attr *attr); +void +mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_esw_flow_attr *attr); + struct mlx5_flow_handle * mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 4f5b0d47d5f3..bfabefe20ac0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -97,6 +97,20 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, return rule; } +void +mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_esw_flow_attr *attr) +{ + struct mlx5_fc *counter = NULL; + + if (!IS_ERR(rule)) { + counter = mlx5_flow_rule_counter(rule); + mlx5_del_flow_rules(rule); + mlx5_fc_destroy(esw->dev, counter); + } +} + static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val) { struct mlx5_eswitch_rep *rep; From 375f51e2b5b7b9a42b3139aea519cbb1bfc5d6ef Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 21 Mar 2017 15:59:14 +0200 Subject: [PATCH 276/297] net/mlx5: E-Switch, Don't allow changing inline mode when flows are configured Changing the eswitch inline mode can potentially cause already configured flows not to match the policy. E.g. set policy L4, add some L4 rules, set policy to L2 --> bad! Hence we disallow it. Keep track of how many offloaded rules are now set and refuse inline mode changes if this isn't zero. Fixes: bffaa916588e ("net/mlx5: E-Switch, Add control for inline mode") Signed-off-by: Roi Dayan Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 1 + .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 9227a83a97e3..ad329b1680b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -209,6 +209,7 @@ struct mlx5_esw_offload { struct mlx5_eswitch_rep *vport_reps; DECLARE_HASHTABLE(encap_tbl, 8); u8 inline_mode; + u64 num_flows; }; struct mlx5_eswitch { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index bfabefe20ac0..307ec6c5fd3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -93,6 +93,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, spec, &flow_act, dest, i); if (IS_ERR(rule)) mlx5_fc_destroy(esw->dev, counter); + else + esw->offloads.num_flows++; return rule; } @@ -108,6 +110,7 @@ mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, counter = mlx5_flow_rule_counter(rule); mlx5_del_flow_rules(rule); mlx5_fc_destroy(esw->dev, counter); + esw->offloads.num_flows--; } } @@ -922,6 +925,11 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode) MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) return -EOPNOTSUPP; + if (esw->offloads.num_flows > 0) { + esw_warn(dev, "Can't set inline mode when flows are configured\n"); + return -EOPNOTSUPP; + } + err = esw_inline_mode_from_devlink(mode, &mlx5_mode); if (err) goto out; From 09c91ddf2cd33489c2c14edfef43ae38d412888e Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 21 Mar 2017 15:59:15 +0200 Subject: [PATCH 277/297] net/mlx5e: Use the proper UAPI values when offloading TC vlan actions Currently we use the non-UAPI values, and we fail to error out on the modify action, which is not supported; fix that. Fixes: 8b32580df1cb ('net/mlx5e: Add TC vlan action for SRIOV offloads') Signed-off-by: Or Gerlitz Reported-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 2825b5665456..9c13abaf3885 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1131,14 +1131,16 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, } if (is_tcf_vlan(a)) { - if (tcf_vlan_action(a) == VLAN_F_POP) { + if (tcf_vlan_action(a) == TCA_VLAN_ACT_POP) { attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; - } else if (tcf_vlan_action(a) == VLAN_F_PUSH) { + } else if (tcf_vlan_action(a) == TCA_VLAN_ACT_PUSH) { if (tcf_vlan_push_proto(a) != htons(ETH_P_8021Q)) return -EOPNOTSUPP; attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; attr->vlan = tcf_vlan_push_vid(a); + } else { /* action is TCA_VLAN_ACT_MODIFY */ + return -EOPNOTSUPP; } continue; } From 1ad9a00ae0efc2e9337148d6c382fad3d27bf99a Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 21 Mar 2017 15:59:16 +0200 Subject: [PATCH 278/297] net/mlx5e: Avoid supporting udp tunnel port ndo for VF reps This was added to allow the TC offloading code to identify offloading encap/decap vxlan rules.
The VF reps are effectively related to the same mlx5 PCI device as the PF. Since the kernel invokes the (say) delete ndo for each netdev, the FW erred on multiple vxlan dst port deletes when the port was deleted from the system. We fix that by keeping the registration to be carried out only by the PF. Since the PF serves as the uplink device, the VF reps will look up a port there and realize if they are ok to offload that. Tested: ip link add vxlan1 type vxlan id 44 dev ens5f0 dstport 9999 ip link set vxlan1 up ip link del dev vxlan1 Fixes: 4a25730eb202 ('net/mlx5e: Add ndo_udp_tunnel_add to VF representors') Signed-off-by: Paul Blakey Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 ---- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 -- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 9 +++++++-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index f6a6ded204f6..dc52053128bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -928,10 +928,6 @@ void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv); int mlx5e_attach_netdev(struct mlx5_core_dev *mdev, struct net_device *netdev); void mlx5e_detach_netdev(struct mlx5_core_dev *mdev, struct net_device *netdev); u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout); -void mlx5e_add_vxlan_port(struct net_device *netdev, - struct udp_tunnel_info *ti); -void mlx5e_del_vxlan_port(struct net_device *netdev, - struct udp_tunnel_info *ti); int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, void *sp); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 8ef64c4db2c2..66c133757a5e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3100,8 +3100,8 @@ static int mlx5e_get_vf_stats(struct net_device *dev, vf_stats); } -void mlx5e_add_vxlan_port(struct net_device *netdev, - struct udp_tunnel_info *ti) +static void mlx5e_add_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -3114,8 +3114,8 @@ void mlx5e_add_vxlan_port(struct net_device *netdev, mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 1); } -void mlx5e_del_vxlan_port(struct net_device *netdev, - struct udp_tunnel_info *ti) +static void mlx5e_del_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) { struct mlx5e_priv *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 2c864574a9d5..f621373bd7a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -393,8 +393,6 @@ static const struct net_device_ops mlx5e_netdev_ops_rep = { .ndo_get_phys_port_name = mlx5e_rep_get_phys_port_name, .ndo_setup_tc = mlx5e_rep_ndo_setup_tc, .ndo_get_stats64 = mlx5e_rep_get_stats, - .ndo_udp_tunnel_add = mlx5e_add_vxlan_port, - .ndo_udp_tunnel_del = mlx5e_del_vxlan_port, .ndo_has_offload_stats = mlx5e_has_offload_stats, .ndo_get_offload_stats = mlx5e_get_offload_stats, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9c13abaf3885..fade7233dac5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -267,12 +267,15 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, skb_flow_dissector_target(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, f->mask); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct net_device *up_dev = mlx5_eswitch_get_uplink_netdev(esw); + struct mlx5e_priv *up_priv = netdev_priv(up_dev); /* Full udp dst port must be given */ if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) goto vxlan_match_offload_err; - if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->dst)) && + if (mlx5e_vxlan_lookup_port(up_priv, be16_to_cpu(key->dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) parse_vxlan_attr(spec, f); else { @@ -995,6 +998,8 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, struct mlx5_esw_flow_attr *attr) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct net_device *up_dev = mlx5_eswitch_get_uplink_netdev(esw); + struct mlx5e_priv *up_priv = netdev_priv(up_dev); unsigned short family = ip_tunnel_info_af(tun_info); struct ip_tunnel_key *key = &tun_info->key; struct mlx5_encap_entry *e; @@ -1015,7 +1020,7 @@ vxlan_encap_offload_err: return -EOPNOTSUPP; } - if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) && + if (mlx5e_vxlan_lookup_port(up_priv, be16_to_cpu(key->tp_dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { tunnel_type = MLX5_HEADER_TYPE_VXLAN; } else { From 5f40b4ed975c26016cf41953b7510fe90718e21c Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 21 Mar 2017 15:59:17 +0200 Subject: [PATCH 279/297] net/mlx5: Increase number of max QPs in default profile With ConnectX-4 sharing SRQs from the same space as QPs, we hit a limit preventing some applications from allocating the needed amount of QPs. Double the size to 256K. Fixes: e126ba97dba9e ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e2bd600d19de..60154a175bd3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -87,7 +87,7 @@ static struct mlx5_profile profile[] = { [2] = { .mask = MLX5_PROF_MASK_QP_SIZE | MLX5_PROF_MASK_MR_CACHE, - .log_max_qp = 17, + .log_max_qp = 18, .mr_cache[0] = { .size = 500, .limit = 250 From d3a4e4da54c7adb420d5f48e89be913b14bdeff1 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 21 Mar 2017 15:59:18 +0200 Subject: [PATCH 280/297] net/mlx5e: Count GSO packets correctly TX packets statistics ('tx_packets' counter) used to count GSO packets as one, even though they contain multiple segments. This patch will increment the counter by the number of segments, and align the driver with the behavior of other drivers in the stack. Note that no information is lost in this patch due to the existence of the 'tx_tso_packets' counter.
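In outline, the change credits one packet per GSO segment instead of one per skb. A minimal C sketch of the accounting idea (simplified stats structure and hypothetical names; not the driver code):

#include <stdio.h>

struct stats_model {
	unsigned long packets;
	unsigned long bytes;
};

/* Credit the counters for one transmitted skb: gso_segs is 1 for a
 * plain frame and greater than 1 for a GSO super-packet, so counting
 * segments keeps tx_packets consistent with what reaches the wire. */
static void count_xmit(struct stats_model *st, unsigned long gso_segs,
		       unsigned long len)
{
	st->packets += gso_segs ? gso_segs : 1;
	st->bytes += len;
}

int main(void)
{
	struct stats_model st = { 0, 0 };

	count_xmit(&st, 1, 1500);   /* ordinary frame */
	count_xmit(&st, 44, 65000); /* one GSO skb carrying 44 segments */
	printf("packets=%lu bytes=%lu\n", st.packets, st.bytes);
	return 0;
}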
Before, ethtool showed: $ ethtool -S ens6 | egrep "tx_packets|tx_tso_packets" tx_packets: 61340 tx_tso_packets: 60954 tx_packets_phy: 2451115 Now, we will see the more logical statistics: $ ethtool -S ens6 | egrep "tx_packets|tx_tso_packets" tx_packets: 2451115 tx_tso_packets: 60954 tx_packets_phy: 2451115 Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files") Signed-off-by: Gal Pressman Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index f193128bac4b..57f5e2d7ebd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -274,15 +274,18 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) sq->stats.tso_bytes += skb->len - ihs; } + sq->stats.packets += skb_shinfo(skb)->gso_segs; num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; } else { bf = sq->bf_budget && !skb->xmit_more && !skb_shinfo(skb)->nr_frags; ihs = mlx5e_get_inline_hdr_size(sq, skb, bf); + sq->stats.packets++; num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); } + sq->stats.bytes += num_bytes; wi->num_bytes = num_bytes; ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; @@ -381,8 +384,6 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) if (bf) sq->bf_budget--; - sq->stats.packets++; - sq->stats.bytes += num_bytes; return NETDEV_TX_OK; dma_unmap_wqe_err: From 8ab7e2ae15d84ba758b2c8c6f4075722e9bd2a08 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 21 Mar 2017 15:59:19 +0200 Subject: [PATCH 281/297] net/mlx5e: Count LRO packets correctly RX packets statistics ('rx_packets' counter) used to count LRO packets as one, even though they contain multiple segments. This patch will increment the counter by the number of segments, and align the driver with the behavior of other drivers in the stack. Note that no information is lost in this patch due to the existence of the 'rx_lro_packets' counter. Before, ethtool showed: $ ethtool -S ens6 | egrep "rx_packets|rx_lro_packets" rx_packets: 435277 rx_lro_packets: 35847 rx_packets_phy: 1935066 Now, we will see the more logical statistics: $ ethtool -S ens6 | egrep "rx_packets|rx_lro_packets" rx_packets: 1935066 rx_lro_packets: 35847 rx_packets_phy: 1935066 Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files") Signed-off-by: Gal Pressman Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 3d371688fbbb..bafcb349a50c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -601,6 +601,10 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, if (lro_num_seg > 1) { mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt); skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg); + /* Subtract one since we already counted this as one + * "regular" packet in mlx5e_complete_rx_cqe() + */ + rq->stats.packets += lro_num_seg - 1; rq->stats.lro_packets++; rq->stats.lro_bytes += cqe_bcnt; } From ac23d3cac1f339febd95c403a245ae072dfd0e84 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 21 Mar 2017 11:44:25 -0400 Subject: [PATCH 282/297] fjes: Do not load fjes driver if system does not have extended socket device. The fjes driver is used only by FUJITSU servers, and almost all servers in the world never use it. But currently if ACPI PNP0C02 is defined in the ACPI table, the following message is always shown: "FUJITSU Extended Socket Network Device Driver - version 1.2 - Copyright (c) 2015 FUJITSU LIMITED" The message confuses users because there is no reason for it to be shown on other vendors' servers. To avoid the confusion, the patch adds a check for whether the server has an extended socket device. Signed-off-by: Yasuaki Ishimatsu CC: Taku Izumi Signed-off-by: David S. Miller --- drivers/net/fjes/fjes_main.c | 52 ++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index c4b3c4b77a9c..7b589649ab46 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -45,6 +45,8 @@ MODULE_DESCRIPTION("FUJITSU Extended Socket Network Device Driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); +#define ACPI_MOTHERBOARD_RESOURCE_HID "PNP0C02" + static int fjes_request_irq(struct fjes_adapter *); static void fjes_free_irq(struct fjes_adapter *); @@ -78,7 +80,7 @@ static void fjes_rx_irq(struct fjes_adapter *, int); static int fjes_poll(struct napi_struct *, int); static const struct acpi_device_id fjes_acpi_ids[] = { - {"PNP0C02", 0}, + {ACPI_MOTHERBOARD_RESOURCE_HID, 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, fjes_acpi_ids); @@ -115,18 +117,17 @@ static struct resource fjes_resource[] = { }, }; -static int fjes_acpi_add(struct acpi_device *device) +static bool is_extended_socket_device(struct acpi_device *device) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL}; char str_buf[sizeof(FJES_ACPI_SYMBOL) + 1]; - struct platform_device *plat_dev; union acpi_object *str; acpi_status status; int result; status = acpi_evaluate_object(device->handle, "_STR", NULL, &buffer); if (ACPI_FAILURE(status)) - return -ENODEV; + return false; str = buffer.pointer; result = utf16s_to_utf8s((wchar_t *)str->string.pointer, @@ -136,10 +137,21 @@ static int fjes_acpi_add(struct acpi_device *device) if (strncmp(FJES_ACPI_SYMBOL, str_buf, strlen(FJES_ACPI_SYMBOL)) != 0) { kfree(buffer.pointer); - return -ENODEV; + return false; } kfree(buffer.pointer); + return true; +} + +static int fjes_acpi_add(struct acpi_device *device) +{ + struct platform_device *plat_dev; + acpi_status status; + + if (!is_extended_socket_device(device)) + return -ENODEV; + + status =
acpi_walk_resources(device->handle, METHOD_NAME__CRS, fjes_get_acpi_resource, fjes_resource); if (ACPI_FAILURE(status)) @@ -1473,11 +1485,41 @@ static void fjes_watch_unshare_task(struct work_struct *work) } } +static acpi_status +acpi_find_extended_socket_device(acpi_handle obj_handle, u32 level, + void *context, void **return_value) +{ + struct acpi_device *device; + bool *found = context; + int result; + + result = acpi_bus_get_device(obj_handle, &device); + if (result) + return AE_OK; + + if (strcmp(acpi_device_hid(device), ACPI_MOTHERBOARD_RESOURCE_HID)) + return AE_OK; + + if (!is_extended_socket_device(device)) + return AE_OK; + + *found = true; + return AE_CTRL_TERMINATE; +} + /* fjes_init_module - Driver Registration Routine */ static int __init fjes_init_module(void) { + bool found = false; int result; + acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, + acpi_find_extended_socket_device, NULL, &found, + NULL); + + if (!found) + return -ENODEV; + pr_info("%s - version %s - %s\n", fjes_driver_string, fjes_driver_version, fjes_copyright); From 2b396d302650f1ebb770ed758ddcf5a64328ffd5 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 21 Mar 2017 11:46:35 -0400 Subject: [PATCH 283/297] fjes: Do not load fjes driver if extended socket device is not powered on. The extended socket device cannot be turned on or off while the system is running. So when the system boots up and the device is not powered on, the fjes driver does not need to be loaded. To check the status of the device, the patch adds an ACPI _STA method check. Signed-off-by: Yasuaki Ishimatsu CC: Taku Izumi Signed-off-by: David S. Miller --- drivers/net/fjes/fjes_main.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index 7b589649ab46..ae48c809bac9 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -144,6 +144,24 @@ static bool is_extended_socket_device(struct acpi_device *device) return true; } +static int acpi_check_extended_socket_status(struct acpi_device *device) +{ + unsigned long long sta; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "_STA", NULL, &sta); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!((sta & ACPI_STA_DEVICE_PRESENT) && + (sta & ACPI_STA_DEVICE_ENABLED) && + (sta & ACPI_STA_DEVICE_UI) && + (sta & ACPI_STA_DEVICE_FUNCTIONING))) + return -ENODEV; + + return 0; +} + static int fjes_acpi_add(struct acpi_device *device) { struct platform_device *plat_dev; @@ -152,6 +170,9 @@ static int fjes_acpi_add(struct acpi_device *device) if (!is_extended_socket_device(device)) return -ENODEV; + if (acpi_check_extended_socket_status(device)) + return -ENODEV; + status = acpi_walk_resources(device->handle, METHOD_NAME__CRS, fjes_get_acpi_resource, fjes_resource); if (ACPI_FAILURE(status)) @@ -1503,6 +1524,9 @@ acpi_find_extended_socket_device(acpi_handle obj_handle, u32 level, if (!is_extended_socket_device(device)) return AE_OK; + if (acpi_check_extended_socket_status(device)) + return AE_OK; + *found = true; return AE_CTRL_TERMINATE; } From d515684d78148884d5fc425ba904c50f03844020 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 21 Mar 2017 17:14:27 +0100 Subject: [PATCH 284/297] ipv6: make sure to initialize sockc.tsflags before first use In the case where udp_sk(sk)->pending is AF_INET6, udpv6_sendmsg() would jump to do_append_data, skipping the initialization of sockc.tsflags. Fix the problem by moving the sockc.tsflags initialization earlier.
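The shape of this bug is worth a standalone illustration: a forward goto that jumps over an initialization leaves the variable holding whatever happens to be on the stack along that path. A minimal sketch of the pattern follows; all names are hypothetical stand-ins (the real paths are inside udpv6_sendmsg()).

/* Sketch only: the skipped-initialization pattern that KMSAN flagged. */
#include <stdio.h>

struct sockcm { unsigned int tsflags; };

/* Buggy shape: the goto can jump over the initialization. */
static unsigned int send_buggy(int resume_pending)
{
    struct sockcm sockc;            /* not yet initialized */

    if (resume_pending)
        goto do_append_data;        /* skips the assignment below */
    sockc.tsflags = 0;
do_append_data:
    return sockc.tsflags;           /* garbage when resume_pending != 0 */
}

/* Fixed shape: initialize before any branch can skip it. */
static unsigned int send_fixed(int resume_pending)
{
    struct sockcm sockc = { .tsflags = 0 };

    if (resume_pending)
        goto do_append_data;
do_append_data:
    return sockc.tsflags;           /* always defined */
}

int main(void)
{
    printf("buggy, normal path: %u\n", send_buggy(0));
    printf("fixed, resume path: %u\n", send_fixed(1));
    return 0;
}

Moving the assignment above the branch, as the patch does, makes every path through the function see an initialized value.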
The bug was detected with KMSAN. Fixes: c14ac9451c34 ("sock: enable timestamping using control messages") Signed-off-by: Alexander Potapenko Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4e4c401e3bc6..e28082f0a307 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1035,6 +1035,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.hlimit = -1; ipc6.tclass = -1; ipc6.dontfrag = -1; + sockc.tsflags = sk->sk_tsflags; /* destination address check */ if (sin6) { @@ -1159,7 +1160,6 @@ do_udp_sendmsg: fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { opt = &opt_space; From 31739eae738ccbe8b9d627c3f2251017ca03f4d2 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Tue, 21 Mar 2017 14:01:06 -0700 Subject: [PATCH 285/297] net: bcmgenet: remove bcmgenet_internal_phy_setup() Commit 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset") removed the bcmgenet_mii_reset() function from bcmgenet_power_up() and bcmgenet_internal_phy_setup() functions. In so doing it broke the reset of the internal PHY devices used by the GENETv1-GENETv3 which required this reset before the UniMAC was enabled. It also broke the internal GPHY devices used by the GENETv4 because the config_init that installed the AFE workaround was no longer occurring after the reset of the GPHY performed by bcmgenet_phy_power_set() in bcmgenet_internal_phy_setup(). In addition the code in bcmgenet_internal_phy_setup() related to the "enable APD" comment goes with the bcmgenet_mii_reset() so it should have also been removed. Commit bd4060a6108b ("net: bcmgenet: Power on integrated GPHY in bcmgenet_power_up()") moved the bcmgenet_phy_power_set() call to the bcmgenet_power_up() function, but failed to remove it from the bcmgenet_internal_phy_setup() function. Had it done so, the bcmgenet_internal_phy_setup() function would have been empty and could have been removed at that time. Commit 5dbebbb44a6a ("net: bcmgenet: Software reset EPHY after power on") was submitted to correct the functional problems introduced by commit 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset"). It was included in v4.4 and made available on 4.3-stable. Unfortunately, it didn't fully revert the commit because this bcmgenet_mii_reset() doesn't apply the soft reset to the internal GPHY used by GENETv4 like the previous one did. This prevents the restoration of the AFE workarounds for internal GPHY devices after the bcmgenet_phy_power_set() in bcmgenet_internal_phy_setup(). This commit takes the alternate approach of removing the unnecessary bcmgenet_internal_phy_setup() function which shouldn't have been in v4.3 so that when bcmgenet_mii_reset() was restored it should have only gone into bcmgenet_power_up(). This will avoid the problems while also removing the redundancy (and hopefully some of the confusion). Fixes: 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset") Signed-off-by: Doug Berger Reviewed-by: Florian Fainelli Signed-off-by: David S.
Miller --- drivers/net/ethernet/broadcom/genet/bcmmii.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index e87607621e62..2f9281936f0e 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -220,20 +220,6 @@ void bcmgenet_phy_power_set(struct net_device *dev, bool enable) udelay(60); } -static void bcmgenet_internal_phy_setup(struct net_device *dev) -{ - struct bcmgenet_priv *priv = netdev_priv(dev); - u32 reg; - - /* Power up PHY */ - bcmgenet_phy_power_set(dev, true); - /* enable APD */ - reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT); - reg |= EXT_PWR_DN_EN_LD; - bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT); - bcmgenet_mii_reset(dev); -} - static void bcmgenet_moca_phy_setup(struct bcmgenet_priv *priv) { u32 reg; @@ -281,7 +267,6 @@ int bcmgenet_mii_config(struct net_device *dev) if (priv->internal_phy) { phy_name = "internal PHY"; - bcmgenet_internal_phy_setup(dev); } else if (priv->phy_interface == PHY_INTERFACE_MODE_MOCA) { phy_name = "MoCA"; bcmgenet_moca_phy_setup(priv); From dd1ef79120e1600cb48320cf80a612ee6510110c Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Tue, 21 Mar 2017 15:07:48 -0700 Subject: [PATCH 286/297] enic: update enic maintainers update enic maintainers Signed-off-by: Govindarajulu Varadarajan Signed-off-by: David S. Miller --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 078c38217daa..c45c02bc6082 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3216,7 +3216,6 @@ F: drivers/platform/chrome/ CISCO VIC ETHERNET NIC DRIVER M: Christian Benvenuti -M: Sujith Sankar M: Govindarajulu Varadarajan <_govind@gmx.com> M: Neel Patel S: Supported From 8c290e60fa2a51806159522331c9ed41252a8fb3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 Mar 2017 19:05:04 -0700 Subject: [PATCH 287/297] bpf: fix hashmap extra_elems logic In both kmalloc and prealloc mode the bpf_map_update_elem() is using per-cpu extra_elems to do atomic update when the map is full. There are two issues with it. The logic can be misused, since it allows max_entries+num_cpus elements to be present in the map. And alloc_extra_elems() at map creation time can fail percpu alloc for large map values with a warn: WARNING: CPU: 3 PID: 2752 at ../mm/percpu.c:892 pcpu_alloc+0x119/0xa60 illegal size (32824) or align (8) for percpu allocation The fixes for both of these issues are different for kmalloc and prealloc modes. For prealloc mode allocate extra num_possible_cpus elements and store their pointers into extra_elems array instead of actual elements. Hence we can use these hidden(spare) elements not only when the map is full but during bpf_map_update_elem() that replaces existing element too. That also improves performance, since pcpu_freelist_pop/push is avoided. Unfortunately this approach cannot be used for kmalloc mode which needs to kfree elements after rcu grace period. Therefore switch it back to normal kmalloc even when full and old element exists like it was prior to commit 6c9059817432 ("bpf: pre-allocate hash map elements"). Add tests to check for over max_entries and large map values. Reported-by: Dave Jones Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- kernel/bpf/hashtab.c | 146 ++++++++++++------------ tools/testing/selftests/bpf/test_maps.c | 29 ++++- 2 files changed, 98 insertions(+), 77 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index afe5bab376c9..361a69dfe543 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -30,18 +30,12 @@ struct bpf_htab { struct pcpu_freelist freelist; struct bpf_lru lru; }; - void __percpu *extra_elems; + struct htab_elem *__percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; -enum extra_elem_state { - HTAB_NOT_AN_EXTRA_ELEM = 0, - HTAB_EXTRA_ELEM_FREE, - HTAB_EXTRA_ELEM_USED -}; - /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { @@ -56,7 +50,6 @@ struct htab_elem { }; union { struct rcu_head rcu; - enum extra_elem_state state; struct bpf_lru_node lru_node; }; u32 hash; @@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static bool htab_is_prealloc(const struct bpf_htab *htab) +{ + return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, static int prealloc_init(struct bpf_htab *htab) { + u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; - htab->elems = bpf_map_area_alloc(htab->elem_size * - htab->map.max_entries); + if (!htab_is_percpu(htab) && !htab_is_lru(htab)) + num_entries += num_possible_cpus(); + + htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries); if (!htab->elems) return -ENOMEM; if (!htab_is_percpu(htab)) goto skip_percpu_elems; - for (i = 0; i < htab->map.max_entries; i++) { + for (i = 0; i < num_entries; i++) { u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; @@ -166,11 +167,11 @@ skip_percpu_elems: if (htab_is_lru(htab)) bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node), - htab->elem_size, htab->map.max_entries); + htab->elem_size, num_entries); else pcpu_freelist_populate(&htab->freelist, htab->elems + offsetof(struct htab_elem, fnode), - htab->elem_size, htab->map.max_entries); + htab->elem_size, num_entries); return 0; @@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab) static int alloc_extra_elems(struct bpf_htab *htab) { - void __percpu *pptr; + struct htab_elem *__percpu *pptr, *l_new; + struct pcpu_freelist_node *l; int cpu; - pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); + pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, + GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; for_each_possible_cpu(cpu) { - ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = - HTAB_EXTRA_ELEM_FREE; + l = pcpu_freelist_pop(&htab->freelist); + /* pop will succeed, since prealloc_init() + * preallocated extra num_possible_cpus elements + */ + l_new = container_of(l, struct htab_elem, fnode); + *per_cpu_ptr(pptr, cpu) = l_new; } htab->extra_elems = pptr; return 0; @@ -342,25 +349,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. 
- */ - err = alloc_extra_elems(htab); - if (err) - goto free_buckets; - } - if (prealloc) { err = prealloc_init(htab); if (err) - goto free_extra_elems; + goto free_buckets; + + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ + err = alloc_extra_elems(htab); + if (err) + goto free_prealloc; + } } return &htab->map; -free_extra_elems: - free_percpu(htab->extra_elems); +free_prealloc: + prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); free_htab: @@ -575,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { - if (l->state == HTAB_EXTRA_ELEM_USED) { - l->state = HTAB_EXTRA_ELEM_FREE; - return; - } - - if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + if (htab_is_prealloc(htab)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -610,47 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - bool old_elem_exists) + struct htab_elem *old_elem) { u32 size = htab->map.value_size; - bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); - struct htab_elem *l_new; + bool prealloc = htab_is_prealloc(htab); + struct htab_elem *l_new, **pl_new; void __percpu *pptr; - int err = 0; if (prealloc) { - struct pcpu_freelist_node *l; - - l = pcpu_freelist_pop(&htab->freelist); - if (!l) - err = -E2BIG; - else - l_new = container_of(l, struct htab_elem, fnode); - } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - atomic_dec(&htab->count); - err = -E2BIG; + if (old_elem) { + /* if we're updating the existing element, + * use per-cpu extra elems to avoid freelist_pop/push + */ + pl_new = this_cpu_ptr(htab->extra_elems); + l_new = *pl_new; + *pl_new = old_elem; } else { - l_new = kmalloc(htab->elem_size, - GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return ERR_PTR(-ENOMEM); + struct pcpu_freelist_node *l; + + l = pcpu_freelist_pop(&htab->freelist); + if (!l) + return ERR_PTR(-E2BIG); + l_new = container_of(l, struct htab_elem, fnode); } - } - - if (err) { - if (!old_elem_exists) - return ERR_PTR(err); - - /* if we're updating the existing element and the hash table - * is full, use per-cpu extra elems - */ - l_new = this_cpu_ptr(htab->extra_elems); - if (l_new->state != HTAB_EXTRA_ELEM_FREE) - return ERR_PTR(-E2BIG); - l_new->state = HTAB_EXTRA_ELEM_USED; } else { - l_new->state = HTAB_NOT_AN_EXTRA_ELEM; + if (atomic_inc_return(&htab->count) > htab->map.max_entries) + if (!old_elem) { + /* when map is full and update() is replacing + * old element, it's ok to allocate, since + * old element will be freed immediately. 
+ * Otherwise return an error + */ + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); } memcpy(l_new->key, key, key_size); @@ -731,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, goto err; l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - !!l_old); + l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -744,7 +742,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); - free_htab_elem(htab, l_old); + if (!htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); } ret = 0; err: @@ -856,7 +855,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, false); + hash, true, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -1024,8 +1023,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { hlist_nulls_del_rcu(&l->hash_node); - if (l->state != HTAB_EXTRA_ELEM_USED) - htab_elem_free(htab, l); + htab_elem_free(htab, l); } } } @@ -1045,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) + if (!htab_is_prealloc(htab)) delete_all_elements(htab); else prealloc_destroy(htab); diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index cada17ac00b8..a0aa2009b0e0 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -80,8 +80,9 @@ static void test_hashmap(int task, void *data) assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0); key = 2; assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); - key = 1; - assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); + key = 3; + assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && + errno == E2BIG); /* Check that key = 0 doesn't exist. 
*/ key = 0; @@ -110,6 +111,24 @@ static void test_hashmap(int task, void *data) close(fd); } +static void test_hashmap_sizes(int task, void *data) +{ + int fd, i, j; + + for (i = 1; i <= 512; i <<= 1) + for (j = 1; j <= 1 << 18; j <<= 1) { + fd = bpf_create_map(BPF_MAP_TYPE_HASH, i, j, + 2, map_flags); + if (fd < 0) { + printf("Failed to create hashmap key=%d value=%d '%s'\n", + i, j, strerror(errno)); + exit(1); + } + close(fd); + usleep(10); /* give kernel time to destroy */ + } +} + static void test_hashmap_percpu(int task, void *data) { unsigned int nr_cpus = bpf_num_possible_cpus(); @@ -317,7 +336,10 @@ static void test_arraymap_percpu(int task, void *data) static void test_arraymap_percpu_many_keys(void) { unsigned int nr_cpus = bpf_num_possible_cpus(); - unsigned int nr_keys = 20000; + /* nr_keys is not too large otherwise the test stresses percpu + * allocator more than anything else + */ + unsigned int nr_keys = 2000; long values[nr_cpus]; int key, fd, i; @@ -419,6 +441,7 @@ static void test_map_stress(void) { run_parallel(100, test_hashmap, NULL); run_parallel(100, test_hashmap_percpu, NULL); + run_parallel(100, test_hashmap_sizes, NULL); run_parallel(100, test_arraymap, NULL); run_parallel(100, test_arraymap_percpu, NULL); From c64c0b3cac4c5b8cb093727d2c19743ea3965c0b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Mar 2017 19:22:28 -0700 Subject: [PATCH 288/297] ipv4: provide stronger user input validation in nl_fib_input() Alexander reported a KMSAN splat caused by reads of an uninitialized field (tb_id_in) from a user-provided struct fib_result_nl. It turns out nl_fib_input()'s sanity tests on user input are a bit wrong: the user can pretend nlh->nlmsg_len is big enough, but provide a too-small buffer at sendmsg() time. Reported-by: Alexander Potapenko Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 42bfd08109dd..8f2133ffc2ff 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1083,7 +1083,8 @@ static void nl_fib_input(struct sk_buff *skb) net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); - if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || + if (skb->len < nlmsg_total_size(sizeof(*frn)) || + skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return;
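The check strengthened above is a common netlink pitfall worth spelling out: nlh->nlmsg_len is attacker-controlled and may claim more payload than the buffer actually carries, so both the claimed length and the actual length must be validated against the expected structure size. A rough standalone model of the three conditions, with simplified hypothetical types rather than the kernel's:

/* Sketch only: models the length checks in nl_fib_input(). */
#include <stdbool.h>
#include <stddef.h>

struct fake_nlmsghdr { unsigned int nlmsg_len; };
struct fake_frn { unsigned int tb_id_in; unsigned int pad[6]; };

#define FAKE_NLMSG_HDRLEN sizeof(struct fake_nlmsghdr)

static bool frn_request_ok(size_t buf_len, const struct fake_nlmsghdr *nlh)
{
    /* The buffer must really hold a header plus a whole request... */
    if (buf_len < FAKE_NLMSG_HDRLEN + sizeof(struct fake_frn))
        return false;
    /* ...and at least as much as the header claims... */
    if (buf_len < nlh->nlmsg_len)
        return false;
    /* ...and the claimed length must itself cover the request. */
    if (nlh->nlmsg_len < FAKE_NLMSG_HDRLEN + sizeof(struct fake_frn))
        return false;
    return true;
}

int main(void)
{
    struct fake_nlmsghdr lying = { .nlmsg_len = 1024 };

    /* Claims 1024 bytes but only 16 arrived: must be rejected. */
    return frn_request_ok(16, &lying) ? 1 : 0;
}

The pre-patch code effectively skipped the first condition, so a short buffer with a large claimed nlmsg_len let the handler read past the received data.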
From a97e50cc4cb67e1e7bff56f6b41cda62ca832336 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 22 Mar 2017 13:08:08 +0100 Subject: [PATCH 289/297] socket, bpf: fix sk_filter use after free in sk_clone_lock In sk_clone_lock(), we create a new socket and inherit most of the parent's members via sock_copy() which memcpy()'s various sections. Now, in case the parent socket had a BPF socket filter attached, then newsk->sk_filter points to the same instance as the original sk->sk_filter. sk_filter_charge() is then called on the newsk->sk_filter to take a reference and should that fail due to hitting max optmem, we bail out and release the newsk instance. The issue is that commit 278571baca2a ("net: filter: simplify socket charging") wrongly combined the dismantle path with the failure path of xfrm_sk_clone_policy(). This means, even when charging failed, we call sk_free_unlock_clone() on the newsk, which then still points to the same sk_filter as the original sk. Thus, sk_free_unlock_clone() calls into __sk_destruct() eventually where it tests for a present sk_filter and calls sk_filter_uncharge() on it, which potentially lets sk_omem_alloc wrap around and releases the eBPF prog and sk_filter structure from the (still intact) parent. Fix it by making sure that when sk_filter_charge() failed, we reset newsk->sk_filter back to NULL before passing to sk_free_unlock_clone(), so that we don't mess with the parent's sk_filter. Only if xfrm_sk_clone_policy() fails do we reach the point where either the parent's filter was NULL (and as a result newsk's as well) or where we previously had a successful sk_filter_charge(); for that case, we do need sk_filter_uncharge() to release the previously taken reference on sk_filter. Fixes: 278571baca2a ("net: filter: simplify socket charging") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index acb0d4137499..2c4f574168fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1544,6 +1544,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) is_charged = sk_filter_charge(newsk, filter); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as otherwise we uncharge the parent's filter. + */ + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); sk_free_unlock_clone(newsk); newsk = NULL; goto out; From 1d2a6a5e4bf2921531071fcff8538623dce74efa Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 22 Mar 2017 16:08:33 +0100 Subject: [PATCH 290/297] genetlink: fix counting regression on ctrl_dumpfamily() Commit 2ae0f17df1cd ("genetlink: use idr to track families") replaced if (++n < fams_to_skip) continue; with: if (n++ < fams_to_skip) continue; This subtle change causes a retried ctrl_dumpfamily() call to omit the one family that failed ctrl_fill_info() on the previous call, because the cb->args[0] = n cookie also counts the family that failed ctrl_fill_info(). This patch fixes the problem and, to avoid confusion in the future, simply decreases the n counter when ctrl_fill_info() fails. The user-visible problem caused by this bug is a failure to get access to some genetlink family, e.g. nl80211. However, the problem is reproducible only if the number of registered genetlink families is big enough to cause a second call of ctrl_dumpfamily(). Cc: Xose Vazquez Perez Cc: Larry Finger Cc: Johannes Berg Fixes: 2ae0f17df1cd ("genetlink: use idr to track families") Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index fb6e10fdb217..92e0981f7404 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -783,8 +783,10 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - skb, CTRL_CMD_NEWFAMILY) < 0) + skb, CTRL_CMD_NEWFAMILY) < 0) { + n--; break; + } } cb->args[0] = n;
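The off-by-one fixed above is easiest to see in isolation: the resume cookie must count only entries actually written to the message, so an entry that fails to fit must not be counted, or the retry skips it. A standalone sketch of the dump loop, with hypothetical names and a fake "4 entries fit per call" limit:

/* Sketch only: models the resume-cookie accounting fixed above. */
#include <stdio.h>

#define N_FAMS 10

/* Pretend the message only has room for 4 entries per dump call. */
static int fill_info(int fam, int room_left)
{
    (void)fam;
    return room_left > 0 ? 0 : -1;
}

static int dump(int skip, int *cookie)
{
    int n = 0, room = 4, fam;

    for (fam = 0; fam < N_FAMS; fam++) {
        if (n++ < skip)
            continue;
        if (fill_info(fam, room--) < 0) {
            n--;    /* the fix: don't count the entry we failed to emit */
            break;
        }
        printf("emitted family %d\n", fam);
    }
    *cookie = n;
    return n < N_FAMS;  /* more to dump? */
}

int main(void)
{
    int cookie = 0;

    while (dump(cookie, &cookie))
        ;   /* each retry resumes at the first un-emitted family */
    return 0;
}

Without the n-- the cookie lands one past the entry that did not fit, so exactly one family per full message silently disappears from the dump.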
From 15bb7745e94a665caf42bfaabf0ce062845b533b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Mar 2017 08:10:21 -0700 Subject: [PATCH 291/297] tcp: initialize icsk_ack.lrcvtime at session start time icsk_ack.lrcvtime has a 0 value at socket creation time. tcpi_last_data_recv can have a bogus value if no payload is ever received. This patch initializes icsk_ack.lrcvtime for active sessions in tcp_finish_connect(), and for passive sessions in tcp_create_openreq_child(). Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_minisocks.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 39c393cc0fd3..c43119726a62 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5541,6 +5541,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) struct inet_connection_sock *icsk = inet_csk(sk); tcp_set_state(sk, TCP_ESTABLISHED); + icsk->icsk_ack.lrcvtime = tcp_time_stamp; if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -5759,7 +5760,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * to stand against the temptation 8) --ANK */ inet_csk_schedule_ack(sk); - icsk->icsk_ack.lrcvtime = tcp_time_stamp; tcp_enter_quickack_mode(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7e16243cdb58..65c0f3d13eca 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -460,6 +460,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_time_stamp; newtp->packets_out = 0; newtp->retrans_out = 0; From ec4fbd64751de18729eaa816ec69e4b504b5a7a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Mar 2017 08:57:15 -0700 Subject: [PATCH 292/297] inet: frag: release spinlock before calling icmp_send() Dmitry reported a lockdep splat [1] (a false positive) that we can fix by releasing the spinlock before calling icmp_send() from ip_expire(). This is a false positive because sending an ICMP message cannot possibly re-enter the IP frag engine. [1] [ INFO: possible circular locking dependency detected ] 4.10.0+ #29 Not tainted ------------------------------------------------------- modprobe/12392 is trying to acquire lock: (_xmit_ETHER#2){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (_xmit_ETHER#2){+.-...}, at: [] __netif_tx_lock include/linux/netdevice.h:3486 [inline] (_xmit_ETHER#2){+.-...}, at: [] sch_direct_xmit+0x282/0x6d0 net/sched/sch_generic.c:180 but task is already holding lock: (&(&q->lock)->rlock){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (&(&q->lock)->rlock){+.-...}, at: [] ip_expire+0x51/0x6c0 net/ipv4/ip_fragment.c:201 which lock already depends on the new lock.
the existing dependency chain (in reverse order) is: -> #1 (&(&q->lock)->rlock){+.-...}: validate_chain kernel/locking/lockdep.c:2267 [inline] __lock_acquire+0x2149/0x3430 kernel/locking/lockdep.c:3340 lock_acquire+0x2a1/0x630 kernel/locking/lockdep.c:3755 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] ip_defrag+0x3a2/0x4130 net/ipv4/ip_fragment.c:669 ip_check_defrag+0x4e3/0x8b0 net/ipv4/ip_fragment.c:713 packet_rcv_fanout+0x282/0x800 net/packet/af_packet.c:1459 deliver_skb net/core/dev.c:1834 [inline] dev_queue_xmit_nit+0x294/0xa90 net/core/dev.c:1890 xmit_one net/core/dev.c:2903 [inline] dev_hard_start_xmit+0x16b/0xab0 net/core/dev.c:2923 sch_direct_xmit+0x31f/0x6d0 net/sched/sch_generic.c:182 __dev_xmit_skb net/core/dev.c:3092 [inline] __dev_queue_xmit+0x13e5/0x1e60 net/core/dev.c:3358 dev_queue_xmit+0x17/0x20 net/core/dev.c:3423 neigh_resolve_output+0x6b9/0xb10 net/core/neighbour.c:1308 neigh_output include/net/neighbour.h:478 [inline] ip_finish_output2+0x8b8/0x15a0 net/ipv4/ip_output.c:228 ip_do_fragment+0x1d93/0x2720 net/ipv4/ip_output.c:672 ip_fragment.constprop.54+0x145/0x200 net/ipv4/ip_output.c:545 ip_finish_output+0x82d/0xe10 net/ipv4/ip_output.c:314 NF_HOOK_COND include/linux/netfilter.h:246 [inline] ip_output+0x1f0/0x7a0 net/ipv4/ip_output.c:404 dst_output include/net/dst.h:486 [inline] ip_local_out+0x95/0x170 net/ipv4/ip_output.c:124 ip_send_skb+0x3c/0xc0 net/ipv4/ip_output.c:1492 ip_push_pending_frames+0x64/0x80 net/ipv4/ip_output.c:1512 raw_sendmsg+0x26de/0x3a00 net/ipv4/raw.c:655 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:761 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 ___sys_sendmsg+0x4a3/0x9f0 net/socket.c:1985 __sys_sendmmsg+0x25c/0x750 net/socket.c:2075 SYSC_sendmmsg net/socket.c:2106 [inline] SyS_sendmmsg+0x35/0x60 net/socket.c:2101 do_syscall_64+0x2e8/0x930 arch/x86/entry/common.c:281 return_from_SYSCALL_64+0x0/0x7a -> #0 (_xmit_ETHER#2){+.-...}: check_prev_add kernel/locking/lockdep.c:1830 [inline] check_prevs_add+0xa8f/0x19f0 kernel/locking/lockdep.c:1940 validate_chain kernel/locking/lockdep.c:2267 [inline] __lock_acquire+0x2149/0x3430 kernel/locking/lockdep.c:3340 lock_acquire+0x2a1/0x630 kernel/locking/lockdep.c:3755 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] __netif_tx_lock include/linux/netdevice.h:3486 [inline] sch_direct_xmit+0x282/0x6d0 net/sched/sch_generic.c:180 __dev_xmit_skb net/core/dev.c:3092 [inline] __dev_queue_xmit+0x13e5/0x1e60 net/core/dev.c:3358 dev_queue_xmit+0x17/0x20 net/core/dev.c:3423 neigh_hh_output include/net/neighbour.h:468 [inline] neigh_output include/net/neighbour.h:476 [inline] ip_finish_output2+0xf6c/0x15a0 net/ipv4/ip_output.c:228 ip_finish_output+0xa29/0xe10 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:246 [inline] ip_output+0x1f0/0x7a0 net/ipv4/ip_output.c:404 dst_output include/net/dst.h:486 [inline] ip_local_out+0x95/0x170 net/ipv4/ip_output.c:124 ip_send_skb+0x3c/0xc0 net/ipv4/ip_output.c:1492 ip_push_pending_frames+0x64/0x80 net/ipv4/ip_output.c:1512 icmp_push_reply+0x372/0x4d0 net/ipv4/icmp.c:394 icmp_send+0x156c/0x1c80 net/ipv4/icmp.c:754 ip_expire+0x40e/0x6c0 net/ipv4/ip_fragment.c:239 call_timer_fn+0x241/0x820 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 [inline] __run_timers+0x960/0xcf0 
kernel/time/timer.c:1601 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614 __do_softirq+0x31f/0xbe7 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:657 [inline] smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:962 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:707 __read_once_size include/linux/compiler.h:254 [inline] atomic_read arch/x86/include/asm/atomic.h:26 [inline] rcu_dynticks_curr_cpu_in_eqs kernel/rcu/tree.c:350 [inline] __rcu_is_watching kernel/rcu/tree.c:1133 [inline] rcu_is_watching+0x83/0x110 kernel/rcu/tree.c:1147 rcu_read_lock_held+0x87/0xc0 kernel/rcu/update.c:293 radix_tree_deref_slot include/linux/radix-tree.h:238 [inline] filemap_map_pages+0x6d4/0x1570 mm/filemap.c:2335 do_fault_around mm/memory.c:3231 [inline] do_read_fault mm/memory.c:3265 [inline] do_fault+0xbd5/0x2080 mm/memory.c:3370 handle_pte_fault mm/memory.c:3600 [inline] __handle_mm_fault+0x1062/0x2cb0 mm/memory.c:3714 handle_mm_fault+0x1e2/0x480 mm/memory.c:3751 __do_page_fault+0x4f6/0xb60 arch/x86/mm/fault.c:1397 do_page_fault+0x54/0x70 arch/x86/mm/fault.c:1460 page_fault+0x28/0x30 arch/x86/entry/entry_64.S:1011 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(&q->lock)->rlock); lock(_xmit_ETHER#2); lock(&(&q->lock)->rlock); lock(_xmit_ETHER#2); *** DEADLOCK *** 10 locks held by modprobe/12392: #0: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x2b8/0xb60 arch/x86/mm/fault.c:1336 #1: (rcu_read_lock){......}, at: [] filemap_map_pages+0x1e6/0x1570 mm/filemap.c:2324 #2: (&(ptlock_ptr(page))->rlock#2){+.+...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] #2: (&(ptlock_ptr(page))->rlock#2){+.+...}, at: [] pte_alloc_one_map mm/memory.c:2944 [inline] #2: (&(ptlock_ptr(page))->rlock#2){+.+...}, at: [] alloc_set_pte+0x13b8/0x1b90 mm/memory.c:3072 #3: (((&q->timer))){+.-...}, at: [] lockdep_copy_map include/linux/lockdep.h:175 [inline] #3: (((&q->timer))){+.-...}, at: [] call_timer_fn+0x1c2/0x820 kernel/time/timer.c:1258 #4: (&(&q->lock)->rlock){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] #4: (&(&q->lock)->rlock){+.-...}, at: [] ip_expire+0x51/0x6c0 net/ipv4/ip_fragment.c:201 #5: (rcu_read_lock){......}, at: [] ip_expire+0x1b3/0x6c0 net/ipv4/ip_fragment.c:216 #6: (slock-AF_INET){+.-...}, at: [] spin_trylock include/linux/spinlock.h:309 [inline] #6: (slock-AF_INET){+.-...}, at: [] icmp_xmit_lock net/ipv4/icmp.c:219 [inline] #6: (slock-AF_INET){+.-...}, at: [] icmp_send+0x803/0x1c80 net/ipv4/icmp.c:681 #7: (rcu_read_lock_bh){......}, at: [] ip_finish_output2+0x2c1/0x15a0 net/ipv4/ip_output.c:198 #8: (rcu_read_lock_bh){......}, at: [] __dev_queue_xmit+0x23e/0x1e60 net/core/dev.c:3324 #9: (dev->qdisc_running_key ?: &qdisc_running_key){+.....}, at: [] dev_queue_xmit+0x17/0x20 net/core/dev.c:3423 stack backtrace: CPU: 0 PID: 12392 Comm: modprobe Not tainted 4.10.0+ #29 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x2ee/0x3ef lib/dump_stack.c:52 print_circular_bug+0x307/0x3b0 kernel/locking/lockdep.c:1204 check_prev_add kernel/locking/lockdep.c:1830 [inline] check_prevs_add+0xa8f/0x19f0 kernel/locking/lockdep.c:1940 validate_chain kernel/locking/lockdep.c:2267 [inline] __lock_acquire+0x2149/0x3430 kernel/locking/lockdep.c:3340 lock_acquire+0x2a1/0x630 kernel/locking/lockdep.c:3755 __raw_spin_lock 
include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] __netif_tx_lock include/linux/netdevice.h:3486 [inline] sch_direct_xmit+0x282/0x6d0 net/sched/sch_generic.c:180 __dev_xmit_skb net/core/dev.c:3092 [inline] __dev_queue_xmit+0x13e5/0x1e60 net/core/dev.c:3358 dev_queue_xmit+0x17/0x20 net/core/dev.c:3423 neigh_hh_output include/net/neighbour.h:468 [inline] neigh_output include/net/neighbour.h:476 [inline] ip_finish_output2+0xf6c/0x15a0 net/ipv4/ip_output.c:228 ip_finish_output+0xa29/0xe10 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:246 [inline] ip_output+0x1f0/0x7a0 net/ipv4/ip_output.c:404 dst_output include/net/dst.h:486 [inline] ip_local_out+0x95/0x170 net/ipv4/ip_output.c:124 ip_send_skb+0x3c/0xc0 net/ipv4/ip_output.c:1492 ip_push_pending_frames+0x64/0x80 net/ipv4/ip_output.c:1512 icmp_push_reply+0x372/0x4d0 net/ipv4/icmp.c:394 icmp_send+0x156c/0x1c80 net/ipv4/icmp.c:754 ip_expire+0x40e/0x6c0 net/ipv4/ip_fragment.c:239 call_timer_fn+0x241/0x820 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 [inline] __run_timers+0x960/0xcf0 kernel/time/timer.c:1601 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614 __do_softirq+0x31f/0xbe7 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:657 [inline] smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:962 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:707 RIP: 0010:__read_once_size include/linux/compiler.h:254 [inline] RIP: 0010:atomic_read arch/x86/include/asm/atomic.h:26 [inline] RIP: 0010:rcu_dynticks_curr_cpu_in_eqs kernel/rcu/tree.c:350 [inline] RIP: 0010:__rcu_is_watching kernel/rcu/tree.c:1133 [inline] RIP: 0010:rcu_is_watching+0x83/0x110 kernel/rcu/tree.c:1147 RSP: 0000:ffff8801c391f120 EFLAGS: 00000a03 ORIG_RAX: ffffffffffffff10 RAX: dffffc0000000000 RBX: ffff8801c391f148 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000055edd4374000 RDI: ffff8801dbe1ae0c RBP: ffff8801c391f1a0 R08: 0000000000000002 R09: 0000000000000000 R10: dffffc0000000000 R11: 0000000000000002 R12: 1ffff10038723e25 R13: ffff8801dbe1ae00 R14: ffff8801c391f680 R15: dffffc0000000000 rcu_read_lock_held+0x87/0xc0 kernel/rcu/update.c:293 radix_tree_deref_slot include/linux/radix-tree.h:238 [inline] filemap_map_pages+0x6d4/0x1570 mm/filemap.c:2335 do_fault_around mm/memory.c:3231 [inline] do_read_fault mm/memory.c:3265 [inline] do_fault+0xbd5/0x2080 mm/memory.c:3370 handle_pte_fault mm/memory.c:3600 [inline] __handle_mm_fault+0x1062/0x2cb0 mm/memory.c:3714 handle_mm_fault+0x1e2/0x480 mm/memory.c:3751 __do_page_fault+0x4f6/0xb60 arch/x86/mm/fault.c:1397 do_page_fault+0x54/0x70 arch/x86/mm/fault.c:1460 page_fault+0x28/0x30 arch/x86/entry/entry_64.S:1011 RIP: 0033:0x7f83172f2786 RSP: 002b:00007fffe859ae80 EFLAGS: 00010293 RAX: 000055edd4373040 RBX: 00007f83175111c8 RCX: 000055edd4373238 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007f8317510970 RBP: 00007fffe859afd0 R08: 0000000000000009 R09: 0000000000000000 R10: 0000000000000064 R11: 0000000000000000 R12: 000055edd4373040 R13: 0000000000000000 R14: 00007fffe859afe8 R15: 0000000000000000 Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. 
Miller --- net/ipv4/ip_fragment.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbe7f72db9c1..b3cdeec85f1f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -198,6 +198,7 @@ static void ip_expire(unsigned long arg) qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); + rcu_read_lock(); spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) @@ -207,7 +208,7 @@ static void ip_expire(unsigned long arg) __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); if (!inet_frag_evicting(&qp->q)) { - struct sk_buff *head = qp->q.fragments; + struct sk_buff *clone, *head = qp->q.fragments; const struct iphdr *iph; int err; @@ -216,32 +217,40 @@ static void ip_expire(unsigned long arg) if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) goto out; - rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) - goto out_rcu_unlock; + goto out; + /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); if (err) - goto out_rcu_unlock; + goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ if (frag_expire_skip_icmp(qp->user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out_rcu_unlock; + goto out; + + clone = skb_clone(head, GFP_ATOMIC); /* Send an ICMP "Fragment Reassembly Timeout" message. */ - icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); -out_rcu_unlock: - rcu_read_unlock(); + if (clone) { + spin_unlock(&qp->q.lock); + icmp_send(clone, ICMP_TIME_EXCEEDED, + ICMP_EXC_FRAGTIME, 0); + consume_skb(clone); + goto out_rcu_unlock; + } } out: spin_unlock(&qp->q.lock); +out_rcu_unlock: + rcu_read_unlock(); ipq_put(qp); } From 6e9e6cc8f4e4f2cd67931510c9f39abf3d9e0d3b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Mar 2017 15:31:10 -0700 Subject: [PATCH 293/297] Bluetooth: btqcomsmd: fix compile-test dependency compile-testing fails when QCOM_SMD is a loadable module: drivers/bluetooth/built-in.o: In function `btqcomsmd_send': btqca.c:(.text+0xa8): undefined reference to `qcom_smd_send' drivers/bluetooth/built-in.o: In function `btqcomsmd_probe': btqca.c:(.text+0x3ec): undefined reference to `qcom_wcnss_open_channel' btqca.c:(.text+0x46c): undefined reference to `qcom_smd_set_drvdata' This clarifies the dependency to allow compile-testing only when SMD is completely disabled, otherwise the dependency on QCOM_SMD will make sure we can link against it. Fixes: e27ee2b16bad ("Bluetooth: btqcomsmd: Allow driver to build if COMPILE_TEST is enabled") Signed-off-by: Arnd Bergmann [bjorn: Restructure and clarify dependency to QCOM_WCNSS_CTRL] Signed-off-by: Bjorn Andersson Acked-by: Marcel Holtmann Signed-off-by: David S. Miller --- drivers/bluetooth/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index c2c14a12713b..08e054507d0b 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -344,7 +344,8 @@ config BT_WILINK config BT_QCOMSMD tristate "Qualcomm SMD based HCI support" - depends on (QCOM_SMD && QCOM_WCNSS_CTRL) || COMPILE_TEST + depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + depends on QCOM_WCNSS_CTRL || (COMPILE_TEST && QCOM_WCNSS_CTRL=n) select BT_QCA help Qualcomm SMD based HCI driver. 
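The dependency expression used in this Kconfig fix is a standard idiom worth a brief note: for a tristate symbol FOO, "depends on FOO || (COMPILE_TEST && FOO=n)" permits the driver either when FOO is actually available to link against (the usual tristate propagation then limits the driver to m whenever FOO=m), or when FOO is completely disabled and the header stubs are used, which is all compile-testing needs. A generic sketch of the shape, with hypothetical symbol names:

config EXAMPLE_DRIVER
	tristate "Example driver using two optional subsystems"
	depends on FOO || (COMPILE_TEST && FOO=n)
	depends on BAR || (COMPILE_TEST && BAR=n)

The one combination this rules out is a built-in EXAMPLE_DRIVER on top of a modular FOO, which is exactly the configuration that produced the undefined references above.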
From c04ca616eed02b9abe7afd311382c3ed5eef5c40 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 22 Mar 2017 12:10:02 +0300 Subject: [PATCH 294/297] sfc: cleanup a condition in efx_udp_tunnel_del() Presumably if there is an "add" function, there is also a "del" function. But it causes a static checker warning because it looks like a common cut-and-paste bug. Signed-off-by: Dan Carpenter Acked-by: Jarod Wilson Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/efx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 334bcc6df6b2..50d28261b6b9 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2404,7 +2404,7 @@ static void efx_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *t tnl.type = (u16)efx_tunnel_type; tnl.port = ti->port; - if (efx->type->udp_tnl_add_port) + if (efx->type->udp_tnl_del_port) (void)efx->type->udp_tnl_del_port(efx, tnl); } From f43feef4e6acde10857fcbfdede790d6b3f2c71d Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Wed, 22 Mar 2017 17:25:27 -0500 Subject: [PATCH 295/297] amd-xgbe: Fix the ECC-related bit position definitions The ECC bit positions that describe whether the ECC interrupt is for Tx, Rx or descriptor memory, and whether it is a single correctable or a double detected error, were defined incorrectly (in reversed order). Fix the bit position definitions for these settings so that the proper ECC handling is performed. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-common.h | 24 ++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index 86f1626816ff..127adbeefb10 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -984,29 +984,29 @@ #define XP_ECC_CNT1_DESC_DED_WIDTH 8 #define XP_ECC_CNT1_DESC_SEC_INDEX 0 #define XP_ECC_CNT1_DESC_SEC_WIDTH 8 -#define XP_ECC_IER_DESC_DED_INDEX 0 +#define XP_ECC_IER_DESC_DED_INDEX 5 #define XP_ECC_IER_DESC_DED_WIDTH 1 -#define XP_ECC_IER_DESC_SEC_INDEX 1 +#define XP_ECC_IER_DESC_SEC_INDEX 4 #define XP_ECC_IER_DESC_SEC_WIDTH 1 -#define XP_ECC_IER_RX_DED_INDEX 2 +#define XP_ECC_IER_RX_DED_INDEX 3 #define XP_ECC_IER_RX_DED_WIDTH 1 -#define XP_ECC_IER_RX_SEC_INDEX 3 +#define XP_ECC_IER_RX_SEC_INDEX 2 #define XP_ECC_IER_RX_SEC_WIDTH 1 -#define XP_ECC_IER_TX_DED_INDEX 4 +#define XP_ECC_IER_TX_DED_INDEX 1 #define XP_ECC_IER_TX_DED_WIDTH 1 -#define XP_ECC_IER_TX_SEC_INDEX 5 +#define XP_ECC_IER_TX_SEC_INDEX 0 #define XP_ECC_IER_TX_SEC_WIDTH 1 -#define XP_ECC_ISR_DESC_DED_INDEX 0 +#define XP_ECC_ISR_DESC_DED_INDEX 5 #define XP_ECC_ISR_DESC_DED_WIDTH 1 -#define XP_ECC_ISR_DESC_SEC_INDEX 1 +#define XP_ECC_ISR_DESC_SEC_INDEX 4 #define XP_ECC_ISR_DESC_SEC_WIDTH 1 -#define XP_ECC_ISR_RX_DED_INDEX 2 +#define XP_ECC_ISR_RX_DED_INDEX 3 #define XP_ECC_ISR_RX_DED_WIDTH 1 -#define XP_ECC_ISR_RX_SEC_INDEX 3 +#define XP_ECC_ISR_RX_SEC_INDEX 2 #define XP_ECC_ISR_RX_SEC_WIDTH 1 -#define XP_ECC_ISR_TX_DED_INDEX 4 +#define XP_ECC_ISR_TX_DED_INDEX 1 #define XP_ECC_ISR_TX_DED_WIDTH 1 -#define XP_ECC_ISR_TX_SEC_INDEX 5 +#define XP_ECC_ISR_TX_SEC_INDEX 0 #define XP_ECC_ISR_TX_SEC_WIDTH 1 #define XP_I2C_MUTEX_BUSY_INDEX 31 #define XP_I2C_MUTEX_BUSY_WIDTH 1
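These _INDEX/_WIDTH pairs feed generic get/set-bits helpers, so wrong positions silently redirect reads and writes to neighboring fields without any compiler diagnostic. The access pattern can be modeled in a few lines; the GET_BITS/SET_BITS macro names below are hypothetical stand-ins for the driver's real register helpers:

/* Sketch only: how INDEX/WIDTH pairs are typically consumed. */
#include <stdio.h>

#define GET_BITS(val, index, width) \
    (((val) >> (index)) & ((1U << (width)) - 1))
#define SET_BITS(val, index, width, data) \
    (((val) & ~(((1U << (width)) - 1) << (index))) | \
     (((data) & ((1U << (width)) - 1)) << (index)))

/* Corrected positions from the patch above. */
#define XP_ECC_ISR_TX_SEC_INDEX   0
#define XP_ECC_ISR_TX_SEC_WIDTH   1
#define XP_ECC_ISR_DESC_DED_INDEX 5
#define XP_ECC_ISR_DESC_DED_WIDTH 1

int main(void)
{
    unsigned int isr = 0;

    /* Hardware sets bit 5 for a descriptor double-detected error. */
    isr = SET_BITS(isr, XP_ECC_ISR_DESC_DED_INDEX,
                   XP_ECC_ISR_DESC_DED_WIDTH, 1);

    /* With the old, reversed definitions this read would have looked
     * at bit 0 and misreported a Tx single-correctable error instead. */
    printf("desc DED pending: %u\n",
           GET_BITS(isr, XP_ECC_ISR_DESC_DED_INDEX,
                    XP_ECC_ISR_DESC_DED_WIDTH));
    return 0;
}

Because every field here is one bit wide, the reversed table still compiled and "worked"; it just made the interrupt handler attribute errors to the wrong memory.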
From 68c386590375b2aea5a3154f17882a30170707bf Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Thu, 23 Mar 2017 02:20:39 +0300 Subject: [PATCH 296/297] net:ethernet:aquantia: Fix for RX checksum offload. Since the AQC-100/107/108 chips support hardware RX checksums, we should indicate this via the NETIF_F_RXCSUM flag. v1->v2: 'Signed-off-by' tag added. Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h | 1 + .../net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h index 1093ea18823a..0592a0330cf0 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h @@ -137,6 +137,7 @@ static struct aq_hw_caps_s hw_atl_a0_hw_caps_ = { .tx_rings = HW_ATL_A0_TX_RINGS, .rx_rings = HW_ATL_A0_RX_RINGS, .hw_features = NETIF_F_HW_CSUM | + NETIF_F_RXCSUM | NETIF_F_RXHASH | NETIF_F_SG | NETIF_F_TSO, diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h index 8bdee3ddd5a0..f3957e930340 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h @@ -188,6 +188,7 @@ static struct aq_hw_caps_s hw_atl_b0_hw_caps_ = { .tx_rings = HW_ATL_B0_TX_RINGS, .rx_rings = HW_ATL_B0_RX_RINGS, .hw_features = NETIF_F_HW_CSUM | + NETIF_F_RXCSUM | NETIF_F_RXHASH | NETIF_F_SG | NETIF_F_TSO | From 3f307834e695f59dac4337a40316bdecfb9d0508 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 23 Mar 2017 10:00:25 +0800 Subject: [PATCH 297/297] ALSA: hda - Adding a group of pin definition to fix headset problem A new Dell laptop needs to apply ALC269_FIXUP_DELL1_MIC_NO_PRESENCE to fix the headset problem, and the pin definition of this machine is not in the pin quirk table yet; this patch adds it to the table. Signed-off-by: Hui Wang Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8d6b3703d0a2..7f989898cbd9 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6102,6 +6102,8 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { ALC295_STANDARD_PINS, {0x17, 0x21014040}, {0x18, 0x21a19050}), + SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, + ALC295_STANDARD_PINS), SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, ALC298_STANDARD_PINS, {0x17, 0x90170110}),