From 1cbbbf39efab05fae67f59e6ed01bb85061c69e2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 28 Apr 2019 22:14:51 +0800 Subject: [PATCH 001/266] ieee802154: hwsim: Fix error handle path in hwsim_init_module KASAN report this: BUG: unable to handle kernel paging request at fffffbfff834f001 PGD 237fe8067 P4D 237fe8067 PUD 237e64067 PMD 1c968d067 PTE 0 Oops: 0000 [#1] SMP KASAN PTI CPU: 1 PID: 8871 Comm: syz-executor.0 Tainted: G C 5.0.0+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 RIP: 0010:strcmp+0x31/0xa0 lib/string.c:328 Code: 00 00 00 00 fc ff df 55 53 48 83 ec 08 eb 0a 84 db 48 89 ef 74 5a 4c 89 e6 48 89 f8 48 89 fa 48 8d 6f 01 48 c1 e8 03 83 e2 07 <42> 0f b6 04 28 38 d0 7f 04 84 c0 75 50 48 89 f0 48 89 f2 0f b6 5d RSP: 0018:ffff8881e0c57800 EFLAGS: 00010246 RAX: 1ffffffff834f001 RBX: ffffffffc1a78000 RCX: ffffffff827b9503 RDX: 0000000000000000 RSI: ffffffffc1a40008 RDI: ffffffffc1a78008 RBP: ffffffffc1a78009 R08: fffffbfff6a92195 R09: fffffbfff6a92195 R10: ffff8881e0c578b8 R11: fffffbfff6a92194 R12: ffffffffc1a40008 R13: dffffc0000000000 R14: ffffffffc1a3e470 R15: ffffffffc1a40000 FS: 00007fdcc02ff700(0000) GS:ffff8881f7300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff834f001 CR3: 00000001b3134003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: genl_family_find_byname+0x7f/0xf0 net/netlink/genetlink.c:104 genl_register_family+0x1e1/0x1070 net/netlink/genetlink.c:333 ? 0xffffffffc1978000 hwsim_init_module+0x6a/0x1000 [mac802154_hwsim] ? 0xffffffffc1978000 ? 0xffffffffc1978000 ? 0xffffffffc1978000 do_one_initcall+0xbc/0x47d init/main.c:887 do_init_module+0x1b5/0x547 kernel/module.c:3456 load_module+0x6405/0x8c10 kernel/module.c:3804 __do_sys_finit_module+0x162/0x190 kernel/module.c:3898 do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x462e99 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fdcc02fec58 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99 RDX: 0000000000000000 RSI: 0000000020000200 RDI: 0000000000000003 RBP: 00007fdcc02fec70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fdcc02ff6bc R13: 00000000004bcefa R14: 00000000006f6fb0 R15: 0000000000000004 Modules linked in: mac802154_hwsim(+) mac802154 ieee802154 speakup(C) rc_proteus_2309 rtc_rk808 streebog_generic rds vboxguest madera_spi madera da9052_wdt mISDN_core ueagle_atm usbatm atm ir_imon_decoder scsi_transport_sas rc_dntv_live_dvb_t panel_samsung_s6d16d0 drm drm_panel_orientation_quirks lib80211 fb_agm1264k_fl(C) gspca_pac7302 gspca_main videobuf2_v4l2 soundwire_intel_init i2c_dln2 dln2 usbcore hid_gaff 88pm8607 nfnetlink axp20x_i2c axp20x uio pata_marvell pmbus_core snd_sonicvibes gameport snd_pcm snd_opl3_lib snd_timer snd_hwdep snd_mpu401_uart snd_rawmidi snd_seq_device snd soundcore rtc_ds1511 rtc_ds1742 vsock dwc_xlgmac rtc_rx8010 libphy twofish_x86_64_3way twofish_x86_64 twofish_common ad5696_i2c ad5686 lp8788_charger cxd2880_spi dvb_core videobuf2_common videodev media videobuf2_vmalloc videobuf2_memops fbtft(C) sysimgblt sysfillrect syscopyarea fb_sys_fops janz_ican3 firewire_net firewire_core crc_itu_t spi_slave_system_control i2c_matroxfb i2c_algo_bit matroxfb_base fb fbdev matroxfb_DAC1064 matroxfb_accel cfbcopyarea cfbimgblt cfbfillrect matroxfb_Ti3026 matroxfb_g450 g450_pll matroxfb_misc leds_blinkm ti_dac7311 intel_spi_pci intel_spi spi_nor hid_elan hid async_tx rc_cinergy_1400 rc_core intel_ishtp kxcjk_1013 industrialio_triggered_buffer kfifo_buf can_dev intel_th spi_pxa2xx_platform pata_artop vme_ca91cx42 gb_gbphy(C) greybus(C) industrialio mptbase st_drv cmac ttpci_eeprom via_wdt gpio_xra1403 mtd iptable_security iptable_raw iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bpfilter ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel hsr veth netdevsim vxcan batman_adv cfg80211 rfkill chnl_net caif nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun joydev mousedev ppdev kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 input_leds crypto_simd cryptd glue_helper ide_pci_generic piix psmouse ide_core serio_raw ata_generic i2c_piix4 pata_acpi parport_pc parport floppy rtc_cmos intel_agp intel_gtt agpgart sch_fq_codel ip_tables x_tables sha1_ssse3 sha1_generic ipv6 [last unloaded: speakup] Dumping ftrace buffer: (ftrace buffer empty) CR2: fffffbfff834f001 ---[ end trace 5aa772c793e0e971 ]--- RIP: 0010:strcmp+0x31/0xa0 lib/string.c:328 Code: 00 00 00 00 fc ff df 55 53 48 83 ec 08 eb 0a 84 db 48 89 ef 74 5a 4c 89 e6 48 89 f8 48 89 fa 48 8d 6f 01 48 c1 e8 03 83 e2 07 <42> 0f b6 04 28 38 d0 7f 04 84 c0 75 50 48 89 f0 48 89 f2 0f b6 5d RSP: 0018:ffff8881e0c57800 EFLAGS: 00010246 RAX: 1ffffffff834f001 RBX: ffffffffc1a78000 RCX: ffffffff827b9503 RDX: 0000000000000000 RSI: ffffffffc1a40008 RDI: ffffffffc1a78008 RBP: ffffffffc1a78009 R08: fffffbfff6a92195 R09: fffffbfff6a92195 R10: ffff8881e0c578b8 R11: fffffbfff6a92194 R12: ffffffffc1a40008 R13: dffffc0000000000 R14: ffffffffc1a3e470 R15: ffffffffc1a40000 FS: 00007fdcc02ff700(0000) GS:ffff8881f7300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff834f001 CR3: 00000001b3134003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 The error handing path misplace the cleanup in hwsim_init_module, switch the two cleanup functions to fix above issues. Reported-by: Hulk Robot Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Signed-off-by: YueHaibing Acked-by: Alexander Aring Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 3b88846de31b..c2b6ffb5771b 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -912,9 +912,9 @@ static __init int hwsim_init_module(void) return 0; platform_drv: - genl_unregister_family(&hwsim_genl_family); -platform_dev: platform_device_unregister(mac802154hwsim_dev); +platform_dev: + genl_unregister_family(&hwsim_genl_family); return rc; } From de166bbe861738c8bc3e5dad5b03f45d7d6ef914 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 28 Apr 2019 23:48:10 +0800 Subject: [PATCH 002/266] ieee802154: hwsim: unregister hw while hwsim_subscribe_all_others fails KASAN report this: kernel BUG at net/mac802154/main.c:130! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 0 PID: 19932 Comm: modprobe Not tainted 5.1.0-rc6+ #22 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 RIP: 0010:ieee802154_free_hw+0x2a/0x30 [mac802154] Code: 55 48 8d 57 38 48 89 e5 53 48 89 fb 48 8b 47 38 48 39 c2 75 15 48 8d 7f 48 e8 82 85 16 e1 48 8b 7b 28 e8 f9 ef 83 e2 5b 5d c3 <0f> 0b 0f 1f 40 00 55 48 89 e5 53 48 89 fb 0f b6 86 80 00 00 00 88 RSP: 0018:ffffc90001c7b9f0 EFLAGS: 00010206 RAX: ffff88822df3aa80 RBX: ffff88823143d5c0 RCX: 0000000000000002 RDX: ffff88823143d5f8 RSI: ffff88822b1fabc0 RDI: ffff88823143d5c0 RBP: ffffc90001c7b9f8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000fffffff4 R13: ffff88822dea4f50 R14: ffff88823143d7c0 R15: 00000000fffffff4 FS: 00007ff52e999540(0000) GS:ffff888237a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fdc06dba768 CR3: 000000023160a000 CR4: 00000000000006f0 Call Trace: hwsim_add_one+0x2dd/0x540 [mac802154_hwsim] hwsim_probe+0x2f/0xb0 [mac802154_hwsim] platform_drv_probe+0x3a/0x90 ? driver_sysfs_add+0x79/0xb0 really_probe+0x1d4/0x2d0 driver_probe_device+0x50/0xf0 device_driver_attach+0x54/0x60 __driver_attach+0x7e/0xd0 ? device_driver_attach+0x60/0x60 bus_for_each_dev+0x68/0xc0 driver_attach+0x19/0x20 bus_add_driver+0x15e/0x200 driver_register+0x5b/0xf0 __platform_driver_register+0x31/0x40 hwsim_init_module+0x74/0x1000 [mac802154_hwsim] ? 0xffffffffa00e9000 do_one_initcall+0x6c/0x3cc ? kmem_cache_alloc_trace+0x248/0x3b0 do_init_module+0x5b/0x1f1 load_module+0x1db1/0x2690 ? m_show+0x1d0/0x1d0 __do_sys_finit_module+0xc5/0xd0 __x64_sys_finit_module+0x15/0x20 do_syscall_64+0x6b/0x1d0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7ff52e4a2839 Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 1f f6 2c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffffa7b3c08 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 00005647560a2a00 RCX: 00007ff52e4a2839 RDX: 0000000000000000 RSI: 00005647547f3c2e RDI: 0000000000000003 RBP: 00005647547f3c2e R08: 0000000000000000 R09: 00005647560a2a00 R10: 0000000000000003 R11: 0000000000000246 R12: 0000000000000000 R13: 00005647560a2c10 R14: 0000000000040000 R15: 00005647560a2a00 Modules linked in: mac802154_hwsim(+) mac802154 [last unloaded: mac802154_hwsim] In hwsim_add_one, if hwsim_subscribe_all_others fails, we should call ieee802154_unregister_hw to free resources. Reported-by: Hulk Robot Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Signed-off-by: YueHaibing Acked-by: Alexander Aring Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index c2b6ffb5771b..3d9ffd2188f9 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -813,7 +813,7 @@ static int hwsim_add_one(struct genl_info *info, struct device *dev, err = hwsim_subscribe_all_others(phy); if (err < 0) { mutex_unlock(&hwsim_phys_lock); - goto err_reg; + goto err_subscribe; } } list_add_tail(&phy->list, &hwsim_phys); @@ -823,6 +823,8 @@ static int hwsim_add_one(struct genl_info *info, struct device *dev, return idx; +err_subscribe: + ieee802154_unregister_hw(phy->hw); err_reg: kfree(pib); err_pib: From 9cdde85804833af77c6afbf7c53f0d959c42eb9f Mon Sep 17 00:00:00 2001 From: Hyungwoo Yang Date: Wed, 29 May 2019 21:03:54 -0700 Subject: [PATCH 003/266] platform/chrome: cros_ec_ishtp: fix crash during suspend Kernel crashes during suspend due to wrong conversion in suspend and resume functions. Use the proper helper to get ishtp_cl_device instance. Cc: # 5.2.x: b12bbdc5: HID: intel-ish-hid: fix wrong driver_data usage Signed-off-by: Hyungwoo Yang Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_ishtp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_ishtp.c b/drivers/platform/chrome/cros_ec_ishtp.c index e504d255d5ce..430731cdf827 100644 --- a/drivers/platform/chrome/cros_ec_ishtp.c +++ b/drivers/platform/chrome/cros_ec_ishtp.c @@ -707,7 +707,7 @@ static int cros_ec_ishtp_reset(struct ishtp_cl_device *cl_device) */ static int __maybe_unused cros_ec_ishtp_suspend(struct device *device) { - struct ishtp_cl_device *cl_device = dev_get_drvdata(device); + struct ishtp_cl_device *cl_device = ishtp_dev_to_cl_device(device); struct ishtp_cl *cros_ish_cl = ishtp_get_drvdata(cl_device); struct ishtp_cl_data *client_data = ishtp_get_client_data(cros_ish_cl); @@ -722,7 +722,7 @@ static int __maybe_unused cros_ec_ishtp_suspend(struct device *device) */ static int __maybe_unused cros_ec_ishtp_resume(struct device *device) { - struct ishtp_cl_device *cl_device = dev_get_drvdata(device); + struct ishtp_cl_device *cl_device = ishtp_dev_to_cl_device(device); struct ishtp_cl *cros_ish_cl = ishtp_get_drvdata(cl_device); struct ishtp_cl_data *client_data = ishtp_get_client_data(cros_ish_cl); From 38f054d549a869f22a02224cd276a27bf14b6171 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Tue, 23 Jul 2019 15:26:28 +0200 Subject: [PATCH 004/266] modules: always page-align module section allocations Some arches (e.g., arm64, x86) have moved towards non-executable module_alloc() allocations for security hardening reasons. That means that the module loader will need to set the text section of a module to executable, regardless of whether or not CONFIG_STRICT_MODULE_RWX is set. When CONFIG_STRICT_MODULE_RWX=y, module section allocations are always page-aligned to handle memory rwx permissions. On some arches with CONFIG_STRICT_MODULE_RWX=n however, when setting the module text to executable, the BUG_ON() in frob_text() gets triggered since module section allocations are not page-aligned when CONFIG_STRICT_MODULE_RWX=n. Since the set_memory_* API works with pages, and since we need to call set_memory_x() regardless of whether CONFIG_STRICT_MODULE_RWX is set, we might as well page-align all module section allocations for ease of managing rwx permissions of module sections (text, rodata, etc). Fixes: 2eef1399a866 ("modules: fix BUG when load module with rodata=n") Reported-by: Martin Kaiser Reported-by: Bartosz Golaszewski Tested-by: David Lechner Tested-by: Martin Kaiser Tested-by: Bartosz Golaszewski Signed-off-by: Jessica Yu --- kernel/module.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index 5933395af9a0..cd8df516666d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -64,14 +64,9 @@ /* * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_STRICT_MODULE_RWX=y + * to ensure complete separation of code and data */ -#ifdef CONFIG_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) -#else -# define debug_align(X) (X) -#endif /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) From 97abfd5d801abb6eac35e2d7d725123950e2153d Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Tue, 16 Jul 2019 23:50:34 +0300 Subject: [PATCH 005/266] ARCv2: entry: early return from exception need not clear U & DE bits Exception handlers call FAKE_RET_FROM_EXCPN to - clear AE bit: drop down from exception active to pure kernel mode allowing further excptions - set IE bit: re-enable interrupts It additionally also clears U bit (user mode) and DE bit (delay slot execution) which is redundant as hardware does that already on any taken exception. Morevoer the current software clearing is bogus anyways as the KFLAG instruction being used for purpose can't possibly write those bits anyways. So don't pretend to clear them. Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta [vgupta: rewrote changelog] --- arch/arc/include/asm/entry-arcv2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/include/asm/entry-arcv2.h b/arch/arc/include/asm/entry-arcv2.h index f5ae394ebe06..41b16f21beec 100644 --- a/arch/arc/include/asm/entry-arcv2.h +++ b/arch/arc/include/asm/entry-arcv2.h @@ -256,7 +256,7 @@ .macro FAKE_RET_FROM_EXCPN lr r9, [status32] - bic r9, r9, (STATUS_U_MASK|STATUS_DE_MASK|STATUS_AE_MASK) + bic r9, r9, STATUS_AE_MASK or r9, r9, STATUS_IE_MASK kflag r9 .endm From da31076f96fc41af41d64e94b9fefe0d21c8ee9c Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Fri, 19 Jul 2019 21:46:00 +0300 Subject: [PATCH 006/266] ARC: fix typo in setup_dma_ops log message Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/mm/dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 62c210e7ee4c..70a3fbe79fba 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -101,7 +101,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, if (is_isa_arcv2() && ioc_enable && coherent) dev->dma_coherent = true; - dev_info(dev, "use %sncoherent DMA ops\n", + dev_info(dev, "use %scoherent DMA ops\n", dev->dma_coherent ? "" : "non"); } From ce0eff0d9b4d37702df48a39e3fddb5e39b2c25b Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Mon, 22 Jul 2019 12:31:45 +0300 Subject: [PATCH 007/266] ARC: [plat-hsdk]: allow to switch between AXI DMAC port configurations We want to use DW AXI DMAC on HSDK board in our automated verification to test cache & dma kernel code changes. This is perfect candidate as we don't depend on any external peripherals like MMC card / USB storage / etc. To increase test coverage we want to test both options: * DW AXI DMAC is connected through IOC port & dma direct ops used * DW AXI DMAC is connected to DDR port & dma noncoherent ops used Introduce 'arc_hsdk_axi_dmac_coherent' global variable which can be modified by debugger (same way as we patch 'ioc_enable') to switch between these options without recompiling the kernel. Depend on this value we tweak memory bridge configuration and "dma-coherent" DTS property of DW AXI DMAC. Signed-off-by: Eugeniy Paltsev Acked-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/Makefile | 3 ++ arch/arc/plat-hsdk/platform.c | 87 ++++++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/arch/arc/boot/dts/Makefile b/arch/arc/boot/dts/Makefile index a83c4f5e928b..8483a86c743d 100644 --- a/arch/arc/boot/dts/Makefile +++ b/arch/arc/boot/dts/Makefile @@ -12,3 +12,6 @@ dtb-y := $(builtindtb-y).dtb # for CONFIG_OF_ALL_DTBS test dtstree := $(srctree)/$(src) dtb- := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts)) + +# board-specific dtc flags +DTC_FLAGS_hsdk += --pad 20 diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c index 7dd2dd335cf6..0b961a2a10b8 100644 --- a/arch/arc/plat-hsdk/platform.c +++ b/arch/arc/plat-hsdk/platform.c @@ -6,11 +6,15 @@ */ #include +#include +#include #include #include #include #include +int arc_hsdk_axi_dmac_coherent __section(.data) = 0; + #define ARC_CCM_UNUSED_ADDR 0x60000000 static void __init hsdk_init_per_cpu(unsigned int cpu) @@ -97,6 +101,42 @@ static void __init hsdk_enable_gpio_intc_wire(void) iowrite32(GPIO_INT_CONNECTED_MASK, (void __iomem *) GPIO_INTEN); } +static int __init hsdk_tweak_node_coherency(const char *path, bool coherent) +{ + void *fdt = initial_boot_params; + const void *prop; + int node, ret; + bool dt_coh_set; + + node = fdt_path_offset(fdt, path); + if (node < 0) + goto tweak_fail; + + prop = fdt_getprop(fdt, node, "dma-coherent", &ret); + if (!prop && ret != -FDT_ERR_NOTFOUND) + goto tweak_fail; + + dt_coh_set = ret != -FDT_ERR_NOTFOUND; + ret = 0; + + /* need to remove "dma-coherent" property */ + if (dt_coh_set && !coherent) + ret = fdt_delprop(fdt, node, "dma-coherent"); + + /* need to set "dma-coherent" property */ + if (!dt_coh_set && coherent) + ret = fdt_setprop(fdt, node, "dma-coherent", NULL, 0); + + if (ret < 0) + goto tweak_fail; + + return 0; + +tweak_fail: + pr_err("failed to tweak %s to %scoherent\n", path, coherent ? "" : "non"); + return -EFAULT; +} + enum hsdk_axi_masters { M_HS_CORE = 0, M_HS_RTT, @@ -162,6 +202,39 @@ enum hsdk_axi_masters { #define CREG_PAE ((void __iomem *)(CREG_BASE + 0x180)) #define CREG_PAE_UPDT ((void __iomem *)(CREG_BASE + 0x194)) +static void __init hsdk_init_memory_bridge_axi_dmac(void) +{ + bool coherent = !!arc_hsdk_axi_dmac_coherent; + u32 axi_m_slv1, axi_m_oft1; + + /* + * Don't tweak memory bridge configuration if we failed to tweak DTB + * as we will end up in a inconsistent state. + */ + if (hsdk_tweak_node_coherency("/soc/dmac@80000", coherent)) + return; + + if (coherent) { + axi_m_slv1 = 0x77999999; + axi_m_oft1 = 0x76DCBA98; + } else { + axi_m_slv1 = 0x77777777; + axi_m_oft1 = 0x76543210; + } + + writel(0x77777777, CREG_AXI_M_SLV0(M_DMAC_0)); + writel(0xFEDCBA98, CREG_AXI_M_OFT0(M_DMAC_0)); + writel(axi_m_slv1, CREG_AXI_M_SLV1(M_DMAC_0)); + writel(axi_m_oft1, CREG_AXI_M_OFT1(M_DMAC_0)); + writel(UPDATE_VAL, CREG_AXI_M_UPDT(M_DMAC_0)); + + writel(0x77777777, CREG_AXI_M_SLV0(M_DMAC_1)); + writel(0xFEDCBA98, CREG_AXI_M_OFT0(M_DMAC_1)); + writel(axi_m_slv1, CREG_AXI_M_SLV1(M_DMAC_1)); + writel(axi_m_oft1, CREG_AXI_M_OFT1(M_DMAC_1)); + writel(UPDATE_VAL, CREG_AXI_M_UPDT(M_DMAC_1)); +} + static void __init hsdk_init_memory_bridge(void) { u32 reg; @@ -227,24 +300,14 @@ static void __init hsdk_init_memory_bridge(void) writel(0x76543210, CREG_AXI_M_OFT1(M_GPU)); writel(UPDATE_VAL, CREG_AXI_M_UPDT(M_GPU)); - writel(0x77777777, CREG_AXI_M_SLV0(M_DMAC_0)); - writel(0x77777777, CREG_AXI_M_SLV1(M_DMAC_0)); - writel(0xFEDCBA98, CREG_AXI_M_OFT0(M_DMAC_0)); - writel(0x76543210, CREG_AXI_M_OFT1(M_DMAC_0)); - writel(UPDATE_VAL, CREG_AXI_M_UPDT(M_DMAC_0)); - - writel(0x77777777, CREG_AXI_M_SLV0(M_DMAC_1)); - writel(0x77777777, CREG_AXI_M_SLV1(M_DMAC_1)); - writel(0xFEDCBA98, CREG_AXI_M_OFT0(M_DMAC_1)); - writel(0x76543210, CREG_AXI_M_OFT1(M_DMAC_1)); - writel(UPDATE_VAL, CREG_AXI_M_UPDT(M_DMAC_1)); - writel(0x00000000, CREG_AXI_M_SLV0(M_DVFS)); writel(0x60000000, CREG_AXI_M_SLV1(M_DVFS)); writel(0x00000000, CREG_AXI_M_OFT0(M_DVFS)); writel(0x00000000, CREG_AXI_M_OFT1(M_DVFS)); writel(UPDATE_VAL, CREG_AXI_M_UPDT(M_DVFS)); + hsdk_init_memory_bridge_axi_dmac(); + /* * PAE remapping for DMA clients does not work due to an RTL bug, so * CREG_PAE register must be programmed to all zeroes, otherwise it From e86d94fdda8e11a2acbe0a910e82f7519f6088b7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 5 Aug 2019 14:32:32 -0500 Subject: [PATCH 008/266] ARC: unwind: Mark expected switch fall-throughs Mark switch cases where we are expecting to fall through. This patch fixes the following warnings (Building: haps_hs_defconfig arc): arch/arc/kernel/unwind.c:827:20: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/arc/kernel/unwind.c:836:20: warning: this statement may fall through [-Wimplicit-fallthrough=] Reviewed-by: Kees Cook Signed-off-by: Gustavo A. R. Silva Signed-off-by: Vineet Gupta --- arch/arc/kernel/unwind.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index c2663fce7f6c..445e4d702f43 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -826,7 +826,7 @@ static int processCFI(const u8 *start, const u8 *end, unsigned long targetLoc, case DW_CFA_def_cfa: state->cfa.reg = get_uleb128(&ptr.p8, end); unw_debug("cfa_def_cfa: r%lu ", state->cfa.reg); - /*nobreak*/ + /* fall through */ case DW_CFA_def_cfa_offset: state->cfa.offs = get_uleb128(&ptr.p8, end); unw_debug("cfa_def_cfa_offset: 0x%lx ", @@ -834,7 +834,7 @@ static int processCFI(const u8 *start, const u8 *end, unsigned long targetLoc, break; case DW_CFA_def_cfa_sf: state->cfa.reg = get_uleb128(&ptr.p8, end); - /*nobreak */ + /* fall through */ case DW_CFA_def_cfa_offset_sf: state->cfa.offs = get_sleb128(&ptr.p8, end) * state->dataAlign; From bf32e7dbfce87d518c0ca77af890eae9ab8d6ab9 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Thu, 8 Aug 2019 16:49:28 +0200 Subject: [PATCH 009/266] clk: samsung: Change signature of exynos5_subcmus_init() function In order to make it easier in subsequent patch to create different subcmu lists for exynos5420 and exynos5800 SoCs the code is rewritten so we pass an array of pointers to the subcmus initialization function. Fixes: b06a532bf1fa ("clk: samsung: Add Exynos5 sub-CMU clock driver") Tested-by: Jaafar Ali Signed-off-by: Sylwester Nawrocki Link: https://lkml.kernel.org/r/20190808144929.18685-1-s.nawrocki@samsung.com Reviewed-by: Marek Szyprowski Signed-off-by: Stephen Boyd --- drivers/clk/samsung/clk-exynos5-subcmu.c | 16 +++---- drivers/clk/samsung/clk-exynos5-subcmu.h | 2 +- drivers/clk/samsung/clk-exynos5250.c | 7 ++- drivers/clk/samsung/clk-exynos5420.c | 60 ++++++++++++++---------- 4 files changed, 49 insertions(+), 36 deletions(-) diff --git a/drivers/clk/samsung/clk-exynos5-subcmu.c b/drivers/clk/samsung/clk-exynos5-subcmu.c index 91db7894125d..65c82d922b05 100644 --- a/drivers/clk/samsung/clk-exynos5-subcmu.c +++ b/drivers/clk/samsung/clk-exynos5-subcmu.c @@ -14,7 +14,7 @@ #include "clk-exynos5-subcmu.h" static struct samsung_clk_provider *ctx; -static const struct exynos5_subcmu_info *cmu; +static const struct exynos5_subcmu_info **cmu; static int nr_cmus; static void exynos5_subcmu_clk_save(void __iomem *base, @@ -56,17 +56,17 @@ static void exynos5_subcmu_defer_gate(struct samsung_clk_provider *ctx, * when OF-core populates all device-tree nodes. */ void exynos5_subcmus_init(struct samsung_clk_provider *_ctx, int _nr_cmus, - const struct exynos5_subcmu_info *_cmu) + const struct exynos5_subcmu_info **_cmu) { ctx = _ctx; cmu = _cmu; nr_cmus = _nr_cmus; for (; _nr_cmus--; _cmu++) { - exynos5_subcmu_defer_gate(ctx, _cmu->gate_clks, - _cmu->nr_gate_clks); - exynos5_subcmu_clk_save(ctx->reg_base, _cmu->suspend_regs, - _cmu->nr_suspend_regs); + exynos5_subcmu_defer_gate(ctx, (*_cmu)->gate_clks, + (*_cmu)->nr_gate_clks); + exynos5_subcmu_clk_save(ctx->reg_base, (*_cmu)->suspend_regs, + (*_cmu)->nr_suspend_regs); } } @@ -163,9 +163,9 @@ static int __init exynos5_clk_probe(struct platform_device *pdev) if (of_property_read_string(np, "label", &name) < 0) continue; for (i = 0; i < nr_cmus; i++) - if (strcmp(cmu[i].pd_name, name) == 0) + if (strcmp(cmu[i]->pd_name, name) == 0) exynos5_clk_register_subcmu(&pdev->dev, - &cmu[i], np); + cmu[i], np); } return 0; } diff --git a/drivers/clk/samsung/clk-exynos5-subcmu.h b/drivers/clk/samsung/clk-exynos5-subcmu.h index 755ee8aaa3de..9ae5356f25aa 100644 --- a/drivers/clk/samsung/clk-exynos5-subcmu.h +++ b/drivers/clk/samsung/clk-exynos5-subcmu.h @@ -21,6 +21,6 @@ struct exynos5_subcmu_info { }; void exynos5_subcmus_init(struct samsung_clk_provider *ctx, int nr_cmus, - const struct exynos5_subcmu_info *cmu); + const struct exynos5_subcmu_info **cmu); #endif diff --git a/drivers/clk/samsung/clk-exynos5250.c b/drivers/clk/samsung/clk-exynos5250.c index f2b896881768..931c70a4da19 100644 --- a/drivers/clk/samsung/clk-exynos5250.c +++ b/drivers/clk/samsung/clk-exynos5250.c @@ -681,6 +681,10 @@ static const struct exynos5_subcmu_info exynos5250_disp_subcmu = { .pd_name = "DISP1", }; +static const struct exynos5_subcmu_info *exynos5250_subcmus[] = { + &exynos5250_disp_subcmu, +}; + static const struct samsung_pll_rate_table vpll_24mhz_tbl[] __initconst = { /* sorted in descending order */ /* PLL_36XX_RATE(rate, m, p, s, k) */ @@ -843,7 +847,8 @@ static void __init exynos5250_clk_init(struct device_node *np) samsung_clk_sleep_init(reg_base, exynos5250_clk_regs, ARRAY_SIZE(exynos5250_clk_regs)); - exynos5_subcmus_init(ctx, 1, &exynos5250_disp_subcmu); + exynos5_subcmus_init(ctx, ARRAY_SIZE(exynos5250_subcmus), + exynos5250_subcmus); samsung_clk_of_add_provider(np, ctx); diff --git a/drivers/clk/samsung/clk-exynos5420.c b/drivers/clk/samsung/clk-exynos5420.c index 01bca5a498b2..fdb17c799aa5 100644 --- a/drivers/clk/samsung/clk-exynos5420.c +++ b/drivers/clk/samsung/clk-exynos5420.c @@ -1281,32 +1281,40 @@ static struct exynos5_subcmu_reg_dump exynos5x_mfc_suspend_regs[] = { { DIV4_RATIO, 0, 0x3 }, /* DIV dout_mfc_blk */ }; -static const struct exynos5_subcmu_info exynos5x_subcmus[] = { - { - .div_clks = exynos5x_disp_div_clks, - .nr_div_clks = ARRAY_SIZE(exynos5x_disp_div_clks), - .gate_clks = exynos5x_disp_gate_clks, - .nr_gate_clks = ARRAY_SIZE(exynos5x_disp_gate_clks), - .suspend_regs = exynos5x_disp_suspend_regs, - .nr_suspend_regs = ARRAY_SIZE(exynos5x_disp_suspend_regs), - .pd_name = "DISP", - }, { - .div_clks = exynos5x_gsc_div_clks, - .nr_div_clks = ARRAY_SIZE(exynos5x_gsc_div_clks), - .gate_clks = exynos5x_gsc_gate_clks, - .nr_gate_clks = ARRAY_SIZE(exynos5x_gsc_gate_clks), - .suspend_regs = exynos5x_gsc_suspend_regs, - .nr_suspend_regs = ARRAY_SIZE(exynos5x_gsc_suspend_regs), - .pd_name = "GSC", - }, { - .div_clks = exynos5x_mfc_div_clks, - .nr_div_clks = ARRAY_SIZE(exynos5x_mfc_div_clks), - .gate_clks = exynos5x_mfc_gate_clks, - .nr_gate_clks = ARRAY_SIZE(exynos5x_mfc_gate_clks), - .suspend_regs = exynos5x_mfc_suspend_regs, - .nr_suspend_regs = ARRAY_SIZE(exynos5x_mfc_suspend_regs), - .pd_name = "MFC", - }, +static const struct exynos5_subcmu_info exynos5x_disp_subcmu = { + .div_clks = exynos5x_disp_div_clks, + .nr_div_clks = ARRAY_SIZE(exynos5x_disp_div_clks), + .gate_clks = exynos5x_disp_gate_clks, + .nr_gate_clks = ARRAY_SIZE(exynos5x_disp_gate_clks), + .suspend_regs = exynos5x_disp_suspend_regs, + .nr_suspend_regs = ARRAY_SIZE(exynos5x_disp_suspend_regs), + .pd_name = "DISP", +}; + +static const struct exynos5_subcmu_info exynos5x_gsc_subcmu = { + .div_clks = exynos5x_gsc_div_clks, + .nr_div_clks = ARRAY_SIZE(exynos5x_gsc_div_clks), + .gate_clks = exynos5x_gsc_gate_clks, + .nr_gate_clks = ARRAY_SIZE(exynos5x_gsc_gate_clks), + .suspend_regs = exynos5x_gsc_suspend_regs, + .nr_suspend_regs = ARRAY_SIZE(exynos5x_gsc_suspend_regs), + .pd_name = "GSC", +}; + +static const struct exynos5_subcmu_info exynos5x_mfc_subcmu = { + .div_clks = exynos5x_mfc_div_clks, + .nr_div_clks = ARRAY_SIZE(exynos5x_mfc_div_clks), + .gate_clks = exynos5x_mfc_gate_clks, + .nr_gate_clks = ARRAY_SIZE(exynos5x_mfc_gate_clks), + .suspend_regs = exynos5x_mfc_suspend_regs, + .nr_suspend_regs = ARRAY_SIZE(exynos5x_mfc_suspend_regs), + .pd_name = "MFC", +}; + +static const struct exynos5_subcmu_info *exynos5x_subcmus[] = { + &exynos5x_disp_subcmu, + &exynos5x_gsc_subcmu, + &exynos5x_mfc_subcmu, }; static const struct samsung_pll_rate_table exynos5420_pll2550x_24mhz_tbl[] __initconst = { From b6adeb6bc61c2567b9efd815d61a61b34a2e51a6 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Thu, 8 Aug 2019 16:49:29 +0200 Subject: [PATCH 010/266] clk: samsung: exynos5800: Move MAU subsystem clocks to MAU sub-CMU This patch fixes broken sound on Exynos5422/5800 platforms after system/suspend resume cycle in cases where the audio root clock is derived from MAU_EPLL_CLK. In order to preserve state of the USER_MUX_MAU_EPLL_CLK clock mux during system suspend/resume cycle for Exynos5800 we group the MAU block input clocks in "MAU" sub-CMU and add the clock mux control bit to .suspend_regs. This ensures that user configuration of the mux is not lost after the PMU block changes the mux setting to OSC_DIV when switching off the MAU power domain. Adding the SRC_TOP9 register to exynos5800_clk_regs[] array is not sufficient as at the time of the syscore_ops suspend call MAU power domain is already turned off and we already save and subsequently restore an incorrect register's value. Fixes: b06a532bf1fa ("clk: samsung: Add Exynos5 sub-CMU clock driver") Reported-by: Jaafar Ali Suggested-by: Marek Szyprowski Tested-by: Jaafar Ali Signed-off-by: Sylwester Nawrocki Link: https://lkml.kernel.org/r/20190808144929.18685-2-s.nawrocki@samsung.com Signed-off-by: Stephen Boyd --- drivers/clk/samsung/clk-exynos5420.c | 54 ++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/drivers/clk/samsung/clk-exynos5420.c b/drivers/clk/samsung/clk-exynos5420.c index fdb17c799aa5..2d18e1ae25d7 100644 --- a/drivers/clk/samsung/clk-exynos5420.c +++ b/drivers/clk/samsung/clk-exynos5420.c @@ -534,8 +534,6 @@ static const struct samsung_gate_clock exynos5800_gate_clks[] __initconst = { GATE_BUS_TOP, 24, 0, 0), GATE(CLK_ACLK432_SCALER, "aclk432_scaler", "mout_user_aclk432_scaler", GATE_BUS_TOP, 27, CLK_IS_CRITICAL, 0), - GATE(CLK_MAU_EPLL, "mau_epll", "mout_user_mau_epll", - SRC_MASK_TOP7, 20, CLK_SET_RATE_PARENT, 0), }; static const struct samsung_mux_clock exynos5420_mux_clks[] __initconst = { @@ -577,8 +575,13 @@ static const struct samsung_div_clock exynos5420_div_clks[] __initconst = { static const struct samsung_gate_clock exynos5420_gate_clks[] __initconst = { GATE(CLK_SECKEY, "seckey", "aclk66_psgen", GATE_BUS_PERIS1, 1, 0, 0), + /* Maudio Block */ GATE(CLK_MAU_EPLL, "mau_epll", "mout_mau_epll_clk", SRC_MASK_TOP7, 20, CLK_SET_RATE_PARENT, 0), + GATE(CLK_SCLK_MAUDIO0, "sclk_maudio0", "dout_maudio0", + GATE_TOP_SCLK_MAU, 0, CLK_SET_RATE_PARENT, 0), + GATE(CLK_SCLK_MAUPCM0, "sclk_maupcm0", "dout_maupcm0", + GATE_TOP_SCLK_MAU, 1, CLK_SET_RATE_PARENT, 0), }; static const struct samsung_mux_clock exynos5x_mux_clks[] __initconst = { @@ -1017,12 +1020,6 @@ static const struct samsung_gate_clock exynos5x_gate_clks[] __initconst = { GATE(CLK_SCLK_DP1, "sclk_dp1", "dout_dp1", GATE_TOP_SCLK_DISP1, 20, CLK_SET_RATE_PARENT, 0), - /* Maudio Block */ - GATE(CLK_SCLK_MAUDIO0, "sclk_maudio0", "dout_maudio0", - GATE_TOP_SCLK_MAU, 0, CLK_SET_RATE_PARENT, 0), - GATE(CLK_SCLK_MAUPCM0, "sclk_maupcm0", "dout_maupcm0", - GATE_TOP_SCLK_MAU, 1, CLK_SET_RATE_PARENT, 0), - /* FSYS Block */ GATE(CLK_TSI, "tsi", "aclk200_fsys", GATE_BUS_FSYS0, 0, 0, 0), GATE(CLK_PDMA0, "pdma0", "aclk200_fsys", GATE_BUS_FSYS0, 1, 0, 0), @@ -1281,6 +1278,20 @@ static struct exynos5_subcmu_reg_dump exynos5x_mfc_suspend_regs[] = { { DIV4_RATIO, 0, 0x3 }, /* DIV dout_mfc_blk */ }; + +static const struct samsung_gate_clock exynos5800_mau_gate_clks[] __initconst = { + GATE(CLK_MAU_EPLL, "mau_epll", "mout_user_mau_epll", + SRC_MASK_TOP7, 20, CLK_SET_RATE_PARENT, 0), + GATE(CLK_SCLK_MAUDIO0, "sclk_maudio0", "dout_maudio0", + GATE_TOP_SCLK_MAU, 0, CLK_SET_RATE_PARENT, 0), + GATE(CLK_SCLK_MAUPCM0, "sclk_maupcm0", "dout_maupcm0", + GATE_TOP_SCLK_MAU, 1, CLK_SET_RATE_PARENT, 0), +}; + +static struct exynos5_subcmu_reg_dump exynos5800_mau_suspend_regs[] = { + { SRC_TOP9, 0, BIT(8) }, /* MUX mout_user_mau_epll */ +}; + static const struct exynos5_subcmu_info exynos5x_disp_subcmu = { .div_clks = exynos5x_disp_div_clks, .nr_div_clks = ARRAY_SIZE(exynos5x_disp_div_clks), @@ -1311,12 +1322,27 @@ static const struct exynos5_subcmu_info exynos5x_mfc_subcmu = { .pd_name = "MFC", }; +static const struct exynos5_subcmu_info exynos5800_mau_subcmu = { + .gate_clks = exynos5800_mau_gate_clks, + .nr_gate_clks = ARRAY_SIZE(exynos5800_mau_gate_clks), + .suspend_regs = exynos5800_mau_suspend_regs, + .nr_suspend_regs = ARRAY_SIZE(exynos5800_mau_suspend_regs), + .pd_name = "MAU", +}; + static const struct exynos5_subcmu_info *exynos5x_subcmus[] = { &exynos5x_disp_subcmu, &exynos5x_gsc_subcmu, &exynos5x_mfc_subcmu, }; +static const struct exynos5_subcmu_info *exynos5800_subcmus[] = { + &exynos5x_disp_subcmu, + &exynos5x_gsc_subcmu, + &exynos5x_mfc_subcmu, + &exynos5800_mau_subcmu, +}; + static const struct samsung_pll_rate_table exynos5420_pll2550x_24mhz_tbl[] __initconst = { PLL_35XX_RATE(24 * MHZ, 2000000000, 250, 3, 0), PLL_35XX_RATE(24 * MHZ, 1900000000, 475, 6, 0), @@ -1547,11 +1573,17 @@ static void __init exynos5x_clk_init(struct device_node *np, samsung_clk_extended_sleep_init(reg_base, exynos5x_clk_regs, ARRAY_SIZE(exynos5x_clk_regs), exynos5420_set_clksrc, ARRAY_SIZE(exynos5420_set_clksrc)); - if (soc == EXYNOS5800) + + if (soc == EXYNOS5800) { samsung_clk_sleep_init(reg_base, exynos5800_clk_regs, ARRAY_SIZE(exynos5800_clk_regs)); - exynos5_subcmus_init(ctx, ARRAY_SIZE(exynos5x_subcmus), - exynos5x_subcmus); + + exynos5_subcmus_init(ctx, ARRAY_SIZE(exynos5800_subcmus), + exynos5800_subcmus); + } else { + exynos5_subcmus_init(ctx, ARRAY_SIZE(exynos5x_subcmus), + exynos5x_subcmus); + } samsung_clk_of_add_provider(np, ctx); } From baf7b79e1ad79a41fafd8ab8597b9a96962d822d Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 8 Aug 2019 14:18:39 +0200 Subject: [PATCH 011/266] clk: samsung: exynos542x: Move MSCL subsystem clocks to its sub-CMU M2M scaler clocks require special handling of their parent bus clock during power domain on/off sequences. MSCL clocks were not initially added to the sub-CMU handler, because that time there was no driver for the M2M scaler device and it was not possible to test it. This patch fixes this issue. Parent clock for M2M scaler devices is now properly preserved during MSC power domain on/off sequence. This gives M2M scaler devices proper performance: fullHD XRGB32 image 1000 rotations test takes 3.17s instead of 45.08s. Fixes: b06a532bf1fa ("clk: samsung: Add Exynos5 sub-CMU clock driver") Signed-off-by: Marek Szyprowski Link: https://lkml.kernel.org/r/20190808121839.23892-1-m.szyprowski@samsung.com Acked-by: Sylwester Nawrocki Signed-off-by: Stephen Boyd --- drivers/clk/samsung/clk-exynos5420.c | 48 ++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/drivers/clk/samsung/clk-exynos5420.c b/drivers/clk/samsung/clk-exynos5420.c index 2d18e1ae25d7..7670cc596c74 100644 --- a/drivers/clk/samsung/clk-exynos5420.c +++ b/drivers/clk/samsung/clk-exynos5420.c @@ -893,9 +893,6 @@ static const struct samsung_div_clock exynos5x_div_clks[] __initconst = { /* GSCL Block */ DIV(0, "dout_gscl_blk_333", "aclk333_432_gscl", DIV2_RATIO0, 6, 2), - /* MSCL Block */ - DIV(0, "dout_mscl_blk", "aclk400_mscl", DIV2_RATIO0, 28, 2), - /* PSGEN */ DIV(0, "dout_gen_blk", "mout_user_aclk266", DIV2_RATIO0, 8, 1), DIV(0, "dout_jpg_blk", "aclk166", DIV2_RATIO0, 20, 1), @@ -1159,17 +1156,6 @@ static const struct samsung_gate_clock exynos5x_gate_clks[] __initconst = { GATE(CLK_FIMC_LITE3, "fimc_lite3", "aclk333_432_gscl", GATE_IP_GSCL1, 17, 0, 0), - /* MSCL Block */ - GATE(CLK_MSCL0, "mscl0", "aclk400_mscl", GATE_IP_MSCL, 0, 0, 0), - GATE(CLK_MSCL1, "mscl1", "aclk400_mscl", GATE_IP_MSCL, 1, 0, 0), - GATE(CLK_MSCL2, "mscl2", "aclk400_mscl", GATE_IP_MSCL, 2, 0, 0), - GATE(CLK_SMMU_MSCL0, "smmu_mscl0", "dout_mscl_blk", - GATE_IP_MSCL, 8, 0, 0), - GATE(CLK_SMMU_MSCL1, "smmu_mscl1", "dout_mscl_blk", - GATE_IP_MSCL, 9, 0, 0), - GATE(CLK_SMMU_MSCL2, "smmu_mscl2", "dout_mscl_blk", - GATE_IP_MSCL, 10, 0, 0), - /* ISP */ GATE(CLK_SCLK_UART_ISP, "sclk_uart_isp", "dout_uart_isp", GATE_TOP_SCLK_ISP, 0, CLK_SET_RATE_PARENT, 0), @@ -1278,6 +1264,28 @@ static struct exynos5_subcmu_reg_dump exynos5x_mfc_suspend_regs[] = { { DIV4_RATIO, 0, 0x3 }, /* DIV dout_mfc_blk */ }; +static const struct samsung_gate_clock exynos5x_mscl_gate_clks[] __initconst = { + /* MSCL Block */ + GATE(CLK_MSCL0, "mscl0", "aclk400_mscl", GATE_IP_MSCL, 0, 0, 0), + GATE(CLK_MSCL1, "mscl1", "aclk400_mscl", GATE_IP_MSCL, 1, 0, 0), + GATE(CLK_MSCL2, "mscl2", "aclk400_mscl", GATE_IP_MSCL, 2, 0, 0), + GATE(CLK_SMMU_MSCL0, "smmu_mscl0", "dout_mscl_blk", + GATE_IP_MSCL, 8, 0, 0), + GATE(CLK_SMMU_MSCL1, "smmu_mscl1", "dout_mscl_blk", + GATE_IP_MSCL, 9, 0, 0), + GATE(CLK_SMMU_MSCL2, "smmu_mscl2", "dout_mscl_blk", + GATE_IP_MSCL, 10, 0, 0), +}; + +static const struct samsung_div_clock exynos5x_mscl_div_clks[] __initconst = { + DIV(0, "dout_mscl_blk", "aclk400_mscl", DIV2_RATIO0, 28, 2), +}; + +static struct exynos5_subcmu_reg_dump exynos5x_mscl_suspend_regs[] = { + { GATE_IP_MSCL, 0xffffffff, 0xffffffff }, /* MSCL gates */ + { SRC_TOP3, 0, BIT(4) }, /* MUX mout_user_aclk400_mscl */ + { DIV2_RATIO0, 0, 0x30000000 }, /* DIV dout_mscl_blk */ +}; static const struct samsung_gate_clock exynos5800_mau_gate_clks[] __initconst = { GATE(CLK_MAU_EPLL, "mau_epll", "mout_user_mau_epll", @@ -1322,6 +1330,16 @@ static const struct exynos5_subcmu_info exynos5x_mfc_subcmu = { .pd_name = "MFC", }; +static const struct exynos5_subcmu_info exynos5x_mscl_subcmu = { + .div_clks = exynos5x_mscl_div_clks, + .nr_div_clks = ARRAY_SIZE(exynos5x_mscl_div_clks), + .gate_clks = exynos5x_mscl_gate_clks, + .nr_gate_clks = ARRAY_SIZE(exynos5x_mscl_gate_clks), + .suspend_regs = exynos5x_mscl_suspend_regs, + .nr_suspend_regs = ARRAY_SIZE(exynos5x_mscl_suspend_regs), + .pd_name = "MSC", +}; + static const struct exynos5_subcmu_info exynos5800_mau_subcmu = { .gate_clks = exynos5800_mau_gate_clks, .nr_gate_clks = ARRAY_SIZE(exynos5800_mau_gate_clks), @@ -1334,12 +1352,14 @@ static const struct exynos5_subcmu_info *exynos5x_subcmus[] = { &exynos5x_disp_subcmu, &exynos5x_gsc_subcmu, &exynos5x_mfc_subcmu, + &exynos5x_mscl_subcmu, }; static const struct exynos5_subcmu_info *exynos5800_subcmus[] = { &exynos5x_disp_subcmu, &exynos5x_gsc_subcmu, &exynos5x_mfc_subcmu, + &exynos5x_mscl_subcmu, &exynos5800_mau_subcmu, }; From 1109635b292c82e7a2aa15e38edb7c389e34b693 Mon Sep 17 00:00:00 2001 From: "Lowry Li (Arm Technology China)" Date: Fri, 2 Aug 2019 08:07:52 +0000 Subject: [PATCH 012/266] drm/komeda: Initialize and enable output polling on Komeda Initialize and enable output polling on Komeda. Changes since v1: 1. Enable the polling before registering the driver; 2. Disable the polling after unregistering the driver. Changes since v2: 1. If driver register failed, disable the polling. Signed-off-by: Lowry Li (Arm Technology China) Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/1564733249-24329-1-git-send-email-lowry.li@arm.com --- drivers/gpu/drm/arm/display/komeda/komeda_kms.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_kms.c b/drivers/gpu/drm/arm/display/komeda/komeda_kms.c index 419a8b0e5de8..d50e75f0b2bd 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_kms.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_kms.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "komeda_dev.h" #include "komeda_framebuffer.h" @@ -315,6 +316,8 @@ struct komeda_kms_dev *komeda_kms_attach(struct komeda_dev *mdev) drm->irq_enabled = true; + drm_kms_helper_poll_init(drm); + err = drm_dev_register(drm, 0); if (err) goto cleanup_mode_config; @@ -322,6 +325,7 @@ struct komeda_kms_dev *komeda_kms_attach(struct komeda_dev *mdev) return kms; cleanup_mode_config: + drm_kms_helper_poll_fini(drm); drm->irq_enabled = false; drm_mode_config_cleanup(drm); komeda_kms_cleanup_private_objs(kms); @@ -338,6 +342,7 @@ void komeda_kms_detach(struct komeda_kms_dev *kms) drm->irq_enabled = false; mdev->funcs->disable_irq(mdev); drm_dev_unregister(drm); + drm_kms_helper_poll_fini(drm); component_unbind_all(mdev->dev, drm); komeda_kms_cleanup_private_objs(kms); drm_mode_config_cleanup(drm); From a9577f1921255b975da2a47d8f6733b2d6c98193 Mon Sep 17 00:00:00 2001 From: "Lowry Li (Arm Technology China)" Date: Tue, 6 Aug 2019 06:31:56 +0000 Subject: [PATCH 013/266] drm/komeda: Adds internal bpp computing for arm afbc only format YU08 YU10 The drm_format_info doesn't have any cpp or block_size (both are zero) information for arm only afbc format YU08/YU10. we need to compute it by ourselves. Changes since v1: 1. Removed redundant warning check in komeda_get_afbc_format_bpp(); 2. Removed a redundant empty line; 3. Rebased the branch. Signed-off-by: Lowry Li (Arm Technology China) Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/1565073104-24047-1-git-send-email-lowry.li@arm.com Link: https://patchwork.freedesktop.org/patch/msgid/1565073104-24047-1-git-send-email-lowry.li@arm.com --- .../arm/display/komeda/komeda_format_caps.c | 19 +++++++++++++++++++ .../arm/display/komeda/komeda_format_caps.h | 3 +++ .../arm/display/komeda/komeda_framebuffer.c | 5 +++-- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_format_caps.c b/drivers/gpu/drm/arm/display/komeda/komeda_format_caps.c index cd4d9f53ddef..c9a1edb9a000 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_format_caps.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_format_caps.c @@ -35,6 +35,25 @@ komeda_get_format_caps(struct komeda_format_caps_table *table, return NULL; } +u32 komeda_get_afbc_format_bpp(const struct drm_format_info *info, u64 modifier) +{ + u32 bpp; + + switch (info->format) { + case DRM_FORMAT_YUV420_8BIT: + bpp = 12; + break; + case DRM_FORMAT_YUV420_10BIT: + bpp = 15; + break; + default: + bpp = info->cpp[0] * 8; + break; + } + + return bpp; +} + /* Two assumptions * 1. RGB always has YTR * 2. Tiled RGB always has SC diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_format_caps.h b/drivers/gpu/drm/arm/display/komeda/komeda_format_caps.h index 3631910d33b5..32273cf18f7c 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_format_caps.h +++ b/drivers/gpu/drm/arm/display/komeda/komeda_format_caps.h @@ -97,6 +97,9 @@ const struct komeda_format_caps * komeda_get_format_caps(struct komeda_format_caps_table *table, u32 fourcc, u64 modifier); +u32 komeda_get_afbc_format_bpp(const struct drm_format_info *info, + u64 modifier); + u32 *komeda_get_layer_fourcc_list(struct komeda_format_caps_table *table, u32 layer_type, u32 *n_fmts); diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_framebuffer.c b/drivers/gpu/drm/arm/display/komeda/komeda_framebuffer.c index 3b0a70ed6aa0..1b01a625f40e 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_framebuffer.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_framebuffer.c @@ -43,7 +43,7 @@ komeda_fb_afbc_size_check(struct komeda_fb *kfb, struct drm_file *file, struct drm_framebuffer *fb = &kfb->base; const struct drm_format_info *info = fb->format; struct drm_gem_object *obj; - u32 alignment_w = 0, alignment_h = 0, alignment_header, n_blocks; + u32 alignment_w = 0, alignment_h = 0, alignment_header, n_blocks, bpp; u64 min_size; obj = drm_gem_object_lookup(file, mode_cmd->handles[0]); @@ -88,8 +88,9 @@ komeda_fb_afbc_size_check(struct komeda_fb *kfb, struct drm_file *file, kfb->offset_payload = ALIGN(n_blocks * AFBC_HEADER_SIZE, alignment_header); + bpp = komeda_get_afbc_format_bpp(info, fb->modifier); kfb->afbc_size = kfb->offset_payload + n_blocks * - ALIGN(info->cpp[0] * AFBC_SUPERBLK_PIXELS, + ALIGN(bpp * AFBC_SUPERBLK_PIXELS / 8, AFBC_SUPERBLK_ALIGNMENT); min_size = kfb->afbc_size + fb->offsets[0]; if (min_size > obj->size) { From 8f1c748b9a7751ee1297b4880788a09f7c802eb4 Mon Sep 17 00:00:00 2001 From: Mihail Atanassov Date: Mon, 5 Aug 2019 09:56:25 +0000 Subject: [PATCH 014/266] drm/komeda: Add support for 'memory-region' DT node property The 'memory-region' property of the komeda display driver DT binding allows the use of a 'reserved-memory' node for buffer allocations. Add the requisite of_reserved_mem_device_{init,release} calls to actually make use of the memory if present. Changes since v1: - Move handling inside komeda_parse_dt Signed-off-by: Mihail Atanassov Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/20190805095408.21285-1-mihail.atanassov@arm.com --- drivers/gpu/drm/arm/display/komeda/komeda_dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_dev.c b/drivers/gpu/drm/arm/display/komeda/komeda_dev.c index 5a118984de33..a0eabc134dd6 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_dev.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_dev.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_DEBUG_FS @@ -143,6 +144,12 @@ static int komeda_parse_dt(struct device *dev, struct komeda_dev *mdev) return mdev->irq; } + /* Get the optional framebuffer memory resource */ + ret = of_reserved_mem_device_init(dev); + if (ret && ret != -ENODEV) + return ret; + ret = 0; + for_each_available_child_of_node(np, child) { if (of_node_cmp(child->name, "pipeline") == 0) { ret = komeda_parse_pipe_dt(mdev, child); @@ -289,6 +296,8 @@ void komeda_dev_destroy(struct komeda_dev *mdev) mdev->n_pipelines = 0; + of_reserved_mem_device_release(dev); + if (funcs && funcs->cleanup) funcs->cleanup(mdev); From 63daf4e166545363f3b875f5b81aecb46e1e1d19 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Fri, 9 Aug 2019 13:00:38 +0300 Subject: [PATCH 015/266] drm/omap: ensure we have a valid dma_mask The omapdrm driver uses dma_set_coherent_mask(), but that's not enough anymore when LPAE is enabled. From Christoph Hellwig : > The traditional arm DMA code ignores, but the generic dma-direct/swiotlb > has stricter checks and thus fails mappings without a DMA mask. As we > use swiotlb for arm with LPAE now, omapdrm needs to catch up and > actually set a DMA mask. Change the dma_set_coherent_mask() call to dma_coerce_mask_and_coherent() so that the dev->dma_mask is also set. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Reported-by: "H. Nikolaus Schaller" Signed-off-by: Tomi Valkeinen Link: https://patchwork.freedesktop.org/patch/msgid/c219e7e6-0f66-d6fd-e0cf-59c803386825@ti.com Reviewed-by: Christoph Hellwig Reviewed-by: Peter Ujfalusi --- drivers/gpu/drm/omapdrm/omap_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/omapdrm/omap_drv.c b/drivers/gpu/drm/omapdrm/omap_drv.c index 288c59dae56a..1bad0a2cc5c6 100644 --- a/drivers/gpu/drm/omapdrm/omap_drv.c +++ b/drivers/gpu/drm/omapdrm/omap_drv.c @@ -669,7 +669,7 @@ static int pdev_probe(struct platform_device *pdev) if (omapdss_is_initialized() == false) return -EPROBE_DEFER; - ret = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); + ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (ret) { dev_err(&pdev->dev, "Failed to set the DMA mask\n"); return ret; From bb2d267c448f4bc3a3389d97c56391cb779178ae Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 12 Aug 2019 17:03:32 +0200 Subject: [PATCH 016/266] s390/bpf: fix lcgr instruction encoding "masking, test in bounds 3" fails on s390, because BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0) ignores the top 32 bits of BPF_REG_2. The reason is that JIT emits lcgfr instead of lcgr. The associated comment indicates that the code was intended to emit lcgr in the first place, it's just that the wrong opcode was used. Fix by using the correct opcode. Fixes: 054623105728 ("s390/bpf: Add s390x eBPF JIT compiler backend") Signed-off-by: Ilya Leoshkevich Acked-by: Vasily Gorbik Signed-off-by: Daniel Borkmann --- arch/s390/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e636728ab452..6299156f9738 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -863,7 +863,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i break; case BPF_ALU64 | BPF_NEG: /* dst = -dst */ /* lcgr %dst,%dst */ - EMIT4(0xb9130000, dst_reg, dst_reg); + EMIT4(0xb9030000, dst_reg, dst_reg); break; /* * BPF_FROM_BE/LE From 7cdf6e40537f4f287c8e21b99cb4cd082a33bef0 Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Mon, 12 Aug 2019 11:55:52 -0700 Subject: [PATCH 017/266] HID: wacom: add back changes dropped in merge commit Merge commit 74acee309fb2 ("Merge branches 'for-5.2/fixes', 'for-5.3/doc', 'for-5.3/ish', 'for-5.3/logitech' and 'for-5.3/wacom' into for-linus") inadvertently dropped this change from commit 912c6aa67ad4 ("HID: wacom: Add 2nd gen Intuos Pro Small support"). Signed-off-by: Aaron Armstrong Skomra Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 7a8ddc999a8e..50074485b88b 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1290,7 +1290,8 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) } if (wacom->tool[0]) { input_report_abs(pen_input, ABS_PRESSURE, get_unaligned_le16(&frame[5])); - if (wacom->features.type == INTUOSP2_BT) { + if (wacom->features.type == INTUOSP2_BT || + wacom->features.type == INTUOSP2S_BT) { input_report_abs(pen_input, ABS_DISTANCE, range ? frame[13] : wacom->features.distance_max); } else { From 91b4db5313a2c793aabc2143efb8ed0cf0fdd097 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 12 Aug 2019 18:18:07 +0200 Subject: [PATCH 018/266] s390/bpf: use 32-bit index for tail calls "p runtime/jit: pass > 32bit index to tail_call" fails when bpf_jit_enable=1, because the tail call is not executed. This in turn is because the generated code assumes index is 64-bit, while it must be 32-bit, and as a result prog array bounds check fails, while it should pass. Even if bounds check would have passed, the code that follows uses 64-bit index to compute prog array offset. Fix by using clrj instead of clgrj for comparing index with array size, and also by using llgfr for truncating index to 32 bits before using it to compute prog array offset. Fixes: 6651ee070b31 ("s390/bpf: implement bpf_tail_call() helper") Reported-by: Yauheni Kaliuta Acked-by: Vasily Gorbik Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann --- arch/s390/net/bpf_jit_comp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 6299156f9738..955eb355c2fd 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1049,8 +1049,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i /* llgf %w1,map.max_entries(%b2) */ EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2, offsetof(struct bpf_array, map.max_entries)); - /* clgrj %b3,%w1,0xa,label0: if %b3 >= %w1 goto out */ - EMIT6_PCREL_LABEL(0xec000000, 0x0065, BPF_REG_3, + /* clrj %b3,%w1,0xa,label0: if (u32)%b3 >= (u32)%w1 goto out */ + EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0, 0xa); /* @@ -1076,8 +1076,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i * goto out; */ - /* sllg %r1,%b3,3: %r1 = index * 8 */ - EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, BPF_REG_3, REG_0, 3); + /* llgfr %r1,%b3: %r1 = (u32) index */ + EMIT4(0xb9160000, REG_1, BPF_REG_3); + /* sllg %r1,%r1,3: %r1 *= 8 */ + EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, REG_1, REG_0, 3); /* lg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, BPF_REG_2, REG_1, offsetof(struct bpf_array, ptrs)); From addf3382c47c033e579c9c88f18e36c4e75d806a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 13 Aug 2019 15:38:06 +0200 Subject: [PATCH 019/266] Revert "HID: logitech-hidpp: add USB PID for a few more supported mice" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This partially reverts commit 27fc32fd9417968a459d43d9a7c50fd423d53eb9. It turns out that the G502 has some issues with hid-logitech-hidpp: when plugging it in, the driver tries to contact it but it fails. So the driver bails out leaving only the mouse event node available. This timeout is problematic as it introduce a delay in the boot, and having only the mouse event node means that the hardware macros keys can not be relayed to the userspace. Filipe and I just gave a shot at the following devices: G403 Wireless (0xC082) G703 (0xC087) G703 Hero (0xC090) G903 (0xC086) G903 Hero (0xC091) G Pro (0xC088) Reverting the devices we are not sure that works flawlessly. Reviewed-by: Filipe Laíns Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-logitech-hidpp.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index 21268c9fa71a..343052b117a9 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -3749,30 +3749,10 @@ static const struct hid_device_id hidpp_devices[] = { { L27MHZ_DEVICE(HID_ANY_ID) }, - { /* Logitech G203/Prodigy Gaming Mouse */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC084) }, - { /* Logitech G302 Gaming Mouse */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC07F) }, - { /* Logitech G303 Gaming Mouse */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC080) }, - { /* Logitech G400 Gaming Mouse */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC07E) }, { /* Logitech G403 Wireless Gaming Mouse over USB */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC082) }, - { /* Logitech G403 Gaming Mouse */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC083) }, - { /* Logitech G403 Hero Gaming Mouse over USB */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC08F) }, - { /* Logitech G502 Proteus Core Gaming Mouse */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC07D) }, - { /* Logitech G502 Proteus Spectrum Gaming Mouse over USB */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC332) }, - { /* Logitech G502 Hero Gaming Mouse over USB */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC08B) }, { /* Logitech G700 Gaming Mouse over USB */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC06B) }, - { /* Logitech G700s Gaming Mouse over USB */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC07C) }, { /* Logitech G703 Gaming Mouse over USB */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC087) }, { /* Logitech G703 Hero Gaming Mouse over USB */ From a3384b8d9f63cc042711293bb97bdc92dca0391d Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 13 Aug 2019 15:38:07 +0200 Subject: [PATCH 020/266] HID: logitech-hidpp: remove support for the G700 over USB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The G700 suffers from the same issue than the G502: when plugging it in, the driver tries to contact it but it fails. This timeout is problematic as it introduce a delay in the boot, and having only the mouse event node means that the hardware macros keys can not be relayed to the userspace. Link: https://github.com/libratbag/libratbag/issues/797 Fixes: 91cf9a98ae41 ("HID: logitech-hidpp: make .probe usbhid capable") Cc: stable@vger.kernel.org # v5.2 Reviewed-by: Filipe Laíns Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-logitech-hidpp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index 343052b117a9..0179f7ed77e5 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -3751,8 +3751,6 @@ static const struct hid_device_id hidpp_devices[] = { { /* Logitech G403 Wireless Gaming Mouse over USB */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC082) }, - { /* Logitech G700 Gaming Mouse over USB */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC06B) }, { /* Logitech G703 Gaming Mouse over USB */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC087) }, { /* Logitech G703 Hero Gaming Mouse over USB */ From 074014abdf2bd2a00da3dd14a6ae04cafc1d62cc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 13 Aug 2019 15:28:18 +0100 Subject: [PATCH 021/266] net: ieee802154: remove redundant assignment to rc Variable rc is initialized to a value that is never read and it is re-assigned later. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Stefan Schmidt --- net/ieee802154/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index bc6b912603f1..1e9876813392 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1102,7 +1102,7 @@ static struct packet_type ieee802154_packet_type = { static int __init af_ieee802154_init(void) { - int rc = -EINVAL; + int rc; rc = proto_register(&ieee802154_raw_prot, 1); if (rc) From 2c60e6b5c9241b24b8b523fefd3e44fb85622cda Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 6 Aug 2019 13:41:51 +0200 Subject: [PATCH 022/266] gpiolib: never report open-drain/source lines as 'input' to user-space If the driver doesn't support open-drain/source config options, we emulate this behavior when setting the direction by calling gpiod_direction_input() if the default value is 0 (open-source) or 1 (open-drain), thus not actively driving the line in those cases. This however clears the FLAG_IS_OUT bit for the GPIO line descriptor and makes the LINEINFO ioctl() incorrectly report this line's mode as 'input' to user-space. This commit modifies the ioctl() to always set the GPIOLINE_FLAG_IS_OUT bit in the lineinfo structure's flags field. Since it's impossible to use the input mode and open-drain/source options at the same time, we can be sure the reported information will be correct. Fixes: 521a2ad6f862 ("gpio: add userspace ABI for GPIO line information") Cc: stable Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20190806114151.17652-1-brgl@bgdev.pl Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index f497003f119c..80a2a2cb673b 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1091,9 +1091,11 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) lineinfo.flags |= GPIOLINE_FLAG_ACTIVE_LOW; if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) - lineinfo.flags |= GPIOLINE_FLAG_OPEN_DRAIN; + lineinfo.flags |= (GPIOLINE_FLAG_OPEN_DRAIN | + GPIOLINE_FLAG_IS_OUT); if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) - lineinfo.flags |= GPIOLINE_FLAG_OPEN_SOURCE; + lineinfo.flags |= (GPIOLINE_FLAG_OPEN_SOURCE | + GPIOLINE_FLAG_IS_OUT); if (copy_to_user(ip, &lineinfo, sizeof(lineinfo))) return -EFAULT; From 68e03b85474a51ec1921b4d13204782594ef7223 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 31 Jul 2019 20:38:14 +0800 Subject: [PATCH 023/266] gpio: Fix build error of function redefinition when do randbuilding, I got this error: In file included from drivers/hwmon/pmbus/ucd9000.c:19:0: ./include/linux/gpio/driver.h:576:1: error: redefinition of gpiochip_add_pin_range gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, ^~~~~~~~~~~~~~~~~~~~~~ In file included from drivers/hwmon/pmbus/ucd9000.c:18:0: ./include/linux/gpio.h:245:1: note: previous definition of gpiochip_add_pin_range was here gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, ^~~~~~~~~~~~~~~~~~~~~~ Reported-by: Hulk Robot Fixes: 964cb341882f ("gpio: move pincontrol calls to ") Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20190731123814.46624-1-yuehaibing@huawei.com Signed-off-by: Linus Walleij --- include/linux/gpio.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 40915b461f18..f757a58191a6 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -241,30 +241,6 @@ static inline int irq_to_gpio(unsigned irq) return -EINVAL; } -static inline int -gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins) -{ - WARN_ON(1); - return -EINVAL; -} - -static inline int -gpiochip_add_pingroup_range(struct gpio_chip *chip, - struct pinctrl_dev *pctldev, - unsigned int gpio_offset, const char *pin_group) -{ - WARN_ON(1); - return -EINVAL; -} - -static inline void -gpiochip_remove_pin_ranges(struct gpio_chip *chip) -{ - WARN_ON(1); -} - static inline int devm_gpio_request(struct device *dev, unsigned gpio, const char *label) { From b14c876b994f208b6b95c222056e1deb0a45de0e Mon Sep 17 00:00:00 2001 From: Radim Krcmar Date: Tue, 13 Aug 2019 23:37:37 -0400 Subject: [PATCH 024/266] kvm: x86: skip populating logical dest map if apic is not sw enabled recalculate_apic_map does not santize ldr and it's possible that multiple bits are set. In that case, a previous valid entry can potentially be overwritten by an invalid one. This condition is hit when booting a 32 bit, >8 CPU, RHEL6 guest and then triggering a crash to boot a kdump kernel. This is the sequence of events: 1. Linux boots in bigsmp mode and enables PhysFlat, however, it still writes to the LDR which probably will never be used. 2. However, when booting into kdump, the stale LDR values remain as they are not cleared by the guest and there isn't a apic reset. 3. kdump boots with 1 cpu, and uses Logical Destination Mode but the logical map has been overwritten and points to an inactive vcpu. Signed-off-by: Radim Krcmar Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 685d17c11461..e904ff06a83d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -216,6 +216,9 @@ static void recalculate_apic_map(struct kvm *kvm) if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) new->phys_map[xapic_id] = apic; + if (!kvm_apic_sw_enabled(apic)) + continue; + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) { @@ -258,6 +261,8 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) static_key_slow_dec_deferred(&apic_sw_disabled); else static_key_slow_inc(&apic_sw_disabled.key); + + recalculate_apic_map(apic->vcpu->kvm); } } From 74260dc278a725b692b1a201c6b780a02804d3e4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 9 Aug 2019 09:18:43 +0200 Subject: [PATCH 025/266] MAINTAINERS: change list for KVM/s390 KVM/s390 does not have a list of its own, and linux-s390 is in the loop anyway thanks to the generic arch/s390 match. So use the generic KVM list for s390 patches. Signed-off-by: Paolo Bonzini --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5e1f9ee8f86f..05c107b168c0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8869,7 +8869,7 @@ M: Christian Borntraeger M: Janosch Frank R: David Hildenbrand R: Cornelia Huck -L: linux-s390@vger.kernel.org +L: kvm@vger.kernel.org W: http://www.ibm.com/developerworks/linux/linux390/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git S: Supported From ed4e7b057e9e75cecd56f6c3434f88eaa69c1209 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 9 Aug 2019 09:30:02 +0200 Subject: [PATCH 026/266] MAINTAINERS: add KVM x86 reviewers This is probably overdue---KVM x86 has quite a few contributors that usually review each other's patches, which is really helpful to me. Formalize this by listing them as reviewers. I am including people with various expertise: - Joerg for SVM (with designated reviewers, it makes more sense to have him in the main KVM/x86 stanza) - Sean for MMU and VMX - Jim for VMX - Vitaly for Hyper-V and possibly SVM - Wanpeng for LAPIC and paravirtualization. Please ack if you are okay with this arrangement, otherwise speak up. In other news, Radim is going to leave Red Hat soon. However, he has not been very much involved in upstream KVM development for some time, and in the immediate future he is still going to help maintain kvm/queue while I am on vacation. Since not much is going to change, I will let him decide whether he wants to keep the maintainer role after he leaves. Acked-by: Joerg Roedel Acked-by: Vitaly Kuznetsov Acked-by: Wanpeng Li Cc: Sean Christopherson Cc: Jim Mattson Signed-off-by: Paolo Bonzini --- MAINTAINERS | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 05c107b168c0..166e765acce8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8819,14 +8819,6 @@ F: virt/kvm/* F: tools/kvm/ F: tools/testing/selftests/kvm/ -KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd) -M: Joerg Roedel -L: kvm@vger.kernel.org -W: http://www.linux-kvm.org/ -S: Maintained -F: arch/x86/include/asm/svm.h -F: arch/x86/kvm/svm.c - KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64) M: Marc Zyngier R: James Morse @@ -8884,6 +8876,11 @@ F: tools/testing/selftests/kvm/*/s390x/ KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86) M: Paolo Bonzini M: Radim Krčmář +R: Sean Christopherson +R: Vitaly Kuznetsov +R: Wanpeng Li +R: Jim Mattson +R: Joerg Roedel L: kvm@vger.kernel.org W: http://www.linux-kvm.org T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git @@ -8891,8 +8888,12 @@ S: Supported F: arch/x86/kvm/ F: arch/x86/kvm/*/ F: arch/x86/include/uapi/asm/kvm* +F: arch/x86/include/uapi/asm/vmx.h +F: arch/x86/include/uapi/asm/svm.h F: arch/x86/include/asm/kvm* F: arch/x86/include/asm/pvclock-abi.h +F: arch/x86/include/asm/svm.h +F: arch/x86/include/asm/vmx.h F: arch/x86/kernel/kvm.c F: arch/x86/kernel/kvmclock.c From c8e174b39887ea1992286ff8ffdbcf79f6057cf2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 12 Aug 2019 10:33:00 +0800 Subject: [PATCH 027/266] KVM: x86: svm: remove redundant assignment of var new_entry new_entry is reassigned a new value next line. So it's redundant and remove it. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d685491fce4d..e3d3b2128f2b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1714,7 +1714,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (!entry) return -EINVAL; - new_entry = READ_ONCE(*entry); new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); From c7ec75ea4d5316518adc87224e3cff47192579e7 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Wed, 14 Aug 2019 10:30:14 -0500 Subject: [PATCH 028/266] clk: socfpga: stratix10: fix rate caclulationg for cnt_clks Checking bypass_reg is incorrect for calculating the cnt_clk rates. Instead we should be checking that there is a proper hardware register that holds the clock divider. Cc: stable@vger.kernel.org Signed-off-by: Dinh Nguyen Link: https://lkml.kernel.org/r/20190814153014.12962-1-dinguyen@kernel.org Signed-off-by: Stephen Boyd --- drivers/clk/socfpga/clk-periph-s10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/socfpga/clk-periph-s10.c b/drivers/clk/socfpga/clk-periph-s10.c index 5c50e723ecae..1a191eeeebba 100644 --- a/drivers/clk/socfpga/clk-periph-s10.c +++ b/drivers/clk/socfpga/clk-periph-s10.c @@ -38,7 +38,7 @@ static unsigned long clk_peri_cnt_clk_recalc_rate(struct clk_hw *hwclk, if (socfpgaclk->fixed_div) { div = socfpgaclk->fixed_div; } else { - if (!socfpgaclk->bypass_reg) + if (socfpgaclk->hw.reg) div = ((readl(socfpgaclk->hw.reg) & 0x7ff) + 1); } From 3ee1bb7aae97324ec9078da1f00cb2176919563f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 12 Aug 2019 04:57:27 -0700 Subject: [PATCH 029/266] batman-adv: fix uninit-value in batadv_netlink_get_ifindex() batadv_netlink_get_ifindex() needs to make sure user passed a correct u32 attribute. syzbot reported : BUG: KMSAN: uninit-value in batadv_netlink_dump_hardif+0x70d/0x880 net/batman-adv/netlink.c:968 CPU: 1 PID: 11705 Comm: syz-executor888 Not tainted 5.1.0+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x191/0x1f0 lib/dump_stack.c:113 kmsan_report+0x130/0x2a0 mm/kmsan/kmsan.c:622 __msan_warning+0x75/0xe0 mm/kmsan/kmsan_instr.c:310 batadv_netlink_dump_hardif+0x70d/0x880 net/batman-adv/netlink.c:968 genl_lock_dumpit+0xc6/0x130 net/netlink/genetlink.c:482 netlink_dump+0xa84/0x1ab0 net/netlink/af_netlink.c:2253 __netlink_dump_start+0xa3a/0xb30 net/netlink/af_netlink.c:2361 genl_family_rcv_msg net/netlink/genetlink.c:550 [inline] genl_rcv_msg+0xfc1/0x1a40 net/netlink/genetlink.c:627 netlink_rcv_skb+0x431/0x620 net/netlink/af_netlink.c:2486 genl_rcv+0x63/0x80 net/netlink/genetlink.c:638 netlink_unicast_kernel net/netlink/af_netlink.c:1311 [inline] netlink_unicast+0xf3e/0x1020 net/netlink/af_netlink.c:1337 netlink_sendmsg+0x127e/0x12f0 net/netlink/af_netlink.c:1926 sock_sendmsg_nosec net/socket.c:651 [inline] sock_sendmsg net/socket.c:661 [inline] ___sys_sendmsg+0xcc6/0x1200 net/socket.c:2260 __sys_sendmsg net/socket.c:2298 [inline] __do_sys_sendmsg net/socket.c:2307 [inline] __se_sys_sendmsg+0x305/0x460 net/socket.c:2305 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2305 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x440209 Fixes: b60620cf567b ("batman-adv: netlink: hardif query") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 6f08fd122a8d..7e052d6f759b 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -164,7 +164,7 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) { struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); - return attr ? nla_get_u32(attr) : 0; + return (attr && nla_len(attr) == sizeof(u32)) ? nla_get_u32(attr) : 0; } /** From 27df5c7068bf23cab282dc64b1c9894429b3b8a0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 14 Aug 2019 12:41:09 +0200 Subject: [PATCH 030/266] selftests/bpf: fix "bind{4, 6} deny specific IP & port" on s390 "bind4 allow specific IP & port" and "bind6 deny specific IP & port" fail on s390 because of endianness issue: the 4 IP address bytes are loaded as a word and compared with a constant, but the value of this constant should be different on big- and little- endian machines, which is not the case right now. Use __bpf_constant_ntohl to generate proper value based on machine endianness. Fixes: 1d436885b23b ("selftests/bpf: Selftest for sys_bind post-hooks.") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_sock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index fb679ac3d4b0..0e6652733462 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -13,6 +13,7 @@ #include #include "cgroup_helpers.h" +#include "bpf_endian.h" #include "bpf_rlimit.h" #include "bpf_util.h" @@ -232,7 +233,8 @@ static struct sock_test tests[] = { /* if (ip == expected && port == expected) */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock, src_ip6[3])), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x01000000, 4), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, + __bpf_constant_ntohl(0x00000001), 4), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock, src_port)), BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2), @@ -261,7 +263,8 @@ static struct sock_test tests[] = { /* if (ip == expected && port == expected) */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock, src_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x0100007F, 4), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, + __bpf_constant_ntohl(0x7F000001), 4), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock, src_port)), BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2), From 4c6f3196e6ea111c456c6086dc3f57d4706b0b2d Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Mon, 29 Jul 2019 14:33:34 +0900 Subject: [PATCH 031/266] drm/mediatek: use correct device to import PRIME buffers PRIME buffers should be imported using the DMA device. To this end, use a custom import function that mimics drm_gem_prime_import_dev(), but passes the correct device. Fixes: 119f5173628aa ("drm/mediatek: Add DRM Driver for Mediatek SoC MT8173.") Signed-off-by: Alexandre Courbot Signed-off-by: CK Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 95fdbd0fbcac..8b18a00a58c7 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -320,6 +320,18 @@ static const struct file_operations mtk_drm_fops = { .compat_ioctl = drm_compat_ioctl, }; +/* + * We need to override this because the device used to import the memory is + * not dev->dev, as drm_gem_prime_import() expects. + */ +struct drm_gem_object *mtk_drm_gem_prime_import(struct drm_device *dev, + struct dma_buf *dma_buf) +{ + struct mtk_drm_private *private = dev->dev_private; + + return drm_gem_prime_import_dev(dev, dma_buf, private->dma_dev); +} + static struct drm_driver mtk_drm_driver = { .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_PRIME | DRIVER_ATOMIC, @@ -331,7 +343,7 @@ static struct drm_driver mtk_drm_driver = { .prime_handle_to_fd = drm_gem_prime_handle_to_fd, .prime_fd_to_handle = drm_gem_prime_fd_to_handle, .gem_prime_export = drm_gem_prime_export, - .gem_prime_import = drm_gem_prime_import, + .gem_prime_import = mtk_drm_gem_prime_import, .gem_prime_get_sg_table = mtk_gem_prime_get_sg_table, .gem_prime_import_sg_table = mtk_gem_prime_import_sg_table, .gem_prime_mmap = mtk_drm_gem_mmap_buf, From 070955558e820b9a89c570b91b1f21762f62b288 Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Mon, 29 Jul 2019 14:33:35 +0900 Subject: [PATCH 032/266] drm/mediatek: set DMA max segment size This driver requires imported PRIME buffers to appear contiguously in its IO address space. Make sure this is the case by setting the maximum DMA segment size to a more suitable value than the default 64KB. Signed-off-by: Alexandre Courbot Reviewed-by: Tomasz Figa Signed-off-by: CK Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 35 ++++++++++++++++++++++++-- drivers/gpu/drm/mediatek/mtk_drm_drv.h | 2 ++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 8b18a00a58c7..c021d4c8324f 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -213,6 +213,7 @@ static int mtk_drm_kms_init(struct drm_device *drm) struct mtk_drm_private *private = drm->dev_private; struct platform_device *pdev; struct device_node *np; + struct device *dma_dev; int ret; if (!iommu_present(&platform_bus_type)) @@ -275,7 +276,29 @@ static int mtk_drm_kms_init(struct drm_device *drm) goto err_component_unbind; } - private->dma_dev = &pdev->dev; + dma_dev = &pdev->dev; + private->dma_dev = dma_dev; + + /* + * Configure the DMA segment size to make sure we get contiguous IOVA + * when importing PRIME buffers. + */ + if (!dma_dev->dma_parms) { + private->dma_parms_allocated = true; + dma_dev->dma_parms = + devm_kzalloc(drm->dev, sizeof(*dma_dev->dma_parms), + GFP_KERNEL); + } + if (!dma_dev->dma_parms) { + ret = -ENOMEM; + goto err_component_unbind; + } + + ret = dma_set_max_seg_size(dma_dev, (unsigned int)DMA_BIT_MASK(32)); + if (ret) { + dev_err(dma_dev, "Failed to set DMA segment size\n"); + goto err_unset_dma_parms; + } /* * We don't use the drm_irq_install() helpers provided by the DRM @@ -285,13 +308,16 @@ static int mtk_drm_kms_init(struct drm_device *drm) drm->irq_enabled = true; ret = drm_vblank_init(drm, MAX_CRTC); if (ret < 0) - goto err_component_unbind; + goto err_unset_dma_parms; drm_kms_helper_poll_init(drm); drm_mode_config_reset(drm); return 0; +err_unset_dma_parms: + if (private->dma_parms_allocated) + dma_dev->dma_parms = NULL; err_component_unbind: component_unbind_all(drm->dev, drm); err_config_cleanup: @@ -302,9 +328,14 @@ err_config_cleanup: static void mtk_drm_kms_deinit(struct drm_device *drm) { + struct mtk_drm_private *private = drm->dev_private; + drm_kms_helper_poll_fini(drm); drm_atomic_helper_shutdown(drm); + if (private->dma_parms_allocated) + private->dma_dev->dma_parms = NULL; + component_unbind_all(drm->dev, drm); drm_mode_config_cleanup(drm); } diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.h b/drivers/gpu/drm/mediatek/mtk_drm_drv.h index 598ff3e70446..e03fea12ff59 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.h +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.h @@ -51,6 +51,8 @@ struct mtk_drm_private { } commit; struct drm_atomic_state *suspend_state; + + bool dma_parms_allocated; }; extern struct platform_driver mtk_ddp_driver; From 26fa656e9a0cbccddf7db132ea020d2169dbe46e Mon Sep 17 00:00:00 2001 From: Bill Kuzeja Date: Wed, 14 Aug 2019 10:24:41 -0400 Subject: [PATCH 033/266] scsi: qla2xxx: Fix gnl.l memory leak on adapter init failure If HBA initialization fails unexpectedly (exiting via probe_failed:), we may fail to free vha->gnl.l. So that we don't attempt to double free, set this pointer to NULL after a free and check for NULL at probe_failed: so we know whether or not to call dma_free_coherent. Signed-off-by: Bill Kuzeja Acked-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_attr.c | 2 ++ drivers/scsi/qla2xxx/qla_os.c | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 8d560c562e9c..6b7b390b2e52 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -2956,6 +2956,8 @@ qla24xx_vport_delete(struct fc_vport *fc_vport) dma_free_coherent(&ha->pdev->dev, vha->gnl.size, vha->gnl.l, vha->gnl.ldma); + vha->gnl.l = NULL; + vfree(vha->scan.l); if (vha->qpair && vha->qpair->vp_idx == vha->vp_idx) { diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 2e58cff9d200..98e60a34afd9 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -3440,6 +3440,12 @@ skip_dpc: return 0; probe_failed: + if (base_vha->gnl.l) { + dma_free_coherent(&ha->pdev->dev, base_vha->gnl.size, + base_vha->gnl.l, base_vha->gnl.ldma); + base_vha->gnl.l = NULL; + } + if (base_vha->timer_active) qla2x00_stop_timer(base_vha); base_vha->flags.online = 0; @@ -3673,7 +3679,7 @@ qla2x00_remove_one(struct pci_dev *pdev) if (!atomic_read(&pdev->enable_cnt)) { dma_free_coherent(&ha->pdev->dev, base_vha->gnl.size, base_vha->gnl.l, base_vha->gnl.ldma); - + base_vha->gnl.l = NULL; scsi_host_put(base_vha->host); kfree(ha); pci_set_drvdata(pdev, NULL); @@ -3713,6 +3719,8 @@ qla2x00_remove_one(struct pci_dev *pdev) dma_free_coherent(&ha->pdev->dev, base_vha->gnl.size, base_vha->gnl.l, base_vha->gnl.ldma); + base_vha->gnl.l = NULL; + vfree(base_vha->scan.l); if (IS_QLAFX00(ha)) @@ -4816,6 +4824,7 @@ struct scsi_qla_host *qla2x00_create_host(struct scsi_host_template *sht, "Alloc failed for scan database.\n"); dma_free_coherent(&ha->pdev->dev, vha->gnl.size, vha->gnl.l, vha->gnl.ldma); + vha->gnl.l = NULL; scsi_remove_host(vha->host); return NULL; } From a86a75865ff4d8c05f355d1750a5250aec89ab15 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Sun, 11 Aug 2019 11:25:10 -0700 Subject: [PATCH 034/266] scsi: target: tcmu: avoid use-after-free after command timeout In tcmu_handle_completion() function, the variable called read_len is always initialized with a value taken from se_cmd structure. If this function is called to complete an expired (timed out) out command, the session command pointed by se_cmd is likely to be already deallocated by the target core at that moment. As the result, this access triggers a use-after-free warning from KASAN. This patch fixes the code not to touch se_cmd when completing timed out TCMU commands. It also resets the pointer to se_cmd at the time when the TCMU_CMD_BIT_EXPIRED flag is set because it is going to become invalid after calling target_complete_cmd() later in the same function, tcmu_check_expired_cmd(). Signed-off-by: Dmitry Fomichev Acked-by: Mike Christie Reviewed-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/target/target_core_user.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 04eda111920e..661bb9358364 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1132,14 +1132,16 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * struct se_cmd *se_cmd = cmd->se_cmd; struct tcmu_dev *udev = cmd->tcmu_dev; bool read_len_valid = false; - uint32_t read_len = se_cmd->data_length; + uint32_t read_len; /* * cmd has been completed already from timeout, just reclaim * data area space and free cmd */ - if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + WARN_ON_ONCE(se_cmd); goto out; + } list_del_init(&cmd->queue_entry); @@ -1152,6 +1154,7 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * goto done; } + read_len = se_cmd->data_length; if (se_cmd->data_direction == DMA_FROM_DEVICE && (entry->hdr.uflags & TCMU_UFLAG_READ_LEN) && entry->rsp.read_len) { read_len_valid = true; @@ -1307,6 +1310,7 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data) */ scsi_status = SAM_STAT_CHECK_CONDITION; list_del_init(&cmd->queue_entry); + cmd->se_cmd = NULL; } else { list_del_init(&cmd->queue_entry); idr_remove(&udev->commands, id); @@ -2022,6 +2026,7 @@ static void tcmu_reset_ring(struct tcmu_dev *udev, u8 err_level) idr_remove(&udev->commands, i); if (!test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + WARN_ON(!cmd->se_cmd); list_del_init(&cmd->queue_entry); if (err_level == 1) { /* From 7c7cfdcf7f1777c7376fc9a239980de04b6b5ea1 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 14 Aug 2019 15:59:50 +0300 Subject: [PATCH 035/266] scsi: ufs: Fix NULL pointer dereference in ufshcd_config_vreg_hpm() Fix the following BUG: [ 187.065689] BUG: kernel NULL pointer dereference, address: 000000000000001c [ 187.065790] RIP: 0010:ufshcd_vreg_set_hpm+0x3c/0x110 [ufshcd_core] [ 187.065938] Call Trace: [ 187.065959] ufshcd_resume+0x72/0x290 [ufshcd_core] [ 187.065980] ufshcd_system_resume+0x54/0x140 [ufshcd_core] [ 187.065993] ? pci_pm_restore+0xb0/0xb0 [ 187.066005] ufshcd_pci_resume+0x15/0x20 [ufshcd_pci] [ 187.066017] pci_pm_thaw+0x4c/0x90 [ 187.066030] dpm_run_callback+0x5b/0x150 [ 187.066043] device_resume+0x11b/0x220 Voltage regulators are optional, so functions must check they exist before dereferencing. Note this issue is hidden if CONFIG_REGULATORS is not set, because the offending code is optimised away. Notes for stable: The issue first appears in commit 57d104c153d3 ("ufs: add UFS power management support") but is inadvertently fixed in commit 60f0187031c0 ("scsi: ufs: disable vccq if it's not needed by UFS device") which in turn was reverted by commit 730679817d83 ("Revert "scsi: ufs: disable vccq if it's not needed by UFS device""). So fix applies v3.18 to v4.5 and v5.1+ Fixes: 57d104c153d3 ("ufs: add UFS power management support") Fixes: 730679817d83 ("Revert "scsi: ufs: disable vccq if it's not needed by UFS device"") Cc: stable@vger.kernel.org Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index e274053109d0..029da74bb2f5 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7062,6 +7062,9 @@ static inline int ufshcd_config_vreg_lpm(struct ufs_hba *hba, static inline int ufshcd_config_vreg_hpm(struct ufs_hba *hba, struct ufs_vreg *vreg) { + if (!vreg) + return 0; + return ufshcd_config_vreg_load(hba->dev, vreg, vreg->max_uA); } From 92cd0f0be3d7adb63611c28693ec0399beded837 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Aug 2019 18:18:55 +0200 Subject: [PATCH 036/266] selftests: kvm: do not try running the VM in vmx_set_nested_state_test This test is only covering various edge cases of the KVM_SET_NESTED_STATE ioctl. Running the VM does not really add anything. Signed-off-by: Paolo Bonzini --- .../kvm/x86_64/vmx_set_nested_state_test.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index ed7218d166da..a99fc66dafeb 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -27,22 +27,13 @@ void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state) { - volatile struct kvm_run *run; - vcpu_nested_state_set(vm, VCPU_ID, state, false); - run = vcpu_state(vm, VCPU_ID); - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, - "Got exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); } void test_nested_state_expect_errno(struct kvm_vm *vm, struct kvm_nested_state *state, int expected_errno) { - volatile struct kvm_run *run; int rv; rv = vcpu_nested_state_set(vm, VCPU_ID, state, true); @@ -50,12 +41,6 @@ void test_nested_state_expect_errno(struct kvm_vm *vm, "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", strerror(expected_errno), expected_errno, rv, strerror(errno), errno); - run = vcpu_state(vm, VCPU_ID); - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, - "Got exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); } void test_nested_state_expect_einval(struct kvm_vm *vm, From 65efa61dc0d536d5f0602c33ee805a57cc07e9dc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Aug 2019 12:02:41 -0400 Subject: [PATCH 037/266] selftests: kvm: provide common function to enable eVMCS There are two tests already enabling eVMCS and a third is coming. Add a function that enables the capability and tests the result. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/evmcs.h | 2 ++ tools/testing/selftests/kvm/lib/x86_64/vmx.c | 20 +++++++++++++++++++ .../testing/selftests/kvm/x86_64/evmcs_test.c | 15 ++------------ .../selftests/kvm/x86_64/hyperv_cpuid.c | 12 ++++------- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h index 4059014d93ea..4912d23844bc 100644 --- a/tools/testing/selftests/kvm/include/evmcs.h +++ b/tools/testing/selftests/kvm/include/evmcs.h @@ -220,6 +220,8 @@ struct hv_enlightened_vmcs { struct hv_enlightened_vmcs *current_evmcs; struct hv_vp_assist_page *current_vp_assist; +int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id); + static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) { u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 204f847bd065..9cef0455b819 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -12,6 +12,26 @@ bool enable_evmcs; +int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) +{ + uint16_t evmcs_ver; + + struct kvm_enable_cap enable_evmcs_cap = { + .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + .args[0] = (unsigned long)&evmcs_ver + }; + + vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_evmcs_cap); + + /* KVM should return supported EVMCS version range */ + TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && + (evmcs_ver & 0xff) > 0, + "Incorrect EVMCS version range: %x:%x\n", + evmcs_ver & 0xff, evmcs_ver >> 8); + + return evmcs_ver; +} + /* Allocate memory regions for nested VMX tests. * * Input Args: diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index f95c08343b48..92915e6408e7 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -79,11 +79,6 @@ int main(int argc, char *argv[]) struct kvm_x86_state *state; struct ucall uc; int stage; - uint16_t evmcs_ver; - struct kvm_enable_cap enable_evmcs_cap = { - .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, - .args[0] = (unsigned long)&evmcs_ver - }; /* Create VM */ vm = vm_create_default(VCPU_ID, 0, guest_code); @@ -96,13 +91,7 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); - - /* KVM should return supported EVMCS version range */ - TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && - (evmcs_ver & 0xff) > 0, - "Incorrect EVMCS version range: %x:%x\n", - evmcs_ver & 0xff, evmcs_ver >> 8); + vcpu_enable_evmcs(vm, VCPU_ID); run = vcpu_state(vm, VCPU_ID); @@ -146,7 +135,7 @@ int main(int argc, char *argv[]) kvm_vm_restart(vm, O_RDWR); vm_vcpu_add(vm, VCPU_ID); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); + vcpu_enable_evmcs(vm, VCPU_ID); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); free(state); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index f72b3043db0e..ee59831fbc98 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -18,6 +18,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "vmx.h" #define VCPU_ID 0 @@ -106,12 +107,7 @@ int main(int argc, char *argv[]) { struct kvm_vm *vm; int rv; - uint16_t evmcs_ver; struct kvm_cpuid2 *hv_cpuid_entries; - struct kvm_enable_cap enable_evmcs_cap = { - .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, - .args[0] = (unsigned long)&evmcs_ver - }; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); @@ -136,14 +132,14 @@ int main(int argc, char *argv[]) free(hv_cpuid_entries); - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); - - if (rv) { + if (!kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { fprintf(stderr, "Enlightened VMCS is unsupported, skip related test\n"); goto vm_free; } + vcpu_enable_evmcs(vm, VCPU_ID); + hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); if (!hv_cpuid_entries) return 1; From c930e19790bbbff31c018009907c813fa0925f63 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Aug 2019 12:07:34 -0400 Subject: [PATCH 038/266] selftests: kvm: fix vmx_set_nested_state_test vmx_set_nested_state_test is trying to use the KVM_STATE_NESTED_EVMCS without enabling enlightened VMCS first. Correct the outcome of the test, and actually test that it succeeds after the capability is enabled. Signed-off-by: Paolo Bonzini --- .../kvm/x86_64/vmx_set_nested_state_test.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index a99fc66dafeb..853e370e8a39 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -25,6 +25,8 @@ #define VMCS12_REVISION 0x11e57ed0 #define VCPU_ID 5 +bool have_evmcs; + void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state) { vcpu_nested_state_set(vm, VCPU_ID, state, false); @@ -75,8 +77,9 @@ void set_default_vmx_state(struct kvm_nested_state *state, int size) { memset(state, 0, size); state->flags = KVM_STATE_NESTED_GUEST_MODE | - KVM_STATE_NESTED_RUN_PENDING | - KVM_STATE_NESTED_EVMCS; + KVM_STATE_NESTED_RUN_PENDING; + if (have_evmcs) + state->flags |= KVM_STATE_NESTED_EVMCS; state->format = 0; state->size = size; state->hdr.vmx.vmxon_pa = 0x1000; @@ -126,13 +129,19 @@ void test_vmx_nested_state(struct kvm_vm *vm) /* * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without * setting the nested state but flags other than eVMCS must be clear. + * The eVMCS flag can be set if the enlightened VMCS capability has + * been enabled. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = -1ull; state->hdr.vmx.vmcs12_pa = -1ull; test_nested_state_expect_einval(vm, state); - state->flags = KVM_STATE_NESTED_EVMCS; + state->flags &= KVM_STATE_NESTED_EVMCS; + if (have_evmcs) { + test_nested_state_expect_einval(vm, state); + vcpu_enable_evmcs(vm, VCPU_ID); + } test_nested_state(vm, state); /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ @@ -217,6 +226,8 @@ int main(int argc, char *argv[]) struct kvm_nested_state state; struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); + have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); + if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) { printf("KVM_CAP_NESTED_STATE not available, skipping test\n"); exit(KSFT_SKIP); From ad54567ad5d8e938ee6cf02e4f3867f18835ae6e Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 1 Aug 2019 18:01:17 -0400 Subject: [PATCH 039/266] PCI: Reset both NVIDIA GPU and HDA in ThinkPad P50 workaround quirk_reset_lenovo_thinkpad_50_nvgpu() resets NVIDIA GPUs to work around an apparent BIOS defect. It previously used pci_reset_function(), and the available method was a bus reset, which was fine because there was only one function on the bus. After b516ea586d71 ("PCI: Enable NVIDIA HDA controllers"), there are now two functions (the HDA controller and the GPU itself) on the bus, so the reset fails. Use pci_reset_bus() explicitly instead of pci_reset_function() since it's OK to reset both devices. [bhelgaas: commit log, add e0547c81bfcf] Fixes: b516ea586d71 ("PCI: Enable NVIDIA HDA controllers") Fixes: e0547c81bfcf ("PCI: Reset Lenovo ThinkPad P50 nvgpu at boot if necessary") Link: https://lore.kernel.org/r/20190801220117.14952-1-lyude@redhat.com Signed-off-by: Lyude Paul Signed-off-by: Bjorn Helgaas Acked-by: Ben Skeggs Cc: Lukas Wunner Cc: Daniel Drake Cc: Aaron Plattner Cc: Peter Wu Cc: Ilia Mirkin Cc: Karol Herbst Cc: Maik Freudenberg --- drivers/pci/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..44c4ae1abd00 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5256,7 +5256,7 @@ static void quirk_reset_lenovo_thinkpad_p50_nvgpu(struct pci_dev *pdev) */ if (ioread32(map + 0x2240c) & 0x2) { pci_info(pdev, FW_BUG "GPU left initialized by EFI, resetting\n"); - ret = pci_reset_function(pdev); + ret = pci_reset_bus(pdev); if (ret < 0) pci_err(pdev, "Failed to reset GPU: %d\n", ret); } From 7bafda88de20b2990461d253c5475007436e355c Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 15 Aug 2019 12:12:38 -0500 Subject: [PATCH 040/266] Documentation PCI: Fix pciebus-howto.rst filename typo 2e6422444894 ("Documentation: PCI: convert PCIEBUS-HOWTO.txt to reST") incorrectly renamed PCIEBUS-HOWTO.txt to picebus-howto.rst. Rename it to pciebus-howto.rst. Fixes: 2e6422444894 ("Documentation: PCI: convert PCIEBUS-HOWTO.txt to reST") Signed-off-by: Bjorn Helgaas --- Documentation/PCI/index.rst | 2 +- Documentation/PCI/{picebus-howto.rst => pciebus-howto.rst} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/PCI/{picebus-howto.rst => pciebus-howto.rst} (100%) diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index f4c6121868c3..6768305e4c26 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -9,7 +9,7 @@ Linux PCI Bus Subsystem :numbered: pci - picebus-howto + pciebus-howto pci-iov-howto msi-howto acpi-info diff --git a/Documentation/PCI/picebus-howto.rst b/Documentation/PCI/pciebus-howto.rst similarity index 100% rename from Documentation/PCI/picebus-howto.rst rename to Documentation/PCI/pciebus-howto.rst From d6846bfbeeac873d85f32bd2b988fa94c89dbcb8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 12 Aug 2019 11:16:11 +0900 Subject: [PATCH 041/266] nfsd: fix dentry leak upon mkdir failure. syzbot is reporting that nfsd_mkdir() forgot to remove dentry created by d_alloc_name() when __nfsd_mkdir() failed (due to memory allocation fault injection) [1]. [1] https://syzkaller.appspot.com/bug?id=ce41a1f769ea4637ebffedf004a803e8405b4674 Signed-off-by: Tetsuo Handa Reported-by: syzbot Fixes: e8a79fb14f6b76b5 ("nfsd: add nfsd/clients directory") [bfields: clean up in nfsd_mkdir instead of __nfsd_mkdir] Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 13c548733860..928a0b2c05dc 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1205,6 +1205,7 @@ out: inode_unlock(dir); return dentry; out_err: + dput(dentry); dentry = ERR_PTR(ret); goto out; } From dc46bba709cfb45e4b2d40cf45aaeacb82690504 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 14 Aug 2019 21:57:37 -0400 Subject: [PATCH 042/266] nfsd: use i_wrlock instead of rcu for nfsdfs i_private synchronize_rcu() gets called multiple times each time a client is destroyed. If the laundromat thread has a lot of clients to destroy, the delay can be noticeable. This was causing pynfs test RENEW3 to fail. We could embed an rcu_head in each inode and do the kref_put in an rcu callback. But simplest is just to take a lock here. (I also wonder if the laundromat thread would be better replaced by a bunch of scheduled work or timers or something.) Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 928a0b2c05dc..b14f825c62fe 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1215,11 +1215,9 @@ static void clear_ncl(struct inode *inode) struct nfsdfs_client *ncl = inode->i_private; inode->i_private = NULL; - synchronize_rcu(); kref_put(&ncl->cl_ref, ncl->cl_release); } - static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc = inode->i_private; @@ -1233,9 +1231,9 @@ struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc; - rcu_read_lock(); + inode_lock_shared(inode); nc = __get_nfsdfs_client(inode); - rcu_read_unlock(); + inode_unlock_shared(inode); return nc; } /* from __rpc_unlink */ From cf3591ef832915892f2499b7e54b51d4c578b28c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 8 Aug 2019 05:40:04 -0400 Subject: [PATCH 043/266] Revert "dm bufio: fix deadlock with loop device" Revert the commit bd293d071ffe65e645b4d8104f9d8fe15ea13862. The proper fix has been made available with commit d0a255e795ab ("loop: set PF_MEMALLOC_NOIO for the worker thread"). Note that the fix offered by commit bd293d071ffe doesn't really prevent the deadlock from occuring - if we look at the stacktrace reported by Junxiao Bi, we see that it hangs in bit_wait_io and not on the mutex - i.e. it has already successfully taken the mutex. Changing the mutex from mutex_lock to mutex_trylock won't help with deadlocks that happen afterwards. PID: 474 TASK: ffff8813e11f4600 CPU: 10 COMMAND: "kswapd0" #0 [ffff8813dedfb938] __schedule at ffffffff8173f405 #1 [ffff8813dedfb990] schedule at ffffffff8173fa27 #2 [ffff8813dedfb9b0] schedule_timeout at ffffffff81742fec #3 [ffff8813dedfba60] io_schedule_timeout at ffffffff8173f186 #4 [ffff8813dedfbaa0] bit_wait_io at ffffffff8174034f #5 [ffff8813dedfbac0] __wait_on_bit at ffffffff8173fec8 #6 [ffff8813dedfbb10] out_of_line_wait_on_bit at ffffffff8173ff81 #7 [ffff8813dedfbb90] __make_buffer_clean at ffffffffa038736f [dm_bufio] #8 [ffff8813dedfbbb0] __try_evict_buffer at ffffffffa0387bb8 [dm_bufio] #9 [ffff8813dedfbbd0] dm_bufio_shrink_scan at ffffffffa0387cc3 [dm_bufio] #10 [ffff8813dedfbc40] shrink_slab at ffffffff811a87ce #11 [ffff8813dedfbd30] shrink_zone at ffffffff811ad778 #12 [ffff8813dedfbdc0] kswapd at ffffffff811ae92f #13 [ffff8813dedfbec0] kthread at ffffffff810a8428 #14 [ffff8813dedfbf50] ret_from_fork at ffffffff81745242 Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Fixes: bd293d071ffe ("dm bufio: fix deadlock with loop device") Depends-on: d0a255e795ab ("loop: set PF_MEMALLOC_NOIO for the worker thread") Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index b6b5acc92ca2..2a48ea3f1b30 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1599,7 +1599,9 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (!dm_bufio_trylock(c)) + if (sc->gfp_mask & __GFP_FS) + dm_bufio_lock(c); + else if (!dm_bufio_trylock(c)) return SHRINK_STOP; freed = __scan(c, sc->nr_to_scan, sc->gfp_mask); From d1fef41465f0e8cae0693fb184caa6bfafb6cd16 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Mon, 5 Aug 2019 16:56:03 -0700 Subject: [PATCH 044/266] dm kcopyd: always complete failed jobs This patch fixes a problem in dm-kcopyd that may leave jobs in complete queue indefinitely in the event of backing storage failure. This behavior has been observed while running 100% write file fio workload against an XFS volume created on top of a dm-zoned target device. If the underlying storage of dm-zoned goes to offline state under I/O, kcopyd sometimes never issues the end copy callback and dm-zoned reclaim work hangs indefinitely waiting for that completion. This behavior was traced down to the error handling code in process_jobs() function that places the failed job to complete_jobs queue, but doesn't wake up the job handler. In case of backing device failure, all outstanding jobs may end up going to complete_jobs queue via this code path and then stay there forever because there are no more successful I/O jobs to wake up the job handler. This patch adds a wake() call to always wake up kcopyd job wait queue for all I/O jobs that fail before dm_io() gets called for that job. The patch also sets the write error status in all sub jobs that are failed because their master job has failed. Fixes: b73c67c2cbb00 ("dm kcopyd: add sequential write feature") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Fomichev Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-kcopyd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index df2011de7be2..1bbe4a34ef4c 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -566,8 +566,10 @@ static int run_io_job(struct kcopyd_job *job) * no point in continuing. */ if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) && - job->master_job->write_err) + job->master_job->write_err) { + job->write_err = job->master_job->write_err; return -EIO; + } io_job_start(job->kc->throttle); @@ -619,6 +621,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, else job->read_err = 1; push(&kc->complete_jobs, job); + wake(kc); break; } From b234c6d7a703661b5045c5bf569b7c99d2edbf88 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Sat, 10 Aug 2019 14:43:09 -0700 Subject: [PATCH 045/266] dm zoned: improve error handling in reclaim There are several places in reclaim code where errors are not propagated to the main function, dmz_reclaim(). This function is responsible for unlocking zones that might be still locked at the end of any failed reclaim iterations. As the result, some device zones may be left permanently locked for reclaim, degrading target's capability to reclaim zones. This patch fixes these issues as follows - Make sure that dmz_reclaim_buf(), dmz_reclaim_seq_data() and dmz_reclaim_rnd_data() return error codes to the caller. dmz_reclaim() function is renamed to dmz_do_reclaim() to avoid clashing with "struct dmz_reclaim" and is modified to return the error to the caller. dmz_get_zone_for_reclaim() now returns an error instead of NULL pointer and reclaim code checks for that error. Error logging/debug messages are added where necessary. Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Fomichev Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 4 ++-- drivers/md/dm-zoned-reclaim.c | 28 +++++++++++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 8545dcee9fd0..935d9be5af39 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1542,7 +1542,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) struct dm_zone *zone; if (list_empty(&zmd->map_rnd_list)) - return NULL; + return ERR_PTR(-EBUSY); list_for_each_entry(zone, &zmd->map_rnd_list, link) { if (dmz_is_buf(zone)) @@ -1553,7 +1553,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) return dzone; } - return NULL; + return ERR_PTR(-EBUSY); } /* diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index edf4b95eb075..e381354dc136 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -215,7 +215,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_flush(zmd); - return 0; + return ret; } /* @@ -259,7 +259,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_flush(zmd); - return 0; + return ret; } /* @@ -312,7 +312,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_flush(zmd); - return 0; + return ret; } /* @@ -334,7 +334,7 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone) /* * Find a candidate zone for reclaim and process it. */ -static void dmz_reclaim(struct dmz_reclaim *zrc) +static int dmz_do_reclaim(struct dmz_reclaim *zrc) { struct dmz_metadata *zmd = zrc->metadata; struct dm_zone *dzone; @@ -344,8 +344,8 @@ static void dmz_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd); - if (!dzone) - return; + if (IS_ERR(dzone)) + return PTR_ERR(dzone); start = jiffies; @@ -391,13 +391,20 @@ static void dmz_reclaim(struct dmz_reclaim *zrc) out: if (ret) { dmz_unlock_zone_reclaim(dzone); - return; + return ret; } - (void) dmz_flush_metadata(zrc->metadata); + ret = dmz_flush_metadata(zrc->metadata); + if (ret) { + dmz_dev_debug(zrc->dev, + "Metadata flush for zone %u failed, err %d\n", + dmz_id(zmd, rzone), ret); + return ret; + } dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms", dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start)); + return 0; } /* @@ -442,6 +449,7 @@ static void dmz_reclaim_work(struct work_struct *work) struct dmz_metadata *zmd = zrc->metadata; unsigned int nr_rnd, nr_unmap_rnd; unsigned int p_unmap_rnd; + int ret; if (!dmz_should_reclaim(zrc)) { mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); @@ -471,7 +479,9 @@ static void dmz_reclaim_work(struct work_struct *work) (dmz_target_idle(zrc) ? "Idle" : "Busy"), p_unmap_rnd, nr_unmap_rnd, nr_rnd); - dmz_reclaim(zrc); + ret = dmz_do_reclaim(zrc); + if (ret) + dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); dmz_schedule_reclaim(zrc); } From d7428c50118e739e672656c28d2b26b09375d4e0 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Sat, 10 Aug 2019 14:43:10 -0700 Subject: [PATCH 046/266] dm zoned: improve error handling in i/o map code Some errors are ignored in the I/O path during queueing chunks for processing by chunk works. Since at least these errors are transient in nature, it should be possible to retry the failed incoming commands. The fix - Errors that can happen while queueing chunks are carried upwards to the main mapping function and it now returns DM_MAPIO_REQUEUE for any incoming requests that can not be properly queued. Error logging/debug messages are added where needed. Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Fomichev Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 51d029bbb740..944db71ed3d7 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -513,22 +513,24 @@ static void dmz_flush_work(struct work_struct *work) * Get a chunk work and start it to process a new BIO. * If the BIO chunk has no work yet, create one. */ -static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) +static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) { unsigned int chunk = dmz_bio_chunk(dmz->dev, bio); struct dm_chunk_work *cw; + int ret = 0; mutex_lock(&dmz->chunk_lock); /* Get the BIO chunk work. If one is not active yet, create one */ cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk); if (!cw) { - int ret; /* Create a new chunk work */ cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO); - if (!cw) + if (unlikely(!cw)) { + ret = -ENOMEM; goto out; + } INIT_WORK(&cw->work, dmz_chunk_work); refcount_set(&cw->refcount, 0); @@ -539,7 +541,6 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw); if (unlikely(ret)) { kfree(cw); - cw = NULL; goto out; } } @@ -547,10 +548,12 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) bio_list_add(&cw->bio_list, bio); dmz_get_chunk_work(cw); + dmz_reclaim_bio_acc(dmz->reclaim); if (queue_work(dmz->chunk_wq, &cw->work)) dmz_get_chunk_work(cw); out: mutex_unlock(&dmz->chunk_lock); + return ret; } /* @@ -564,6 +567,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; unsigned int nr_sectors = bio_sectors(bio); sector_t chunk_sector; + int ret; dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", bio_op(bio), (unsigned long long)sector, nr_sectors, @@ -601,8 +605,14 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector); /* Now ready to handle this BIO */ - dmz_reclaim_bio_acc(dmz->reclaim); - dmz_queue_chunk_work(dmz, bio); + ret = dmz_queue_chunk_work(dmz, bio); + if (ret) { + dmz_dev_debug(dmz->dev, + "BIO op %d, can't process chunk %llu, err %i\n", + bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio), + ret); + return DM_MAPIO_REQUEUE; + } return DM_MAPIO_SUBMITTED; } From 75d66ffb48efb30f2dd42f041ba8b39c5b2bd115 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Sat, 10 Aug 2019 14:43:11 -0700 Subject: [PATCH 047/266] dm zoned: properly handle backing device failure dm-zoned is observed to lock up or livelock in case of hardware failure or some misconfiguration of the backing zoned device. This patch adds a new dm-zoned target function that checks the status of the backing device. If the request queue of the backing device is found to be in dying state or the SCSI backing device enters offline state, the health check code sets a dm-zoned target flag prompting all further incoming I/O to be rejected. In order to detect backing device failures timely, this new function is called in the request mapping path, at the beginning of every reclaim run and before performing any metadata I/O. The proper way out of this situation is to do dmsetup remove and recreate the target when the problem with the backing device is resolved. Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Fomichev Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 51 +++++++++++++++++++++++++++------- drivers/md/dm-zoned-reclaim.c | 18 ++++++++++-- drivers/md/dm-zoned-target.c | 45 ++++++++++++++++++++++++++++-- drivers/md/dm-zoned.h | 10 +++++++ 4 files changed, 110 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 935d9be5af39..2882897aece2 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -402,15 +402,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; struct bio *bio; + if (dmz_bdev_is_dying(zmd->dev)) + return ERR_PTR(-EIO); + /* Get a new block and a BIO to read it */ mblk = dmz_alloc_mblock(zmd, mblk_no); if (!mblk) - return NULL; + return ERR_PTR(-ENOMEM); bio = bio_alloc(GFP_NOIO, 1); if (!bio) { dmz_free_mblock(zmd, mblk); - return NULL; + return ERR_PTR(-ENOMEM); } spin_lock(&zmd->mblk_lock); @@ -541,8 +544,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, if (!mblk) { /* Cache miss: read the block from disk */ mblk = dmz_get_mblock_slow(zmd, mblk_no); - if (!mblk) - return ERR_PTR(-ENOMEM); + if (IS_ERR(mblk)) + return mblk; } /* Wait for on-going read I/O and check for error */ @@ -570,16 +573,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) /* * Issue a metadata block write BIO. */ -static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, - unsigned int set) +static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, + unsigned int set) { sector_t block = zmd->sb[set].block + mblk->no; struct bio *bio; + if (dmz_bdev_is_dying(zmd->dev)) + return -EIO; + bio = bio_alloc(GFP_NOIO, 1); if (!bio) { set_bit(DMZ_META_ERROR, &mblk->state); - return; + return -ENOMEM; } set_bit(DMZ_META_WRITING, &mblk->state); @@ -591,6 +597,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); + + return 0; } /* @@ -602,6 +610,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, struct bio *bio; int ret; + if (dmz_bdev_is_dying(zmd->dev)) + return -EIO; + bio = bio_alloc(GFP_NOIO, 1); if (!bio) return -ENOMEM; @@ -659,22 +670,29 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, { struct dmz_mblock *mblk; struct blk_plug plug; - int ret = 0; + int ret = 0, nr_mblks_submitted = 0; /* Issue writes */ blk_start_plug(&plug); - list_for_each_entry(mblk, write_list, link) - dmz_write_mblock(zmd, mblk, set); + list_for_each_entry(mblk, write_list, link) { + ret = dmz_write_mblock(zmd, mblk, set); + if (ret) + break; + nr_mblks_submitted++; + } blk_finish_plug(&plug); /* Wait for completion */ list_for_each_entry(mblk, write_list, link) { + if (!nr_mblks_submitted) + break; wait_on_bit_io(&mblk->state, DMZ_META_WRITING, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); ret = -EIO; } + nr_mblks_submitted--; } /* Flush drive cache (this will also sync data) */ @@ -736,6 +754,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ dmz_lock_flush(zmd); + if (dmz_bdev_is_dying(zmd->dev)) { + ret = -EIO; + goto out; + } + /* Get dirty blocks */ spin_lock(&zmd->mblk_lock); list_splice_init(&zmd->mblk_dirty_list, &write_list); @@ -1631,6 +1654,10 @@ again: /* Alloate a random zone */ dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!dzone) { + if (dmz_bdev_is_dying(zmd->dev)) { + dzone = ERR_PTR(-EIO); + goto out; + } dmz_wait_for_free_zones(zmd); goto again; } @@ -1728,6 +1755,10 @@ again: /* Alloate a random zone */ bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!bzone) { + if (dmz_bdev_is_dying(zmd->dev)) { + bzone = ERR_PTR(-EIO); + goto out; + } dmz_wait_for_free_zones(zmd); goto again; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index e381354dc136..9470b8f77a33 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -37,7 +37,7 @@ enum { /* * Number of seconds of target BIO inactivity to consider the target idle. */ -#define DMZ_IDLE_PERIOD (10UL * HZ) +#define DMZ_IDLE_PERIOD (10UL * HZ) /* * Percentage of unmapped (free) random zones below which reclaim starts @@ -134,6 +134,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, set_bit(DM_KCOPYD_WRITE_SEQ, &flags); while (block < end_block) { + if (dev->flags & DMZ_BDEV_DYING) + return -EIO; + /* Get a valid region from the source zone */ ret = dmz_first_valid_block(zmd, src_zone, &block); if (ret <= 0) @@ -451,6 +454,9 @@ static void dmz_reclaim_work(struct work_struct *work) unsigned int p_unmap_rnd; int ret; + if (dmz_bdev_is_dying(zrc->dev)) + return; + if (!dmz_should_reclaim(zrc)) { mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); return; @@ -480,8 +486,16 @@ static void dmz_reclaim_work(struct work_struct *work) p_unmap_rnd, nr_unmap_rnd, nr_rnd); ret = dmz_do_reclaim(zrc); - if (ret) + if (ret) { dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); + if (ret == -EIO) + /* + * LLD might be performing some error handling sequence + * at the underlying device. To not interfere, do not + * attempt to schedule the next reclaim run immediately. + */ + return; + } dmz_schedule_reclaim(zrc); } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 944db71ed3d7..ff3fd011796e 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -133,6 +133,8 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, refcount_inc(&bioctx->ref); generic_make_request(clone); + if (clone->bi_status == BLK_STS_IOERR) + return -EIO; if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone)) zone->wp_block += nr_blocks; @@ -277,8 +279,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz, /* Get the buffer zone. One will be allocated if needed */ bzone = dmz_get_chunk_buffer(zmd, zone); - if (!bzone) - return -ENOSPC; + if (IS_ERR(bzone)) + return PTR_ERR(bzone); if (dmz_is_readonly(bzone)) return -EROFS; @@ -389,6 +391,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, dmz_lock_metadata(zmd); + if (dmz->dev->flags & DMZ_BDEV_DYING) { + ret = -EIO; + goto out; + } + /* * Get the data zone mapping the chunk. There may be no * mapping for read and discard. If a mapping is obtained, @@ -493,6 +500,8 @@ static void dmz_flush_work(struct work_struct *work) /* Flush dirty metadata blocks */ ret = dmz_flush_metadata(dmz->metadata); + if (ret) + dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret); /* Process queued flush requests */ while (1) { @@ -556,6 +565,32 @@ out: return ret; } +/* + * Check the backing device availability. If it's on the way out, + * start failing I/O. Reclaim and metadata components also call this + * function to cleanly abort operation in the event of such failure. + */ +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev) +{ + struct gendisk *disk; + + if (!(dmz_dev->flags & DMZ_BDEV_DYING)) { + disk = dmz_dev->bdev->bd_disk; + if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { + dmz_dev_warn(dmz_dev, "Backing device queue dying"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } else if (disk->fops->check_events) { + if (disk->fops->check_events(disk, 0) & + DISK_EVENT_MEDIA_CHANGE) { + dmz_dev_warn(dmz_dev, "Backing device offline"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + } + } + + return dmz_dev->flags & DMZ_BDEV_DYING; +} + /* * Process a new BIO. */ @@ -569,6 +604,9 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) sector_t chunk_sector; int ret; + if (dmz_bdev_is_dying(dmz->dev)) + return DM_MAPIO_KILL; + dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", bio_op(bio), (unsigned long long)sector, nr_sectors, (unsigned long long)dmz_bio_chunk(dmz->dev, bio), @@ -865,6 +903,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; + if (dmz_bdev_is_dying(dmz->dev)) + return -ENODEV; + *bdev = dmz->dev->bdev; return 0; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index ed8de49c9a08..93a64529f219 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -56,6 +56,8 @@ struct dmz_dev { unsigned int nr_zones; + unsigned int flags; + sector_t zone_nr_sectors; unsigned int zone_nr_sectors_shift; @@ -67,6 +69,9 @@ struct dmz_dev { (dev)->zone_nr_sectors_shift) #define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1)) +/* Device flags. */ +#define DMZ_BDEV_DYING (1 << 0) + /* * Zone descriptor. */ @@ -245,4 +250,9 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc); void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc); void dmz_schedule_reclaim(struct dmz_reclaim *zrc); +/* + * Functions defined in dm-zoned-target.c + */ +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); + #endif /* DM_ZONED_H */ From bae9a0aa331d4cc20bd73c11f91abfceda4b7b29 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Fri, 2 Aug 2019 15:02:50 -0700 Subject: [PATCH 048/266] dm zoned: add SPDX license identifiers Signed-off-by: Dmitry Fomichev Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 1 + drivers/md/dm-zoned-reclaim.c | 1 + drivers/md/dm-zoned-target.c | 1 + drivers/md/dm-zoned.h | 1 + 4 files changed, 4 insertions(+) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 2882897aece2..a033b5b1d77e 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Western Digital Corporation or its affiliates. * diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 9470b8f77a33..8297b7558154 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Western Digital Corporation or its affiliates. * diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index ff3fd011796e..31478fef6032 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Western Digital Corporation or its affiliates. * diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 93a64529f219..d8e70b0ade35 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017 Western Digital Corporation or its affiliates. * From ad1bd578bd5afdf20de0bead42d25f199601211d Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Fri, 2 Aug 2019 15:02:51 -0700 Subject: [PATCH 049/266] dm zoned: fix a few typos Signed-off-by: Dmitry Fomichev Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 8 ++++---- drivers/md/dm-zoned-reclaim.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index a033b5b1d77e..2a5bc51fd6d5 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -35,7 +35,7 @@ * (1) Super block (1 block) * (2) Chunk mapping table (nr_map_blocks) * (3) Bitmap blocks (nr_bitmap_blocks) - * All metadata blocks are stored in conventional zones, starting from the + * All metadata blocks are stored in conventional zones, starting from * the first conventional zone found on disk. */ struct dmz_super { @@ -234,7 +234,7 @@ void dmz_unlock_map(struct dmz_metadata *zmd) * Lock/unlock metadata access. This is a "read" lock on a semaphore * that prevents metadata flush from running while metadata are being * modified. The actual metadata write mutual exclusion is achieved with - * the map lock and zone styate management (active and reclaim state are + * the map lock and zone state management (active and reclaim state are * mutually exclusive). */ void dmz_lock_metadata(struct dmz_metadata *zmd) @@ -1652,7 +1652,7 @@ again: if (op != REQ_OP_WRITE) goto out; - /* Alloate a random zone */ + /* Allocate a random zone */ dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!dzone) { if (dmz_bdev_is_dying(zmd->dev)) { @@ -1753,7 +1753,7 @@ again: if (bzone) goto out; - /* Alloate a random zone */ + /* Allocate a random zone */ bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!bzone) { if (dmz_bdev_is_dying(zmd->dev)) { diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 8297b7558154..d240d7ca8a8a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -438,7 +438,7 @@ static bool dmz_should_reclaim(struct dmz_reclaim *zrc) return false; /* - * If the percentage of unmappped random zones is low, + * If the percentage of unmapped random zones is low, * reclaim even if the target is busy. */ return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND; From 5729b6e5a1bcb0bbc28abe82d749c7392f66d2c7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sat, 10 Aug 2019 12:30:27 -0400 Subject: [PATCH 050/266] dm integrity: fix a crash due to BUG_ON in __journal_read_write() Fix a crash that was introduced by the commit 724376a04d1a. The crash is reported here: https://gitlab.com/cryptsetup/cryptsetup/issues/468 When reading from the integrity device, the function dm_integrity_map_continue calls find_journal_node to find out if the location to read is present in the journal. Then, it calculates how many sectors are consecutively stored in the journal. Then, it locks the range with add_new_range and wait_and_add_new_range. The problem is that during wait_and_add_new_range, we hold no locks (we don't hold ic->endio_wait.lock and we don't hold a range lock), so the journal may change arbitrarily while wait_and_add_new_range sleeps. The code then goes to __journal_read_write and hits BUG_ON(journal_entry_get_sector(je) != logical_sector); because the journal has changed. In order to fix this bug, we need to re-check the journal location after wait_and_add_new_range. We restrict the length to one block in order to not complicate the code too much. Fixes: 724376a04d1a ("dm integrity: implement fair range locks") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index b1b0de402dfc..9118ab85cb3a 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1943,7 +1943,22 @@ offload_to_thread: queue_work(ic->wait_wq, &dio->work); return; } + if (journal_read_pos != NOT_FOUND) + dio->range.n_sectors = ic->sectors_per_block; wait_and_add_new_range(ic, &dio->range); + /* + * wait_and_add_new_range drops the spinlock, so the journal + * may have been changed arbitrarily. We need to recheck. + * To simplify the code, we restrict I/O size to just one block. + */ + if (journal_read_pos != NOT_FOUND) { + sector_t next_sector; + unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (unlikely(new_pos != journal_read_pos)) { + remove_range_unlocked(ic, &dio->range); + goto retry; + } + } } spin_unlock_irq(&ic->endio_wait.lock); From bebd6997163addc1938db8c61754a23ffdf8ccc4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Aug 2019 16:18:26 -0400 Subject: [PATCH 051/266] nfsd: initialize i_private before d_add A process could race in an open and attempt to read one of these files before i_private is initialized, and get a spurious error. Reported-by: Al Viro Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b14f825c62fe..3cf4f6aa48d6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1171,13 +1171,17 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) return inode; } -static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl) { struct inode *inode; inode = nfsd_get_inode(dir->i_sb, mode); if (!inode) return -ENOMEM; + if (ncl) { + inode->i_private = ncl; + kref_get(&ncl->cl_ref); + } d_add(dentry, inode); inc_nlink(dir); fsnotify_mkdir(dir, dentry); @@ -1194,13 +1198,9 @@ static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *nc dentry = d_alloc_name(parent, name); if (!dentry) goto out_err; - ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600); + ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl); if (ret) goto out_err; - if (ncl) { - d_inode(dentry)->i_private = ncl; - kref_get(&ncl->cl_ref); - } out: inode_unlock(dir); return dentry; From d34b044038bfb0e19caa8b019910efc465f41d5f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 15 Aug 2019 15:22:23 +0100 Subject: [PATCH 052/266] tools: bpftool: close prog FD before exit on showing a single program When showing metadata about a single program by invoking "bpftool prog show PROG", the file descriptor referring to the program is not closed before returning from the function. Let's close it. Fixes: 71bb428fe2c1 ("tools: bpf: add bpftool") Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/prog.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 66f04a4846a5..43fdbbfe41bb 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -363,7 +363,9 @@ static int do_show(int argc, char **argv) if (fd < 0) return -1; - return show_prog(fd); + err = show_prog(fd); + close(fd); + return err; } if (argc) From a53358a31c989c360ea59536d28762b9d2d68d19 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Mon, 5 Aug 2019 18:18:37 +0200 Subject: [PATCH 053/266] drm: rcar_lvds: Fix dual link mode operations The R-Car LVDS encoder units support dual-link operations by splitting the pixel output between the primary encoder and the companion encoder. Currently the companion encoder fails at probe time, causing the registration of the primary to fail as well, preventing the whole DU unit from being registered at all. Fix this by not bailing out from probe with error if the "renesas,companion" property is not specified. Fixes: fa440d870358 ("drm: rcar-du: lvds: Add support for dual-link mode") Reported-by: Fabrizio Castro Signed-off-by: Jacopo Mondi Reviewed-by: Laurent Pinchart Signed-off-by: Laurent Pinchart --- drivers/gpu/drm/rcar-du/rcar_lvds.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_lvds.c b/drivers/gpu/drm/rcar-du/rcar_lvds.c index 1c62578590f4..082d02c84024 100644 --- a/drivers/gpu/drm/rcar-du/rcar_lvds.c +++ b/drivers/gpu/drm/rcar-du/rcar_lvds.c @@ -673,10 +673,8 @@ static int rcar_lvds_parse_dt_companion(struct rcar_lvds *lvds) /* Locate the companion LVDS encoder for dual-link operation, if any. */ companion = of_parse_phandle(dev->of_node, "renesas,companion", 0); - if (!companion) { - dev_err(dev, "Companion LVDS encoder not found\n"); - return -ENXIO; - } + if (!companion) + return 0; /* * Sanity check: the companion encoder must have the same compatible From 4f8c6aba37da199155a121c6cdc38505a9eb0259 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 13 Aug 2019 14:41:47 -0700 Subject: [PATCH 054/266] clk: Fix falling back to legacy parent string matching Calls to clk_core_get() will return ERR_PTR(-EINVAL) if we've started migrating a clk driver to use the DT based style of specifying parents but we haven't made any DT updates yet. This happens when we pass a non-NULL value as the 'name' argument of of_parse_clkspec(). That function returns -EINVAL in such a situation, instead of -ENOENT like we expected. The return value comes back up to clk_core_fill_parent_index() which proceeds to skip calling clk_core_lookup() because the error pointer isn't equal to -ENOENT, it's -EINVAL. Furthermore, we blindly overwrite the error pointer returned by clk_core_get() with NULL when there isn't a legacy .name member specified in the parent map. This isn't too bad right now because we don't really care to differentiate NULL from an error, but in the future we should only try to do a legacy lookup if we know we might find something. This way DT lookups that fail don't try to lookup based on strings when there isn't any string to match, hiding the error from DT parsing. Fix both these problems so that clk provider drivers can use the new style of parent mapping without having to also update their DT at the same time. This patch is based on an earlier patch from Taniya Das which checked for -EINVAL in addition to -ENOENT return values from clk_core_get(). Fixes: 601b6e93304a ("clk: Allow parents to be specified via clkspec index") Cc: Taniya Das Cc: Jerome Brunet Cc: Chen-Yu Tsai Reported-by: Taniya Das Signed-off-by: Stephen Boyd Link: https://lkml.kernel.org/r/20190813214147.34394-1-sboyd@kernel.org Tested-by: Taniya Das --- drivers/clk/clk.c | 46 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index c0990703ce54..8bce6bb4a965 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -324,6 +324,25 @@ static struct clk_core *clk_core_lookup(const char *name) return NULL; } +#ifdef CONFIG_OF +static int of_parse_clkspec(const struct device_node *np, int index, + const char *name, struct of_phandle_args *out_args); +static struct clk_hw * +of_clk_get_hw_from_clkspec(struct of_phandle_args *clkspec); +#else +static inline int of_parse_clkspec(const struct device_node *np, int index, + const char *name, + struct of_phandle_args *out_args) +{ + return -ENOENT; +} +static inline struct clk_hw * +of_clk_get_hw_from_clkspec(struct of_phandle_args *clkspec) +{ + return ERR_PTR(-ENOENT); +} +#endif + /** * clk_core_get - Find the clk_core parent of a clk * @core: clk to find parent of @@ -355,8 +374,9 @@ static struct clk_core *clk_core_lookup(const char *name) * }; * * Returns: -ENOENT when the provider can't be found or the clk doesn't - * exist in the provider. -EINVAL when the name can't be found. NULL when the - * provider knows about the clk but it isn't provided on this system. + * exist in the provider or the name can't be found in the DT node or + * in a clkdev lookup. NULL when the provider knows about the clk but it + * isn't provided on this system. * A valid clk_core pointer when the clk can be found in the provider. */ static struct clk_core *clk_core_get(struct clk_core *core, u8 p_index) @@ -367,17 +387,19 @@ static struct clk_core *clk_core_get(struct clk_core *core, u8 p_index) struct device *dev = core->dev; const char *dev_id = dev ? dev_name(dev) : NULL; struct device_node *np = core->of_node; + struct of_phandle_args clkspec; - if (np && (name || index >= 0)) - hw = of_clk_get_hw(np, index, name); - - /* - * If the DT search above couldn't find the provider or the provider - * didn't know about this clk, fallback to looking up via clkdev based - * clk_lookups - */ - if (PTR_ERR(hw) == -ENOENT && name) + if (np && (name || index >= 0) && + !of_parse_clkspec(np, index, name, &clkspec)) { + hw = of_clk_get_hw_from_clkspec(&clkspec); + of_node_put(clkspec.np); + } else if (name) { + /* + * If the DT search above couldn't find the provider fallback to + * looking up via clkdev based clk_lookups. + */ hw = clk_find_hw(dev_id, name); + } if (IS_ERR(hw)) return ERR_CAST(hw); @@ -401,7 +423,7 @@ static void clk_core_fill_parent_index(struct clk_core *core, u8 index) parent = ERR_PTR(-EPROBE_DEFER); } else { parent = clk_core_get(core, index); - if (IS_ERR(parent) && PTR_ERR(parent) == -ENOENT) + if (IS_ERR(parent) && PTR_ERR(parent) == -ENOENT && entry->name) parent = clk_core_lookup(entry->name); } From 24876f09a7dfe36a82f53d304d8c1bceb3257a0f Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Fri, 16 Aug 2019 00:31:55 +0200 Subject: [PATCH 055/266] clk: Fix potential NULL dereference in clk_fetch_parent_index() Don't compare the parent clock name with a NULL name in the clk_parent_map. This prevents a kernel crash when passing NULL core->parents[i].name to strcmp(). An example which triggered this is a mux clock with four parents when each of them is referenced in the clock driver using clk_parent_data.fw_name and then calling clk_set_parent(clk, 3rd_parent) on this mux. In this case the first parent is also the HW default so core->parents[i].hw is populated when the clock is registered. Calling clk_set_parent(clk, 3rd_parent) will then go through all parents and skip the first parent because it's hw pointer doesn't match. For the second parent no hw pointer is cached yet and clk_core_get(core, 1) returns a non-matching pointer (which is correct because we are comparing the second with the third parent). Comparing the result of clk_core_get(core, 2) with the requested parent gives a match. However we don't reach this point because right after the clk_core_get(core, 1) mismatch the old code tried to !strcmp(parent->name, NULL) (where the second argument is actually core->parents[i].name, but that was never populated by the clock driver). Signed-off-by: Martin Blumenstingl Link: https://lkml.kernel.org/r/20190815223155.21384-1-martin.blumenstingl@googlemail.com Fixes: fc0c209c147f ("clk: Allow parents to be specified without string names") Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 8bce6bb4a965..1c46babeb093 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1654,7 +1654,8 @@ static int clk_fetch_parent_index(struct clk_core *core, break; /* Fallback to comparing globally unique names */ - if (!strcmp(parent->name, core->parents[i].name)) + if (core->parents[i].name && + !strcmp(parent->name, core->parents[i].name)) break; } From 78e70e780b289ff59ec33a9f9c1fcecaf17a46e1 Mon Sep 17 00:00:00 2001 From: He Zhe Date: Tue, 6 Aug 2019 17:41:04 +0800 Subject: [PATCH 056/266] nfsd4: Fix kernel crash when reading proc file reply_cache_stats reply_cache_stats uses wrong parameter as seq file private structure and thus causes the following kernel crash when users read /proc/fs/nfsd/reply_cache_stats BUG: kernel NULL pointer dereference, address: 00000000000001f9 PGD 0 P4D 0 Oops: 0000 [#3] SMP PTI CPU: 6 PID: 1502 Comm: cat Tainted: G D 5.3.0-rc3+ #1 Hardware name: Intel Corporation Broadwell Client platform/Basking Ridge, BIOS BDW-E2R1.86C.0118.R01.1503110618 03/11/2015 RIP: 0010:nfsd_reply_cache_stats_show+0x3b/0x2d0 Code: 41 54 49 89 f4 48 89 fe 48 c7 c7 b3 10 33 88 53 bb e8 03 00 00 e8 88 82 d1 ff bf 58 89 41 00 e8 eb c5 85 00 48 83 eb 01 75 f0 <41> 8b 94 24 f8 01 00 00 48 c7 c6 be 10 33 88 4c 89 ef bb e8 03 00 RSP: 0018:ffffaa520106fe08 EFLAGS: 00010246 RAX: 000000cfe1a77123 RBX: 0000000000000000 RCX: 0000000000291b46 RDX: 000000cf00000000 RSI: 0000000000000006 RDI: 0000000000291b28 RBP: ffffaa520106fe20 R08: 0000000000000006 R09: 000000cfe17e55dd R10: ffffa424e47c0000 R11: 000000000000030b R12: 0000000000000001 R13: ffffa424e5697000 R14: 0000000000000001 R15: ffffa424e5697000 FS: 00007f805735f580(0000) GS:ffffa424f8f80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000001f9 CR3: 00000000655ce005 CR4: 00000000003606e0 Call Trace: seq_read+0x194/0x3e0 __vfs_read+0x1b/0x40 vfs_read+0x95/0x140 ksys_read+0x61/0xe0 __x64_sys_read+0x1a/0x20 do_syscall_64+0x4d/0x120 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f805728b861 Code: fe ff ff 50 48 8d 3d 86 b4 09 00 e8 79 e0 01 00 66 0f 1f 84 00 00 00 00 00 48 8d 05 d9 19 0d 00 8b 00 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 c3 66 0f 1f 44 00 00 48 83 ec 28 48 89 54 RSP: 002b:00007ffea1ce3c38 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f805728b861 RDX: 0000000000020000 RSI: 00007f8057183000 RDI: 0000000000000003 RBP: 00007f8057183000 R08: 00007f8057182010 R09: 0000000000000000 R10: 0000000000000022 R11: 0000000000000246 R12: 0000559a60e8ff10 R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 Modules linked in: CR2: 00000000000001f9 ---[ end trace 01613595153f0cba ]--- RIP: 0010:nfsd_reply_cache_stats_show+0x3b/0x2d0 Code: 41 54 49 89 f4 48 89 fe 48 c7 c7 b3 10 33 88 53 bb e8 03 00 00 e8 88 82 d1 ff bf 58 89 41 00 e8 eb c5 85 00 48 83 eb 01 75 f0 <41> 8b 94 24 f8 01 00 00 48 c7 c6 be 10 33 88 4c 89 ef bb e8 03 00 RSP: 0018:ffffaa52004b3e08 EFLAGS: 00010246 RAX: 0000002bab45a7c6 RBX: 0000000000000000 RCX: 0000000000291b4c RDX: 0000002b00000000 RSI: 0000000000000004 RDI: 0000000000291b28 RBP: ffffaa52004b3e20 R08: 0000000000000004 R09: 0000002bab1c8c7a R10: ffffa424e5500000 R11: 00000000000002a9 R12: 0000000000000001 R13: ffffa424e4475000 R14: 0000000000000001 R15: ffffa424e4475000 FS: 00007f805735f580(0000) GS:ffffa424f8f80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000001f9 CR3: 00000000655ce005 CR4: 00000000003606e0 Killed Fixes: 3ba75830ce17 ("nfsd4: drc containerization") Signed-off-by: He Zhe Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 26ad75ae2be0..96352ab7bd81 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -571,7 +571,7 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) */ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) { - struct nfsd_net *nn = v; + struct nfsd_net *nn = m->private; seq_printf(m, "max entries: %u\n", nn->max_drc_entries); seq_printf(m, "num entries: %u\n", From df451f83e1fc0fa3764a2724b0faaaf9d07ab1b6 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Fri, 16 Aug 2019 18:50:00 +0200 Subject: [PATCH 057/266] gpio: of: fix Freescale SPI CS quirk handling On the gta04 we see: spi_gpio: probe of spi_lcd failed with error -2 The quirk introduced in commit e3023bf80639 ("gpio: of: Handle the Freescale SPI CS") can also be triggered by a temporary -EPROBE_DEFER and so "convert" it to a hard -ENOENT. Disable that conversion by checking for -EPROBE_DEFER. Fixes: e3023bf80639 ("gpio: of: Handle the Freescale SPI CS") Suggested-by: H. Nikolaus Schaller Signed-off-by: Andreas Kemnade Link: https://lore.kernel.org/r/20190816165000.32334-1-andreas@kemnade.info Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 567fb98c0892..9762dd6d99fa 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -363,7 +363,7 @@ struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id, /* Special handling for SPI GPIOs if used */ if (IS_ERR(desc)) desc = of_find_spi_gpio(dev, con_id, &of_flags); - if (IS_ERR(desc)) { + if (IS_ERR(desc) && PTR_ERR(desc) != -EPROBE_DEFER) { /* This quirk looks up flags and all */ desc = of_find_spi_cs_gpio(dev, con_id, idx, flags); if (!IS_ERR(desc)) From 314e01a6d7ddf04608440beb087b21d8aa32f03f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Aug 2019 23:41:06 -0700 Subject: [PATCH 058/266] xfs: fall back to native ioctls for unhandled compat ones Always try the native ioctl if we don't have a compat handler. This removes a lot of boilerplate code as 'modern' ioctls should generally be compat clean, and fixes the missing entries for the recently added FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL ioctls. Fixes: f7664b31975b ("xfs: implement online get/set fs label") Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_ioctl32.c | 54 ++------------------------------------------ 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 7fcf7569743f..bae08ef92ac3 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -553,57 +553,6 @@ xfs_file_compat_ioctl( trace_xfs_file_compat_ioctl(ip); switch (cmd) { - /* No size or alignment issues on any arch */ - case XFS_IOC_DIOINFO: - case XFS_IOC_FSGEOMETRY_V4: - case XFS_IOC_FSGEOMETRY: - case XFS_IOC_AG_GEOMETRY: - case XFS_IOC_FSGETXATTR: - case XFS_IOC_FSSETXATTR: - case XFS_IOC_FSGETXATTRA: - case XFS_IOC_FSSETDM: - case XFS_IOC_GETBMAP: - case XFS_IOC_GETBMAPA: - case XFS_IOC_GETBMAPX: - case XFS_IOC_FSCOUNTS: - case XFS_IOC_SET_RESBLKS: - case XFS_IOC_GET_RESBLKS: - case XFS_IOC_FSGROWFSLOG: - case XFS_IOC_GOINGDOWN: - case XFS_IOC_ERROR_INJECTION: - case XFS_IOC_ERROR_CLEARALL: - case FS_IOC_GETFSMAP: - case XFS_IOC_SCRUB_METADATA: - case XFS_IOC_BULKSTAT: - case XFS_IOC_INUMBERS: - return xfs_file_ioctl(filp, cmd, p); -#if !defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32) - /* - * These are handled fine if no alignment issues. To support x32 - * which uses native 64-bit alignment we must emit these cases in - * addition to the ia-32 compat set below. - */ - case XFS_IOC_ALLOCSP: - case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - case XFS_IOC_FSGEOMETRY_V1: - case XFS_IOC_FSGROWFSDATA: - case XFS_IOC_FSGROWFSRT: - case XFS_IOC_ZERO_RANGE: -#ifdef CONFIG_X86_X32 - /* - * x32 special: this gets a different cmd number from the ia-32 compat - * case below; the associated data will match native 64-bit alignment. - */ - case XFS_IOC_SWAPEXT: -#endif - return xfs_file_ioctl(filp, cmd, p); -#endif #if defined(BROKEN_X86_ALIGNMENT) case XFS_IOC_ALLOCSP_32: case XFS_IOC_FREESP_32: @@ -705,6 +654,7 @@ xfs_file_compat_ioctl( case XFS_IOC_FSSETDM_BY_HANDLE_32: return xfs_compat_fssetdm_by_handle(filp, arg); default: - return -ENOIOCTLCMD; + /* try the native version */ + return xfs_file_ioctl(filp, cmd, p); } } From 4529e6d7a6ab727aa85b1dd3cbfa9e82f10f730d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Aug 2019 23:41:06 -0700 Subject: [PATCH 059/266] xfs: compat_ioctl: use compat_ptr() For 31-bit s390 user space, we have to pass pointer arguments through compat_ptr() in the compat_ioctl handler. Signed-off-by: Arnd Bergmann Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_ioctl32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index bae08ef92ac3..7bd7534f5051 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -547,7 +547,7 @@ xfs_file_compat_ioctl( struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - void __user *arg = (void __user *)p; + void __user *arg = compat_ptr(p); int error; trace_xfs_file_compat_ioctl(ip); @@ -655,6 +655,6 @@ xfs_file_compat_ioctl( return xfs_compat_fssetdm_by_handle(filp, arg); default: /* try the native version */ - return xfs_file_ioctl(filp, cmd, p); + return xfs_file_ioctl(filp, cmd, (unsigned long)arg); } } From edc58dd0123b552453a74369bd0c8d890b497b4b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 11 Aug 2019 15:52:25 -0700 Subject: [PATCH 060/266] vfs: fix page locking deadlocks when deduping files When dedupe wants to use the page cache to compare parts of two files for dedupe, we must be very careful to handle locking correctly. The current code doesn't do this. It must lock and unlock the page only once if the two pages are the same, since the overlapping range check doesn't catch this when blocksize < pagesize. If the pages are distinct but from the same file, we must observe page locking order and lock them in order of increasing offset to avoid clashing with writeback locking. Fixes: 876bec6f9bbfcb3 ("vfs: refactor clone/dedupe_file_range common functions") Signed-off-by: Darrick J. Wong Reviewed-by: Bill O'Donnell Reviewed-by: Matthew Wilcox (Oracle) --- fs/read_write.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 1f5088dec566..5bbf587f5bc1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1811,10 +1811,7 @@ static int generic_remap_check_len(struct inode *inode_in, return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ +/* Read a page's worth of file data into the page cache. */ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) { struct page *page; @@ -1826,10 +1823,32 @@ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) put_page(page); return ERR_PTR(-EIO); } - lock_page(page); return page; } +/* + * Lock two pages, ensuring that we lock in offset order if the pages are from + * the same file. + */ +static void vfs_lock_two_pages(struct page *page1, struct page *page2) +{ + /* Always lock in order of increasing index. */ + if (page1->index > page2->index) + swap(page1, page2); + + lock_page(page1); + if (page1 != page2) + lock_page(page2); +} + +/* Unlock two pages, being careful not to unlock the same page twice. */ +static void vfs_unlock_two_pages(struct page *page1, struct page *page2) +{ + unlock_page(page1); + if (page1 != page2) + unlock_page(page2); +} + /* * Compare extents of two files to see if they are the same. * Caller must have locked both inodes to prevent write races. @@ -1867,10 +1886,24 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, dest_page = vfs_dedupe_get_page(dest, destoff); if (IS_ERR(dest_page)) { error = PTR_ERR(dest_page); - unlock_page(src_page); put_page(src_page); goto out_error; } + + vfs_lock_two_pages(src_page, dest_page); + + /* + * Now that we've locked both pages, make sure they're still + * mapped to the file data we're interested in. If not, + * someone is invalidating pages on us and we lose. + */ + if (!PageUptodate(src_page) || !PageUptodate(dest_page) || + src_page->mapping != src->i_mapping || + dest_page->mapping != dest->i_mapping) { + same = false; + goto unlock; + } + src_addr = kmap_atomic(src_page); dest_addr = kmap_atomic(dest_page); @@ -1882,8 +1915,8 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, kunmap_atomic(dest_addr); kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); +unlock: + vfs_unlock_two_pages(src_page, dest_page); put_page(dest_page); put_page(src_page); From 12ece2d53d3e8f827e972caf497c165f7729c717 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 15 Aug 2019 11:16:24 -0700 Subject: [PATCH 061/266] x86/cpu: Explain Intel model naming convention Dave Hansen spelled out the rules in an e-mail: https://lkml.kernel.org/r/91eefbe4-e32b-d762-be4d-672ff915db47@intel.com Copy those right into the file to make it easy for people to find them. Suggested-by: Borislav Petkov Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Dave Hansen Cc: Ingo Molnar Cc: x86-ml Link: https://lkml.kernel.org/r/20190815224704.GA10025@agluck-desk2.amr.corp.intel.com --- arch/x86/include/asm/intel-family.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 0278aa66ef62..fe7c205233f1 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -11,6 +11,21 @@ * While adding a new CPUID for a new microarchitecture, add a new * group to keep logically sorted out in chronological order. Within * that group keep the CPUID for the variants sorted by model number. + * + * The defined symbol names have the following form: + * INTEL_FAM6{OPTFAMILY}_{MICROARCH}{OPTDIFF} + * where: + * OPTFAMILY Describes the family of CPUs that this belongs to. Default + * is assumed to be "_CORE" (and should be omitted). Other values + * currently in use are _ATOM and _XEON_PHI + * MICROARCH Is the code name for the micro-architecture for this core. + * N.B. Not the platform name. + * OPTDIFF If needed, a short string to differentiate by market segment. + * Exact strings here will vary over time. _DESKTOP, _MOBILE, and + * _X (short for Xeon server) should be used when they are + * appropriate. + * + * The #define line may optionally include a comment including platform names. */ #define INTEL_FAM6_CORE_YONAH 0x0E From 165d42c012be69900f0e2f8545626cb9e7d4a832 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Sat, 6 Jul 2019 19:00:21 +0530 Subject: [PATCH 062/266] drm/mediatek: mtk_drm_drv.c: Add of_node_put() before goto Each iteration of for_each_child_of_node puts the previous node, but in the case of a goto from the middle of the loop, there is no put, thus causing a memory leak. Hence add an of_node_put before the goto in two places. Issue found with Coccinelle. Fixes: 119f5173628a (drm/mediatek: Add DRM Driver for Mediatek SoC MT8173) Signed-off-by: Nishka Dasgupta Signed-off-by: CK Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index c021d4c8324f..7f5408cb2377 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -567,12 +567,15 @@ static int mtk_drm_probe(struct platform_device *pdev) comp = devm_kzalloc(dev, sizeof(*comp), GFP_KERNEL); if (!comp) { ret = -ENOMEM; + of_node_put(node); goto err_node; } ret = mtk_ddp_comp_init(dev, node, comp, comp_id, NULL); - if (ret) + if (ret) { + of_node_put(node); goto err_node; + } private->ddp_comp[comp_id] = comp; } From 5d888b481e6abc726b36c86f0bf13af1583bb336 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 14 Aug 2019 17:38:09 -0700 Subject: [PATCH 063/266] xfs: fix reflink source file racing with directio writes While trawling through the dedupe file comparison code trying to fix page deadlocking problems, Dave Chinner noticed that the reflink code only takes shared IOLOCK/MMAPLOCKs on the source file. Because page_mkwrite and directio writes do not take the EXCL versions of those locks, this means that reflink can race with writer processes. For pure remapping this can lead to undefined behavior and file corruption; for dedupe this means that we cannot be sure that the contents are identical when we decide to go ahead with the remapping. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 65 ++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c4ec7afd1170..edbe37b7f636 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1190,11 +1190,11 @@ xfs_reflink_remap_blocks( } /* - * Grab the exclusive iolock for a data copy from src to dest, making - * sure to abide vfs locking order (lowest pointer value goes first) and - * breaking the pnfs layout leases on dest before proceeding. The loop - * is needed because we cannot call the blocking break_layout() with the - * src iolock held, and therefore have to back out both locks. + * Grab the exclusive iolock for a data copy from src to dest, making sure to + * abide vfs locking order (lowest pointer value goes first) and breaking the + * layout leases before proceeding. The loop is needed because we cannot call + * the blocking break_layout() with the iolocks held, and therefore have to + * back out both locks. */ static int xfs_iolock_two_inodes_and_break_layout( @@ -1203,33 +1203,44 @@ xfs_iolock_two_inodes_and_break_layout( { int error; -retry: - if (src < dest) { - inode_lock_shared(src); - inode_lock_nested(dest, I_MUTEX_NONDIR2); - } else { - /* src >= dest */ - inode_lock(dest); - } + if (src > dest) + swap(src, dest); - error = break_layout(dest, false); - if (error == -EWOULDBLOCK) { - inode_unlock(dest); - if (src < dest) - inode_unlock_shared(src); +retry: + /* Wait to break both inodes' layouts before we start locking. */ + error = break_layout(src, true); + if (error) + return error; + if (src != dest) { error = break_layout(dest, true); if (error) return error; - goto retry; } + + /* Lock one inode and make sure nobody got in and leased it. */ + inode_lock(src); + error = break_layout(src, false); if (error) { - inode_unlock(dest); - if (src < dest) - inode_unlock_shared(src); + inode_unlock(src); + if (error == -EWOULDBLOCK) + goto retry; return error; } - if (src > dest) - inode_lock_shared_nested(src, I_MUTEX_NONDIR2); + + if (src == dest) + return 0; + + /* Lock the other inode and make sure nobody got in and leased it. */ + inode_lock_nested(dest, I_MUTEX_NONDIR2); + error = break_layout(dest, false); + if (error) { + inode_unlock(src); + inode_unlock(dest); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + return 0; } @@ -1247,10 +1258,10 @@ xfs_reflink_remap_unlock( xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); inode_unlock(inode_out); if (!same_inode) - inode_unlock_shared(inode_in); + inode_unlock(inode_in); } /* @@ -1325,7 +1336,7 @@ xfs_reflink_remap_prep( if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); else - xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest, + xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest, XFS_MMAPLOCK_EXCL); /* Check file eligibility and prepare for block sharing. */ From 0d7342c3637462fc6291b392ced9af0f4ca4dab4 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 14 Aug 2019 15:35:02 +0300 Subject: [PATCH 064/266] MAINTAINERS: Remove IP MASQUERADING record This entry is in MAINTAINERS for historical purpose. It doesn't match current sources since the commit adf82accc5f5 ("netfilter: x_tables: merge ip and ipv6 masquerade modules") moved the module. The net/netfilter/xt_MASQUERADE.c module is already under the netfilter section. Thus, there is no purpose to keep this separate entry in MAINTAINERS. Cc: Florian Westphal Cc: Juanjo Ciarlante Cc: netfilter-devel@vger.kernel.org Suggested-by: Pablo Neira Ayuso Signed-off-by: Denis Efremov Signed-off-by: Pablo Neira Ayuso --- MAINTAINERS | 5 ----- 1 file changed, 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a416574780d6..6839cfd91dde 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8439,11 +8439,6 @@ S: Maintained F: fs/io_uring.c F: include/uapi/linux/io_uring.h -IP MASQUERADING -M: Juanjo Ciarlante -S: Maintained -F: net/ipv4/netfilter/ipt_MASQUERADE.c - IPMI SUBSYSTEM M: Corey Minyard L: openipmi-developer@lists.sourceforge.net (moderated for non-subscribers) From f20faa06d83de440bec8e200870784c3458793c4 Mon Sep 17 00:00:00 2001 From: Todd Seidelmann Date: Wed, 14 Aug 2019 10:54:16 -0400 Subject: [PATCH 065/266] netfilter: ebtables: Fix argument order to ADD_COUNTER The ordering of arguments to the x_tables ADD_COUNTER macro appears to be wrong in ebtables (cf. ip_tables.c, ip6_tables.c, and arp_tables.c). This causes data corruption in the ebtables userspace tools because they get incorrect packet & byte counts from the kernel. Fixes: d72133e628803 ("netfilter: ebtables: use ADD_COUNTER macro") Signed-off-by: Todd Seidelmann Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index c8177a89f52c..4096d8a74a2b 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -221,7 +221,7 @@ unsigned int ebt_do_table(struct sk_buff *skb, return NF_DROP; } - ADD_COUNTER(*(counter_base + i), 1, skb->len); + ADD_COUNTER(*(counter_base + i), skb->len, 1); /* these should only watch: not modify, nor tell us * what to do with the packet @@ -959,8 +959,8 @@ static void get_counters(const struct ebt_counter *oldcounters, continue; counter_base = COUNTER_BASE(oldcounters, nentries, cpu); for (i = 0; i < nentries; i++) - ADD_COUNTER(counters[i], counter_base[i].pcnt, - counter_base[i].bcnt); + ADD_COUNTER(counters[i], counter_base[i].bcnt, + counter_base[i].pcnt); } } @@ -1280,7 +1280,7 @@ static int do_update_counters(struct net *net, const char *name, /* we add to the counters of the first cpu */ for (i = 0; i < num_counters; i++) - ADD_COUNTER(t->private->counters[i], tmp[i].pcnt, tmp[i].bcnt); + ADD_COUNTER(t->private->counters[i], tmp[i].bcnt, tmp[i].pcnt); write_unlock_bh(&t->lock); ret = 0; From 14c415862c0630e01712a4eeaf6159a2b1b6d2a4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Aug 2019 11:23:58 +0200 Subject: [PATCH 066/266] netfilter: nft_flow_offload: missing netlink attribute policy The netlink attribute policy for NFTA_FLOW_TABLE_NAME is missing. Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 060a4ed46d5e..01705ad74a9a 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -149,6 +149,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hook_mask); } +static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { + [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, + .len = NFT_NAME_MAXLEN - 1 }, +}; + static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -207,6 +212,7 @@ static const struct nft_expr_ops nft_flow_offload_ops = { static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, + .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; From 89a26cd4b501e9511d3cd3d22327fc76a75a38b3 Mon Sep 17 00:00:00 2001 From: Juliana Rodrigueiro Date: Fri, 16 Aug 2019 17:02:22 +0200 Subject: [PATCH 067/266] netfilter: xt_nfacct: Fix alignment mismatch in xt_nfacct_match_info When running a 64-bit kernel with a 32-bit iptables binary, the size of the xt_nfacct_match_info struct diverges. kernel: sizeof(struct xt_nfacct_match_info) : 40 iptables: sizeof(struct xt_nfacct_match_info)) : 36 Trying to append nfacct related rules results in an unhelpful message. Although it is suggested to look for more information in dmesg, nothing can be found there. # iptables -A -m nfacct --nfacct-name iptables: Invalid argument. Run `dmesg' for more information. This patch fixes the memory misalignment by enforcing 8-byte alignment within the struct's first revision. This solution is often used in many other uapi netfilter headers. Signed-off-by: Juliana Rodrigueiro Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_nfacct.h | 5 ++++ net/netfilter/xt_nfacct.c | 36 ++++++++++++++++-------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/include/uapi/linux/netfilter/xt_nfacct.h b/include/uapi/linux/netfilter/xt_nfacct.h index 5c8a4d760ee3..b5123ab8d54a 100644 --- a/include/uapi/linux/netfilter/xt_nfacct.h +++ b/include/uapi/linux/netfilter/xt_nfacct.h @@ -11,4 +11,9 @@ struct xt_nfacct_match_info { struct nf_acct *nfacct; }; +struct xt_nfacct_match_info_v1 { + char name[NFACCT_NAME_MAX]; + struct nf_acct *nfacct __attribute__((aligned(8))); +}; + #endif /* _XT_NFACCT_MATCH_H */ diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index d0ab1adf5bff..5aab6df74e0f 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -54,25 +54,39 @@ nfacct_mt_destroy(const struct xt_mtdtor_param *par) nfnl_acct_put(info->nfacct); } -static struct xt_match nfacct_mt_reg __read_mostly = { - .name = "nfacct", - .family = NFPROTO_UNSPEC, - .checkentry = nfacct_mt_checkentry, - .match = nfacct_mt, - .destroy = nfacct_mt_destroy, - .matchsize = sizeof(struct xt_nfacct_match_info), - .usersize = offsetof(struct xt_nfacct_match_info, nfacct), - .me = THIS_MODULE, +static struct xt_match nfacct_mt_reg[] __read_mostly = { + { + .name = "nfacct", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = nfacct_mt_checkentry, + .match = nfacct_mt, + .destroy = nfacct_mt_destroy, + .matchsize = sizeof(struct xt_nfacct_match_info), + .usersize = offsetof(struct xt_nfacct_match_info, nfacct), + .me = THIS_MODULE, + }, + { + .name = "nfacct", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = nfacct_mt_checkentry, + .match = nfacct_mt, + .destroy = nfacct_mt_destroy, + .matchsize = sizeof(struct xt_nfacct_match_info_v1), + .usersize = offsetof(struct xt_nfacct_match_info_v1, nfacct), + .me = THIS_MODULE, + }, }; static int __init nfacct_mt_init(void) { - return xt_register_match(&nfacct_mt_reg); + return xt_register_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg)); } static void __exit nfacct_mt_exit(void) { - xt_unregister_match(&nfacct_mt_reg); + xt_unregister_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg)); } module_init(nfacct_mt_init); From b0fdc01354f45d43f082025636ef808968a27b36 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 16 Aug 2019 18:06:26 +0200 Subject: [PATCH 068/266] sched/core: Schedule new worker even if PI-blocked If a task is PI-blocked (blocking on sleeping spinlock) then we don't want to schedule a new kworker if we schedule out due to lock contention because !RT does not do that as well. A spinning spinlock disables preemption and a worker does not schedule out on lock contention (but spin). On RT the RW-semaphore implementation uses an rtmutex so tsk_is_pi_blocked() will return true if a task blocks on it. In this case we will now start a new worker which may deadlock if one worker is waiting on progress from another worker. Since a RW-semaphore starts a new worker on !RT, we should do the same on RT. XFS is able to trigger this deadlock. Allow to schedule new worker if the current worker is PI-blocked. Signed-off-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190816160626.12742-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..010d578118d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3904,7 +3904,7 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) return; /* @@ -3920,6 +3920,9 @@ static inline void sched_submit_work(struct task_struct *tsk) preempt_enable_no_resched(); } + if (tsk_is_pi_blocked(tsk)) + return; + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. From 77d760328ee015cf89460c52bfd5a6b0a09b7472 Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Fri, 16 Aug 2019 16:43:21 +0800 Subject: [PATCH 069/266] perf/x86: Fix typo in comment No functional change. Signed-off-by: Su Yanjun Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1565945001-4413-1-git-send-email-suyj.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 81b005e4c7d9..325959d19d9a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1236,7 +1236,7 @@ void x86_pmu_enable_event(struct perf_event *event) * Add a single event to the PMU. * * The event is added to the group of enabled events - * but only if it can be scehduled with existing events. + * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) { From f1c6ece23729257fb46562ff9224cf5f61b818da Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 12 Aug 2019 20:43:02 +0200 Subject: [PATCH 070/266] kprobes: Fix potential deadlock in kprobe_optimizer() lockdep reports the following deadlock scenario: WARNING: possible circular locking dependency detected kworker/1:1/48 is trying to acquire lock: 000000008d7a62b2 (text_mutex){+.+.}, at: kprobe_optimizer+0x163/0x290 but task is already holding lock: 00000000850b5e2d (module_mutex){+.+.}, at: kprobe_optimizer+0x31/0x290 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (module_mutex){+.+.}: __mutex_lock+0xac/0x9f0 mutex_lock_nested+0x1b/0x20 set_all_modules_text_rw+0x22/0x90 ftrace_arch_code_modify_prepare+0x1c/0x20 ftrace_run_update_code+0xe/0x30 ftrace_startup_enable+0x2e/0x50 ftrace_startup+0xa7/0x100 register_ftrace_function+0x27/0x70 arm_kprobe+0xb3/0x130 enable_kprobe+0x83/0xa0 enable_trace_kprobe.part.0+0x2e/0x80 kprobe_register+0x6f/0xc0 perf_trace_event_init+0x16b/0x270 perf_kprobe_init+0xa7/0xe0 perf_kprobe_event_init+0x3e/0x70 perf_try_init_event+0x4a/0x140 perf_event_alloc+0x93a/0xde0 __do_sys_perf_event_open+0x19f/0xf30 __x64_sys_perf_event_open+0x20/0x30 do_syscall_64+0x65/0x1d0 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (text_mutex){+.+.}: __lock_acquire+0xfcb/0x1b60 lock_acquire+0xca/0x1d0 __mutex_lock+0xac/0x9f0 mutex_lock_nested+0x1b/0x20 kprobe_optimizer+0x163/0x290 process_one_work+0x22b/0x560 worker_thread+0x50/0x3c0 kthread+0x112/0x150 ret_from_fork+0x3a/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(module_mutex); lock(text_mutex); lock(module_mutex); lock(text_mutex); *** DEADLOCK *** As a reproducer I've been using bcc's funccount.py (https://github.com/iovisor/bcc/blob/master/tools/funccount.py), for example: # ./funccount.py '*interrupt*' That immediately triggers the lockdep splat. Fix by acquiring text_mutex before module_mutex in kprobe_optimizer(). Signed-off-by: Andrea Righi Acked-by: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Linus Torvalds Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: d5b844a2cf50 ("ftrace/x86: Remove possible deadlock between register_kprobe() and ftrace_run_update_code()") Link: http://lkml.kernel.org/r/20190812184302.GA7010@xps-13 Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9873fc627d61..d9770a5393c8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -470,6 +470,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { + lockdep_assert_held(&text_mutex); /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. @@ -487,9 +488,7 @@ static void do_optimize_kprobes(void) list_empty(&optimizing_list)) return; - mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); - mutex_unlock(&text_mutex); } /* @@ -500,6 +499,7 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + lockdep_assert_held(&text_mutex); /* See comment in do_optimize_kprobes() */ lockdep_assert_cpus_held(); @@ -507,7 +507,6 @@ static void do_unoptimize_kprobes(void) if (list_empty(&unoptimizing_list)) return; - mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { @@ -524,7 +523,6 @@ static void do_unoptimize_kprobes(void) } else list_del_init(&op->list); } - mutex_unlock(&text_mutex); } /* Reclaim all kprobes on the free_list */ @@ -556,6 +554,7 @@ static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); cpus_read_lock(); + mutex_lock(&text_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -583,6 +582,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + mutex_unlock(&text_mutex); cpus_read_unlock(); mutex_unlock(&kprobe_mutex); From f897e60a12f0b9146357780d317879bce2a877dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Aug 2019 14:54:07 +0200 Subject: [PATCH 071/266] x86/apic: Handle missing global clockevent gracefully Some newer machines do not advertise legacy timers. The kernel can handle that situation if the TSC and the CPU frequency are enumerated by CPUID or MSRs and the CPU supports TSC deadline timer. If the CPU does not support TSC deadline timer the local APIC timer frequency has to be known as well. Some Ryzens machines do not advertize legacy timers, but there is no reliable way to determine the bus frequency which feeds the local APIC timer when the machine allows overclocking of that frequency. As there is no legacy timer the local APIC timer calibration crashes due to a NULL pointer dereference when accessing the not installed global clock event device. Switch the calibration loop to a non interrupt based one, which polls either TSC (if frequency is known) or jiffies. The latter requires a global clockevent. As the machines which do not have a global clockevent installed have a known TSC frequency this is a non issue. For older machines where TSC frequency is not known, there is no known case where the legacy timers do not exist as that would have been reported long ago. Reported-by: Daniel Drake Reported-by: Jiri Slaby Signed-off-by: Thomas Gleixner Tested-by: Daniel Drake Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1908091443030.21433@nanos.tec.linutronix.de Link: http://bugzilla.opensuse.org/show_bug.cgi?id=1142926#c12 --- arch/x86/kernel/apic/apic.c | 68 +++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f5291362da1a..aa5495d0f478 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -722,7 +722,7 @@ static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; /* - * Temporary interrupt handler. + * Temporary interrupt handler and polled calibration function. */ static void __init lapic_cal_handler(struct clock_event_device *dev) { @@ -851,7 +851,8 @@ bool __init apic_needs_pit(void) static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); - void (*real_handler)(struct clock_event_device *dev); + u64 tsc_perj = 0, tsc_start = 0; + unsigned long jif_start; unsigned long deltaj; long delta, deltatsc; int pm_referenced = 0; @@ -878,29 +879,65 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" "calibrating APIC timer ...\n"); + /* + * There are platforms w/o global clockevent devices. Instead of + * making the calibration conditional on that, use a polling based + * approach everywhere. + */ local_irq_disable(); - /* Replace the global interrupt handler */ - real_handler = global_clock_event->event_handler; - global_clock_event->event_handler = lapic_cal_handler; - /* * Setup the APIC counter to maximum. There is no way the lapic * can underflow in the 100ms detection time frame */ __setup_APIC_LVTT(0xffffffff, 0, 0); - /* Let the interrupts run */ + /* + * Methods to terminate the calibration loop: + * 1) Global clockevent if available (jiffies) + * 2) TSC if available and frequency is known + */ + jif_start = READ_ONCE(jiffies); + + if (tsc_khz) { + tsc_start = rdtsc(); + tsc_perj = div_u64((u64)tsc_khz * 1000, HZ); + } + + /* + * Enable interrupts so the tick can fire, if a global + * clockevent device is available + */ local_irq_enable(); - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) { + /* Wait for a tick to elapse */ + while (1) { + if (tsc_khz) { + u64 tsc_now = rdtsc(); + if ((tsc_now - tsc_start) >= tsc_perj) { + tsc_start += tsc_perj; + break; + } + } else { + unsigned long jif_now = READ_ONCE(jiffies); + + if (time_after(jif_now, jif_start)) { + jif_start = jif_now; + break; + } + } + cpu_relax(); + } + + /* Invoke the calibration routine */ + local_irq_disable(); + lapic_cal_handler(NULL); + local_irq_enable(); + } local_irq_disable(); - /* Restore the real event handler */ - global_clock_event->event_handler = real_handler; - /* Build delta t1-t2 as apic timer counts down */ delta = lapic_cal_t1 - lapic_cal_t2; apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); @@ -943,10 +980,11 @@ static int __init calibrate_APIC_clock(void) levt->features &= ~CLOCK_EVT_FEAT_DUMMY; /* - * PM timer calibration failed or not turned on - * so lets try APIC timer based calibration + * PM timer calibration failed or not turned on so lets try APIC + * timer based calibration, if a global clockevent device is + * available. */ - if (!pm_referenced) { + if (!pm_referenced && global_clock_event) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); /* From 33da8e7c814f77310250bb54a9db36a44c5de784 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 16 Aug 2019 12:33:54 -0500 Subject: [PATCH 072/266] signal: Allow cifs and drbd to receive their terminating signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My recent to change to only use force_sig for a synchronous events wound up breaking signal reception cifs and drbd. I had overlooked the fact that by default kthreads start out with all signals set to SIG_IGN. So a change I thought was safe turned out to have made it impossible for those kernel thread to catch their signals. Reverting the work on force_sig is a bad idea because what the code was doing was very much a misuse of force_sig. As the way force_sig ultimately allowed the signal to happen was to change the signal handler to SIG_DFL. Which after the first signal will allow userspace to send signals to these kernel threads. At least for wake_ack_receiver in drbd that does not appear actively wrong. So correct this problem by adding allow_kernel_signal that will allow signals whose siginfo reports they were sent by the kernel through, but will not allow userspace generated signals, and update cifs and drbd to call allow_kernel_signal in an appropriate place so that their thread can receive this signal. Fixing things this way ensures that userspace won't be able to send signals and cause problems, that it is clear which signals the threads are expecting to receive, and it guarantees that nothing else in the system will be affected. This change was partly inspired by similar cifs and drbd patches that added allow_signal. Reported-by: ronnie sahlberg Reported-by: Christoph Böhmwalder Tested-by: Christoph Böhmwalder Cc: Steve French Cc: Philipp Reisner Cc: David Laight Fixes: 247bc9470b1e ("cifs: fix rmmod regression in cifs.ko caused by force_sig changes") Fixes: 72abe3bcf091 ("signal/cifs: Fix cifs_put_tcp_session to call send_sig instead of force_sig") Fixes: fee109901f39 ("signal/drbd: Use send_sig not force_sig") Fixes: 3cf5d076fb4d ("signal: Remove task parameter from force_sig") Signed-off-by: "Eric W. Biederman" --- drivers/block/drbd/drbd_main.c | 2 ++ fs/cifs/connect.c | 2 +- include/linux/signal.h | 15 ++++++++++++++- kernel/signal.c | 5 +++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 9bd4ddd12b25..5b248763a672 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -322,6 +322,8 @@ static int drbd_thread_setup(void *arg) thi->name[0], resource->name); + allow_kernel_signal(DRBD_SIGKILL); + allow_kernel_signal(SIGXCPU); restart: retval = thi->function(thi); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a15a6e738eb5..1795e80cbdf7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1113,7 +1113,7 @@ cifs_demultiplex_thread(void *p) mempool_resize(cifs_req_poolp, length + cifs_min_rcv); set_freezable(); - allow_signal(SIGKILL); + allow_kernel_signal(SIGKILL); while (server->tcpStatus != CifsExiting) { if (try_to_freeze()) continue; diff --git a/include/linux/signal.h b/include/linux/signal.h index b5d99482d3fe..1a5f88316b08 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -282,6 +282,9 @@ extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); +#define SIG_KTHREAD ((__force __sighandler_t)2) +#define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3) + static inline void allow_signal(int sig) { /* @@ -289,7 +292,17 @@ static inline void allow_signal(int sig) * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ - kernel_sigaction(sig, (__force __sighandler_t)2); + kernel_sigaction(sig, SIG_KTHREAD); +} + +static inline void allow_kernel_signal(int sig) +{ + /* + * Kernel threads handle their own signals. Let the signal code + * know signals sent by the kernel will be handled, so that they + * don't get silently dropped. + */ + kernel_sigaction(sig, SIG_KTHREAD_KERNEL); } static inline void disallow_signal(int sig) diff --git a/kernel/signal.c b/kernel/signal.c index e667be6907d7..534fec266a33 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -90,6 +90,11 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; + /* Only allow kernel generated signals to this kthread */ + if (unlikely((t->flags & PF_KTHREAD) && + (handler == SIG_KTHREAD_KERNEL) && !force)) + return true; + return sig_handler_ignored(handler, sig); } From 38a429c898ddd210cc35463b096389f97c3c5a73 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 19 Aug 2019 16:39:27 +0900 Subject: [PATCH 073/266] netfilter: add include guard to nf_conntrack_h323_types.h Add a header include guard just in case. Signed-off-by: Masahiro Yamada Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_h323_types.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/netfilter/nf_conntrack_h323_types.h b/include/linux/netfilter/nf_conntrack_h323_types.h index 7a6871ac8784..74c6f9241944 100644 --- a/include/linux/netfilter/nf_conntrack_h323_types.h +++ b/include/linux/netfilter/nf_conntrack_h323_types.h @@ -4,6 +4,9 @@ * Copyright (c) 2006 Jing Min Zhao */ +#ifndef _NF_CONNTRACK_H323_TYPES_H +#define _NF_CONNTRACK_H323_TYPES_H + typedef struct TransportAddress_ipAddress { /* SEQUENCE */ int options; /* No use */ unsigned int ip; @@ -931,3 +934,5 @@ typedef struct RasMessage { /* CHOICE */ InfoRequestResponse infoRequestResponse; }; } RasMessage; + +#endif /* _NF_CONNTRACK_H323_TYPES_H */ From b72fb1dcd2ea9d29417711cb302cef3006fa8d5a Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Wed, 7 Aug 2019 14:11:55 -0700 Subject: [PATCH 074/266] HID: wacom: Correct distance scale for 2nd-gen Intuos devices Distance values reported by 2nd-gen Intuos tablets are on an inverted scale (0 == far, 63 == near). We need to change them over to a normal scale before reporting to userspace or else userspace drivers and applications can get confused. Ref: https://github.com/linuxwacom/input-wacom/issues/98 Fixes: eda01dab53 ("HID: wacom: Add four new Intuos devices") Signed-off-by: Jason Gerecke Cc: # v4.4+ Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 50074485b88b..7a9e229e6253 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -846,6 +846,8 @@ static int wacom_intuos_general(struct wacom_wac *wacom) y >>= 1; distance >>= 1; } + if (features->type == INTUOSHT2) + distance = features->distance_max - distance; input_report_abs(input, ABS_X, x); input_report_abs(input, ABS_Y, y); input_report_abs(input, ABS_DISTANCE, distance); From b640be5bc8e4673dc8049cf74176ddedecea5597 Mon Sep 17 00:00:00 2001 From: Even Xu Date: Fri, 9 Aug 2019 21:18:29 +0800 Subject: [PATCH 075/266] HID: intel-ish-hid: ipc: add EHL device id EHL is a new platform using ishtp solution, add its device id to support list. Signed-off-by: Even Xu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/hw-ish.h | 1 + drivers/hid/intel-ish-hid/ipc/pci-ish.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h index 1065692f90e2..5792a104000a 100644 --- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h +++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h @@ -24,6 +24,7 @@ #define ICL_MOBILE_DEVICE_ID 0x34FC #define SPT_H_DEVICE_ID 0xA135 #define CML_LP_DEVICE_ID 0x02FC +#define EHL_Ax_DEVICE_ID 0x4BB3 #define REVISION_ID_CHT_A0 0x6 #define REVISION_ID_CHT_Ax_SI 0x0 diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index aa80b4d3b740..279567baca3d 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -33,6 +33,7 @@ static const struct pci_device_id ish_pci_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, ICL_MOBILE_DEVICE_ID)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, SPT_H_DEVICE_ID)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, CML_LP_DEVICE_ID)}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, EHL_Ax_DEVICE_ID)}, {0, } }; MODULE_DEVICE_TABLE(pci, ish_pci_tbl); From 2d05dba2b25ecb0f8fc3a0b4eb2232da6454a47b Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 12 Aug 2019 18:04:44 +0200 Subject: [PATCH 076/266] HID: cp2112: prevent sleeping function called from invalid context When calling request_threaded_irq() with a CP2112, the function cp2112_gpio_irq_startup() is called in a IRQ context. Therefore we can not sleep, and we can not call cp2112_gpio_direction_input() there. Move the call to cp2112_gpio_direction_input() earlier to have a working driver. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-cp2112.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c index 2310c96ccf4a..db1b55df0d13 100644 --- a/drivers/hid/hid-cp2112.c +++ b/drivers/hid/hid-cp2112.c @@ -1153,8 +1153,6 @@ static unsigned int cp2112_gpio_irq_startup(struct irq_data *d) INIT_DELAYED_WORK(&dev->gpio_poll_worker, cp2112_gpio_poll_callback); - cp2112_gpio_direction_input(gc, d->hwirq); - if (!dev->gpio_poll) { dev->gpio_poll = true; schedule_delayed_work(&dev->gpio_poll_worker, 0); @@ -1204,6 +1202,12 @@ static int __maybe_unused cp2112_allocate_irq(struct cp2112_device *dev, return PTR_ERR(dev->desc[pin]); } + ret = cp2112_gpio_direction_input(&dev->gc, pin); + if (ret < 0) { + dev_err(dev->gc.parent, "Failed to set GPIO to input dir\n"); + goto err_desc; + } + ret = gpiochip_lock_as_irq(&dev->gc, pin); if (ret) { dev_err(dev->gc.parent, "Failed to lock GPIO as interrupt\n"); From 7e10cc25bfa0dd3602bbcf5cc9c759a90eb675dc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Aug 2019 12:06:43 -0400 Subject: [PATCH 077/266] NFS: Don't refresh attributes with mounted-on-file information If we've been given the attributes of the mounted-on-file, then do not use those to check or update the attributes on the application-visible inode. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8a1758200b57..c764cfe456e5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1403,12 +1403,21 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return 0; + /* No fileid? Just exit */ + if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) + return 0; /* Has the inode gone and changed behind our back? */ - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + if (nfsi->fileid != fattr->fileid) { + /* Is this perhaps the mounted-on fileid? */ + if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && + nfsi->fileid == fattr->mounted_on_fileid) + return 0; return -ESTALE; + } if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -ESTALE; + if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) @@ -1768,18 +1777,6 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); -static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, - struct nfs_fattr *fattr) -{ - bool ret1 = true, ret2 = true; - - if (fattr->valid & NFS_ATTR_FATTR_FILEID) - ret1 = (nfsi->fileid == fattr->fileid); - if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) - ret2 = (nfsi->fileid == fattr->mounted_on_fileid); - return ret1 || ret2; -} - /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -1810,7 +1807,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if (!nfs_fileid_valid(nfsi, fattr)) { + /* No fileid? Just exit */ + if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) + return 0; + /* Has the inode gone and changed behind our back? */ + if (nfsi->fileid != fattr->fileid) { + /* Is this perhaps the mounted-on fileid? */ + if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && + nfsi->fileid == fattr->mounted_on_fileid) + return 0; printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, From 90cf500e338ab3f3c0f126ba37e36fb6a9058441 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Aug 2019 15:03:11 -0400 Subject: [PATCH 078/266] NFSv4: Fix return values for nfs4_file_open() Currently, we are translating RPC level errors such as timeouts, as well as interrupts etc into EOPENSTALE, which forces a single replay of the open attempt. What we actually want to do is force the replay only in the cases where the returned error indicates that the file may have changed on the server. So the fix is to spell out the exact set of errors where we want to return EOPENSTALE. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4file.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 96db471ca2e5..339663d04bf8 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -73,13 +73,13 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (IS_ERR(inode)) { err = PTR_ERR(inode); switch (err) { - case -EPERM: - case -EACCES: - case -EDQUOT: - case -ENOSPC: - case -EROFS: - goto out_put_ctx; default: + goto out_put_ctx; + case -ENOENT: + case -ESTALE: + case -EISDIR: + case -ENOTDIR: + case -ELOOP: goto out_drop; } } From 9821421a291b548ef4369c6998745baa36ddecd5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Aug 2019 12:15:07 -0400 Subject: [PATCH 079/266] NFSv4: Fix return value in nfs_finish_open() If the file turns out to be of the wrong type after opening, we want to revalidate the path and retry, so return EOPENSTALE rather than ESTALE. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8d501093660f..0adfd8840110 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1487,7 +1487,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx, if (S_ISREG(file->f_path.dentry->d_inode->i_mode)) nfs_file_set_open_context(file, ctx); else - err = -ESTALE; + err = -EOPENSTALE; out: return err; } From f4340e9314dbfadc48758945f85fc3b16612d06f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Aug 2019 15:19:54 -0400 Subject: [PATCH 080/266] NFSv4/pnfs: Fix a page lock leak in nfs_pageio_resend() If the attempt to resend the pages fails, we need to ensure that we clean up those pages that were not transmitted. Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to application") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.5+ --- fs/nfs/pagelist.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ed4e1b07447b..15c254753f88 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1251,20 +1251,22 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - LIST_HEAD(failed); + LIST_HEAD(pages); desc->pg_io_completion = hdr->io_completion; desc->pg_dreq = hdr->dreq; - while (!list_empty(&hdr->pages)) { - struct nfs_page *req = nfs_list_entry(hdr->pages.next); + list_splice_init(&hdr->pages, &pages); + while (!list_empty(&pages)) { + struct nfs_page *req = nfs_list_entry(pages.next); if (!nfs_pageio_add_request(desc, req)) - nfs_list_move_request(req, &failed); + break; } nfs_pageio_complete(desc); - if (!list_empty(&failed)) { - list_move(&failed, &hdr->pages); - return desc->pg_error < 0 ? desc->pg_error : -EIO; + if (!list_empty(&pages)) { + int err = desc->pg_error < 0 ? desc->pg_error : -EIO; + hdr->completion_ops->error_cleanup(&pages, err); + return err; } return 0; } From eb2c50da9e256dbbb3ff27694440e4c1900cfef8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Aug 2019 18:04:36 -0400 Subject: [PATCH 081/266] NFS: Ensure O_DIRECT reports an error if the bytes read/written is 0 If the attempt to resend the I/O results in no bytes being read/written, we must ensure that we report the error. Signed-off-by: Trond Myklebust Fixes: 0a00b77b331a ("nfs: mirroring support for direct io") Cc: stable@vger.kernel.org # v3.20+ --- fs/nfs/direct.c | 27 ++++++++++++++++++--------- fs/nfs/pagelist.c | 1 + 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0cb442406168..222d7115db71 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -401,15 +401,21 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) unsigned long bytes = 0; struct nfs_direct_req *dreq = hdr->dreq; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) - goto out_put; - spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) dreq->error = hdr->error; - else + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { + spin_unlock(&dreq->lock); + goto out_put; + } + + if (hdr->good_bytes != 0) nfs_direct_good_bytes(dreq, hdr); + if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) + dreq->error = 0; + spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { @@ -782,16 +788,19 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) bool request_commit = false; struct nfs_page *req = nfs_list_entry(hdr->pages.next); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) - goto out_put; - nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) dreq->error = hdr->error; - if (dreq->error == 0) { + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { + spin_unlock(&dreq->lock); + goto out_put; + } + + if (hdr->good_bytes != 0) { nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 15c254753f88..56cefa0ab804 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1266,6 +1266,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, if (!list_empty(&pages)) { int err = desc->pg_error < 0 ? desc->pg_error : -EIO; hdr->completion_ops->error_cleanup(&pages, err); + nfs_set_pgio_error(hdr, err, hdr->io_start); return err; } return 0; From 17d8c5d145000070c581f2a8aa01edc7998582ab Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 14 Aug 2019 14:19:09 -0400 Subject: [PATCH 082/266] NFS: Fix initialisation of I/O result struct in nfs_pgio_rpcsetup Initialise the result count to 0 rather than initialising it to the argument count. The reason is that we want to ensure we record the I/O stats correctly in the case where an error is returned (for instance in the layoutstats). Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 56cefa0ab804..20b3717cd7ca 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, } hdr->res.fattr = &hdr->fattr; - hdr->res.count = count; + hdr->res.count = 0; hdr->res.eof = 0; hdr->res.verf = &hdr->verf; nfs_fattr_init(&hdr->fattr); From 06c9fdf3b9f1acc6e53753c99c54c39764cc979f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 14 Aug 2019 15:42:43 -0400 Subject: [PATCH 083/266] NFS: On fatal writeback errors, we need to call nfs_inode_remove_request() If the writeback error is fatal, we need to remove the tracking structures (i.e. the nfs_page) from the inode. Fixes: 6fbda89b257f ("NFS: Replace custom error reporting mechanism...") Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 92d9cadc6102..3399149435ce 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -57,6 +57,7 @@ static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_inode_remove_request(struct nfs_page *req); static void nfs_clear_request_commit(struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); @@ -591,7 +592,9 @@ release_request: static void nfs_write_error(struct nfs_page *req, int error) { + nfs_set_pageerror(page_file_mapping(req->wb_page)); nfs_mapping_set_error(req->wb_page, error); + nfs_inode_remove_request(req); nfs_end_page_writeback(req); nfs_release_request(req); } From 0a46fff2f9108c2c44218380a43a736cf4612541 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 13 Aug 2019 16:16:54 +0300 Subject: [PATCH 084/266] x86/boot/compressed/64: Fix boot on machines with broken E820 table BIOS on Samsung 500C Chromebook reports very rudimentary E820 table that consists of 2 entries: BIOS-e820: [mem 0x0000000000000000-0x0000000000000fff] usable BIOS-e820: [mem 0x00000000fffff000-0x00000000ffffffff] reserved It breaks logic in find_trampoline_placement(): bios_start lands on the end of the first 4k page and trampoline start gets placed below 0. Detect underflow and don't touch bios_start for such cases. It makes kernel ignore E820 table on machines that doesn't have two usable pages below BIOS_START_MAX. Fixes: 1b3a62643660 ("x86/boot/compressed/64: Validate trampoline placement against E820") Signed-off-by: Kirill A. Shutemov Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: x86-ml Link: https://bugzilla.kernel.org/show_bug.cgi?id=203463 Link: https://lkml.kernel.org/r/20190813131654.24378-1-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/pgtable_64.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 5f2d03067ae5..2faddeb0398a 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -72,6 +72,8 @@ static unsigned long find_trampoline_placement(void) /* Find the first usable memory region under bios_start. */ for (i = boot_params->e820_entries - 1; i >= 0; i--) { + unsigned long new; + entry = &boot_params->e820_table[i]; /* Skip all entries above bios_start. */ @@ -84,15 +86,20 @@ static unsigned long find_trampoline_placement(void) /* Adjust bios_start to the end of the entry if needed. */ if (bios_start > entry->addr + entry->size) - bios_start = entry->addr + entry->size; + new = entry->addr + entry->size; /* Keep bios_start page-aligned. */ - bios_start = round_down(bios_start, PAGE_SIZE); + new = round_down(new, PAGE_SIZE); /* Skip the entry if it's too small. */ - if (bios_start - TRAMPOLINE_32BIT_SIZE < entry->addr) + if (new - TRAMPOLINE_32BIT_SIZE < entry->addr) continue; + /* Protect against underflow. */ + if (new - TRAMPOLINE_32BIT_SIZE > bios_start) + break; + + bios_start = new; break; } From f9ef724d4896763479f3921afd1ee61552fc9836 Mon Sep 17 00:00:00 2001 From: Jeronimo Borque Date: Sun, 18 Aug 2019 22:35:38 -0300 Subject: [PATCH 085/266] ALSA: hda - Fixes inverted Conexant GPIO mic mute led "enabled" parameter historically referred to the device input or output, not to the led indicator. After the changes added with the led helper functions the mic mute led logic refers to the led and not to the mic input which caused led indicator to be negated. Fixing logic in cxt_update_gpio_led and updated cxt_fixup_gpio_mute_hook Also updated debug messages to ease further debugging if necessary. Fixes: 184e302b46c9 ("ALSA: hda/conexant - Use the mic-mute LED helper") Suggested-by: Takashi Iwai Signed-off-by: Jeronimo Borque Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 14298ef45b21..968d3caab6ac 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -611,18 +611,20 @@ static void cxt_fixup_hp_gate_mic_jack(struct hda_codec *codec, /* update LED status via GPIO */ static void cxt_update_gpio_led(struct hda_codec *codec, unsigned int mask, - bool enabled) + bool led_on) { struct conexant_spec *spec = codec->spec; unsigned int oldval = spec->gpio_led; if (spec->mute_led_polarity) - enabled = !enabled; + led_on = !led_on; - if (enabled) - spec->gpio_led &= ~mask; - else + if (led_on) spec->gpio_led |= mask; + else + spec->gpio_led &= ~mask; + codec_dbg(codec, "mask:%d enabled:%d gpio_led:%d\n", + mask, led_on, spec->gpio_led); if (spec->gpio_led != oldval) snd_hda_codec_write(codec, 0x01, 0, AC_VERB_SET_GPIO_DATA, spec->gpio_led); @@ -633,8 +635,8 @@ static void cxt_fixup_gpio_mute_hook(void *private_data, int enabled) { struct hda_codec *codec = private_data; struct conexant_spec *spec = codec->spec; - - cxt_update_gpio_led(codec, spec->gpio_mute_led_mask, enabled); + /* muted -> LED on */ + cxt_update_gpio_led(codec, spec->gpio_mute_led_mask, !enabled); } /* turn on/off mic-mute LED via GPIO per capture hook */ @@ -656,7 +658,6 @@ static void cxt_fixup_mute_led_gpio(struct hda_codec *codec, { 0x01, AC_VERB_SET_GPIO_DIRECTION, 0x03 }, {} }; - codec_info(codec, "action: %d gpio_led: %d\n", action, spec->gpio_led); if (action == HDA_FIXUP_ACT_PRE_PROBE) { spec->gen.vmaster_mute.hook = cxt_fixup_gpio_mute_hook; From c49a0a80137c7ca7d6ced4c812c9e07a949f6f24 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 19 Aug 2019 15:52:35 +0000 Subject: [PATCH 086/266] x86/CPU/AMD: Clear RDRAND CPUID bit on AMD family 15h/16h There have been reports of RDRAND issues after resuming from suspend on some AMD family 15h and family 16h systems. This issue stems from a BIOS not performing the proper steps during resume to ensure RDRAND continues to function properly. RDRAND support is indicated by CPUID Fn00000001_ECX[30]. This bit can be reset by clearing MSR C001_1004[62]. Any software that checks for RDRAND support using CPUID, including the kernel, will believe that RDRAND is not supported. Update the CPU initialization to clear the RDRAND CPUID bit for any family 15h and 16h processor that supports RDRAND. If it is known that the family 15h or family 16h system does not have an RDRAND resume issue or that the system will not be placed in suspend, the "rdrand=force" kernel parameter can be used to stop the clearing of the RDRAND CPUID bit. Additionally, update the suspend and resume path to save and restore the MSR C001_1004 value to ensure that the RDRAND CPUID setting remains in place after resuming from suspend. Note, that clearing the RDRAND CPUID bit does not prevent a processor that normally supports the RDRAND instruction from executing it. So any code that determined the support based on family and model won't #UD. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Cc: Andrew Cooper Cc: Andrew Morton Cc: Chen Yu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: "linux-doc@vger.kernel.org" Cc: "linux-pm@vger.kernel.org" Cc: Nathan Chancellor Cc: Paolo Bonzini Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Cc: Thomas Gleixner Cc: "x86@kernel.org" Link: https://lkml.kernel.org/r/7543af91666f491547bd86cebb1e17c66824ab9f.1566229943.git.thomas.lendacky@amd.com --- .../admin-guide/kernel-parameters.txt | 7 ++ arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/amd.c | 66 ++++++++++++++ arch/x86/power/cpu.c | 86 ++++++++++++++++--- 4 files changed, 147 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 47d981a86e2f..4c1971960afa 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4090,6 +4090,13 @@ Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. + rdrand= [X86] + force - Override the decision by the kernel to hide the + advertisement of RDRAND support (this affects + certain AMD processors because of buggy BIOS + support, specifically around the suspend/resume + path). + rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6b4fc2788078..271d837d69a8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -381,6 +381,7 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8d4e50428b68..68c363c341bf 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -804,6 +804,64 @@ static void init_amd_ln(struct cpuinfo_x86 *c) msr_set_bit(MSR_AMD64_DE_CFG, 31); } +static bool rdrand_force; + +static int __init rdrand_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "force")) + rdrand_force = true; + else + return -EINVAL; + + return 0; +} +early_param("rdrand", rdrand_cmdline); + +static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) +{ + /* + * Saving of the MSR used to hide the RDRAND support during + * suspend/resume is done by arch/x86/power/cpu.c, which is + * dependent on CONFIG_PM_SLEEP. + */ + if (!IS_ENABLED(CONFIG_PM_SLEEP)) + return; + + /* + * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * RDRAND support using the CPUID function directly. + */ + if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) + return; + + msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62); + + /* + * Verify that the CPUID change has occurred in case the kernel is + * running virtualized and the hypervisor doesn't support the MSR. + */ + if (cpuid_ecx(1) & BIT(30)) { + pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n"); + return; + } + + clear_cpu_cap(c, X86_FEATURE_RDRAND); + pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n"); +} + +static void init_amd_jg(struct cpuinfo_x86 *c) +{ + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); +} + static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; @@ -818,6 +876,13 @@ static void init_amd_bd(struct cpuinfo_x86 *c) wrmsrl_safe(MSR_F15H_IC_CFG, value); } } + + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); } static void init_amd_zn(struct cpuinfo_x86 *c) @@ -860,6 +925,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x16: init_amd_jg(c); break; case 0x17: init_amd_zn(c); break; } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 24b079e94bc2..c9ef6a7a4a1a 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -23,7 +24,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; @@ -397,15 +398,14 @@ static int __init bsp_pm_check_init(void) core_initcall(bsp_pm_check_init); -static int msr_init_context(const u32 *msr_id, const int total_num) +static int msr_build_context(const u32 *msr_id, const int num) { - int i = 0; + struct saved_msrs *saved_msrs = &saved_context.saved_msrs; struct saved_msr *msr_array; + int total_num; + int i, j; - if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) { - pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n"); - return -EINVAL; - } + total_num = saved_msrs->num + num; msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL); if (!msr_array) { @@ -413,19 +413,30 @@ static int msr_init_context(const u32 *msr_id, const int total_num) return -ENOMEM; } - for (i = 0; i < total_num; i++) { - msr_array[i].info.msr_no = msr_id[i]; + if (saved_msrs->array) { + /* + * Multiple callbacks can invoke this function, so copy any + * MSR save requests from previous invocations. + */ + memcpy(msr_array, saved_msrs->array, + sizeof(struct saved_msr) * saved_msrs->num); + + kfree(saved_msrs->array); + } + + for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) { + msr_array[i].info.msr_no = msr_id[j]; msr_array[i].valid = false; msr_array[i].info.reg.q = 0; } - saved_context.saved_msrs.num = total_num; - saved_context.saved_msrs.array = msr_array; + saved_msrs->num = total_num; + saved_msrs->array = msr_array; return 0; } /* - * The following section is a quirk framework for problematic BIOSen: + * The following sections are a quirk framework for problematic BIOSen: * Sometimes MSRs are modified by the BIOSen after suspended to * RAM, this might cause unexpected behavior after wakeup. * Thus we save/restore these specified MSRs across suspend/resume @@ -440,7 +451,7 @@ static int msr_initialize_bdw(const struct dmi_system_id *d) u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL }; pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident); - return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); + return msr_build_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); } static const struct dmi_system_id msr_save_dmi_table[] = { @@ -455,9 +466,58 @@ static const struct dmi_system_id msr_save_dmi_table[] = { {} }; +static int msr_save_cpuid_features(const struct x86_cpu_id *c) +{ + u32 cpuid_msr_id[] = { + MSR_AMD64_CPUID_FN_1, + }; + + pr_info("x86/pm: family %#hx cpu detected, MSR saving is needed during suspending.\n", + c->family); + + return msr_build_context(cpuid_msr_id, ARRAY_SIZE(cpuid_msr_id)); +} + +static const struct x86_cpu_id msr_save_cpu_table[] = { + { + .vendor = X86_VENDOR_AMD, + .family = 0x15, + .model = X86_MODEL_ANY, + .feature = X86_FEATURE_ANY, + .driver_data = (kernel_ulong_t)msr_save_cpuid_features, + }, + { + .vendor = X86_VENDOR_AMD, + .family = 0x16, + .model = X86_MODEL_ANY, + .feature = X86_FEATURE_ANY, + .driver_data = (kernel_ulong_t)msr_save_cpuid_features, + }, + {} +}; + +typedef int (*pm_cpu_match_t)(const struct x86_cpu_id *); +static int pm_cpu_check(const struct x86_cpu_id *c) +{ + const struct x86_cpu_id *m; + int ret = 0; + + m = x86_match_cpu(msr_save_cpu_table); + if (m) { + pm_cpu_match_t fn; + + fn = (pm_cpu_match_t)m->driver_data; + ret = fn(m); + } + + return ret; +} + static int pm_check_save_msr(void) { dmi_check_system(msr_save_dmi_table); + pm_cpu_check(msr_save_cpu_table); + return 0; } From d0ff14fdc987303aeeb7de6f1bd72c3749ae2a9b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Thu, 1 Aug 2019 23:53:53 +0000 Subject: [PATCH 087/266] genirq: Properly pair kobject_del() with kobject_add() If alloc_descs() fails before irq_sysfs_init() has run, free_desc() in the cleanup path will call kobject_del() even though the kobject has not been added with kobject_add(). Fix this by making the call to kobject_del() conditional on whether irq_sysfs_init() has run. This problem surfaced because commit aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") makes kobject_del() stricter about pairing with kobject_add(). If the pairing is incorrrect, a WARNING and backtrace occur in sysfs_remove_group() because there is no parent. [ tglx: Add a comment to the code and make it work with CONFIG_SYSFS=n ] Fixes: ecb3f394c5db ("genirq: Expose interrupt information through sysfs") Signed-off-by: Michael Kelley Signed-off-by: Thomas Gleixner Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1564703564-4116-1-git-send-email-mikelley@microsoft.com --- kernel/irq/irqdesc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9484e88dabc2..9be995fc3c5a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -295,6 +295,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc) } } +static void irq_sysfs_del(struct irq_desc *desc) +{ + /* + * If irq_sysfs_init() has not yet been invoked (early boot), then + * irq_kobj_base is NULL and the descriptor was never added. + * kobject_del() complains about a object with no parent, so make + * it conditional. + */ + if (irq_kobj_base) + kobject_del(&desc->kobj); +} + static int __init irq_sysfs_init(void) { struct irq_desc *desc; @@ -325,6 +337,7 @@ static struct kobj_type irq_kobj_type = { }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} +static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ @@ -438,7 +451,7 @@ static void free_desc(unsigned int irq) * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ - kobject_del(&desc->kobj); + irq_sysfs_del(desc); delete_irq_desc(irq); /* From ef8d8ccdc216f797e66cb4a1372f5c4c285ce1e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Aug 2019 21:26:22 -0700 Subject: [PATCH 088/266] tcp: make sure EPOLLOUT wont be missed As Jason Baron explained in commit 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure"), it is crucial we properly set SOCK_NOSPACE when needed. However, Jason patch had a bug, because the 'nonblocking' status as far as sk_stream_wait_memory() is concerned is governed by MSG_DONTWAIT flag passed at sendmsg() time : long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); So it is very possible that tcp sendmsg() calls sk_stream_wait_memory(), and that sk_stream_wait_memory() returns -EAGAIN with SOCK_NOSPACE cleared, if sk->sk_sndtimeo has been set to a small (but not zero) value. This patch removes the 'noblock' variable since we must always set SOCK_NOSPACE if -EAGAIN is returned. It also renames the do_nonblock label since we might reach this code path even if we were in blocking mode. Fixes: 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure") Signed-off-by: Eric Dumazet Cc: Jason Baron Reported-by: Vladimir Rutsky Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Acked-by: Jason Baron Signed-off-by: David S. Miller --- net/core/stream.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/core/stream.c b/net/core/stream.c index e94bb02a5629..4f1d4aa5fb38 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -120,7 +120,6 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) int err = 0; long vm_wait = 0; long current_timeo = *timeo_p; - bool noblock = (*timeo_p ? false : true); DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) @@ -133,11 +132,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - if (!*timeo_p) { - if (noblock) - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - goto do_nonblock; - } + if (!*timeo_p) + goto do_eagain; if (signal_pending(current)) goto do_interrupted; sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -169,7 +165,13 @@ out: do_error: err = -EPIPE; goto out; -do_nonblock: +do_eagain: + /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can + * be generated later. + * When TCP receives ACK packets that make room, tcp_check_space() + * only calls tcp_new_space() if SOCK_NOSPACE is set. + */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; goto out; do_interrupted: From 3a7ef457e85173a5b9ec7a03016db5a57b717b33 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Tue, 13 Aug 2019 00:46:01 +0200 Subject: [PATCH 089/266] ipv6: Fix return value of ipv6_mc_may_pull() for malformed packets Commit ba5ea614622d ("bridge: simplify ip_mc_check_igmp() and ipv6_mc_check_mld() calls") replaces direct calls to pskb_may_pull() in br_ipv6_multicast_mld2_report() with calls to ipv6_mc_may_pull(), that returns -EINVAL on buffers too short to be valid IPv6 packets, while maintaining the previous handling of the return code. This leads to the direct opposite of the intended effect: if the packet is malformed, -EINVAL evaluates as true, and we'll happily proceed with the processing. Return 0 if the packet is too short, in the same way as this was fixed for IPv4 by commit 083b78a9ed64 ("ip: fix ip_mc_may_pull() return value"). I don't have a reproducer for this, unlike the one referred to by the IPv4 commit, but this is clearly broken. Fixes: ba5ea614622d ("bridge: simplify ip_mc_check_igmp() and ipv6_mc_check_mld() calls") Signed-off-by: Stefano Brivio Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index becdad576859..3f62b347b04a 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -206,7 +206,7 @@ static inline int ipv6_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) - return -EINVAL; + return 0; return pskb_may_pull(skb, len); } From 2f102274e8129c9c0bb3a2bde0f641531aefea8b Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 13 Aug 2019 09:05:30 +0300 Subject: [PATCH 090/266] MAINTAINERS: net_failover: Fix typo in a filepath Replace "driver" with "drivers" in the filepath to net_failover.c Cc: Sridhar Samudrala Cc: David S. Miller Cc: netdev@vger.kernel.org Fixes: cfc80d9a1163 ("net: Introduce net_failover driver") Signed-off-by: Denis Efremov Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 22b8273069af..a744851db1df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11080,7 +11080,7 @@ NET_FAILOVER MODULE M: Sridhar Samudrala L: netdev@vger.kernel.org S: Supported -F: driver/net/net_failover.c +F: drivers/net/net_failover.c F: include/net/net_failover.h F: Documentation/networking/net_failover.rst From cd9d4ff9b78fcd0fc4708900ba3e52e71e1a7690 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 19 Aug 2019 07:04:25 +0200 Subject: [PATCH 091/266] Kconfig: Fix the reference to the IDT77105 Phy driver in the description of ATM_NICSTAR_USE_IDT77105 This should be IDT77105, not IDT77015. Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/atm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/atm/Kconfig b/drivers/atm/Kconfig index 2e2efa577437..8c37294f1d1e 100644 --- a/drivers/atm/Kconfig +++ b/drivers/atm/Kconfig @@ -200,7 +200,7 @@ config ATM_NICSTAR_USE_SUNI make the card work). config ATM_NICSTAR_USE_IDT77105 - bool "Use IDT77015 PHY driver (25Mbps)" + bool "Use IDT77105 PHY driver (25Mbps)" depends on ATM_NICSTAR help Support for the PHYsical layer chip in ForeRunner LE25 cards. In From b68271609c4f16a79eae8069933f64345afcf888 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 19 Aug 2019 18:15:28 -0700 Subject: [PATCH 092/266] fs/xfs: Fix return code of xfs_break_leased_layouts() The parens used in the while loop would result in error being assigned the value 1 rather than the intended errno value. This is required to return -ETXTBSY from follow on break_layout() changes. Signed-off-by: Ira Weiny Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 0c954cad7449..a339bd5fa260 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -32,7 +32,7 @@ xfs_break_leased_layouts( struct xfs_inode *ip = XFS_I(inode); int error; - while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { + while ((error = break_layout(inode, false)) == -EWOULDBLOCK) { xfs_iunlock(ip, *iolock); *did_unlock = true; error = break_layout(inode, true); From 1edfb8ed6cc12107c2ec61b5be7bc881cfc4460e Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 19 Aug 2019 10:33:04 +0300 Subject: [PATCH 093/266] nfp: flower: verify that block cb is not busy before binding When processing FLOW_BLOCK_BIND command on indirect block, check that flow block cb is not busy. Fixes: 0d4fd02e7199 ("net: flow_offload: add flow_block_cb_is_busy() and use it") Reported-by: Jakub Kicinski Signed-off-by: Vlad Buslov Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/offload.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index e209f150c5f2..9917d64694c6 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -1416,6 +1416,13 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, switch (f->command) { case FLOW_BLOCK_BIND: + cb_priv = nfp_flower_indr_block_cb_priv_lookup(app, netdev); + if (cb_priv && + flow_block_cb_is_busy(nfp_flower_setup_indr_block_cb, + cb_priv, + &nfp_block_cb_list)) + return -EBUSY; + cb_priv = kmalloc(sizeof(*cb_priv), GFP_KERNEL); if (!cb_priv) return -ENOMEM; From 77ffd3465ba837e9dc714e17b014e77b2eae765a Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 15 Aug 2019 19:36:49 -0700 Subject: [PATCH 094/266] scsi: lpfc: Mitigate high memory pre-allocation by SCSI-MQ When SCSI-MQ is enabled, the SCSI-MQ layers will do pre-allocation of MQ resources based on shost values set by the driver. In newer cases of the driver, which attempts to set nr_hw_queues to the cpu count, the multipliers become excessive, with a single shost having SCSI-MQ pre-allocation reaching into the multiple GBytes range. NPIV, which creates additional shosts, only multiply this overhead. On lower-memory systems, this can exhaust system memory very quickly, resulting in a system crash or failures in the driver or elsewhere due to low memory conditions. After testing several scenarios, the situation can be mitigated by limiting the value set in shost->nr_hw_queues to 4. Although the shost values were changed, the driver still had per-cpu hardware queues of its own that allowed parallelization per-cpu. Testing revealed that even with the smallish number for nr_hw_queues for SCSI-MQ, performance levels remained near maximum with the within-driver affiinitization. A module parameter was created to allow the value set for the nr_hw_queues to be tunable. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Ming Lei Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 1 + drivers/scsi/lpfc/lpfc_attr.c | 15 +++++++++++++++ drivers/scsi/lpfc/lpfc_init.c | 10 ++++++---- drivers/scsi/lpfc/lpfc_sli4.h | 5 +++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 2c3bb8a966e5..bade2e025ecf 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -824,6 +824,7 @@ struct lpfc_hba { uint32_t cfg_cq_poll_threshold; uint32_t cfg_cq_max_proc_limit; uint32_t cfg_fcp_cpu_map; + uint32_t cfg_fcp_mq_threshold; uint32_t cfg_hdw_queue; uint32_t cfg_irq_chann; uint32_t cfg_suppress_rsp; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index ea62322ffe2b..8d8c495b5b60 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -5708,6 +5708,19 @@ LPFC_ATTR_RW(nvme_oas, 0, 0, 1, LPFC_ATTR_RW(nvme_embed_cmd, 1, 0, 2, "Embed NVME Command in WQE"); +/* + * lpfc_fcp_mq_threshold: Set the maximum number of Hardware Queues + * the driver will advertise it supports to the SCSI layer. + * + * 0 = Set nr_hw_queues by the number of CPUs or HW queues. + * 1,128 = Manually specify the maximum nr_hw_queue value to be set, + * + * Value range is [0,128]. Default value is 8. + */ +LPFC_ATTR_R(fcp_mq_threshold, LPFC_FCP_MQ_THRESHOLD_DEF, + LPFC_FCP_MQ_THRESHOLD_MIN, LPFC_FCP_MQ_THRESHOLD_MAX, + "Set the number of SCSI Queues advertised"); + /* * lpfc_hdw_queue: Set the number of Hardware Queues the driver * will advertise it supports to the NVME and SCSI layers. This also @@ -6030,6 +6043,7 @@ struct device_attribute *lpfc_hba_attrs[] = { &dev_attr_lpfc_cq_poll_threshold, &dev_attr_lpfc_cq_max_proc_limit, &dev_attr_lpfc_fcp_cpu_map, + &dev_attr_lpfc_fcp_mq_threshold, &dev_attr_lpfc_hdw_queue, &dev_attr_lpfc_irq_chann, &dev_attr_lpfc_suppress_rsp, @@ -7112,6 +7126,7 @@ lpfc_get_cfgparam(struct lpfc_hba *phba) /* Initialize first burst. Target vs Initiator are different. */ lpfc_nvme_enable_fb_init(phba, lpfc_nvme_enable_fb); lpfc_nvmet_fb_size_init(phba, lpfc_nvmet_fb_size); + lpfc_fcp_mq_threshold_init(phba, lpfc_fcp_mq_threshold); lpfc_hdw_queue_init(phba, lpfc_hdw_queue); lpfc_irq_chann_init(phba, lpfc_irq_chann); lpfc_enable_bbcr_init(phba, lpfc_enable_bbcr); diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index a7549ae32542..1ac98becb5ba 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -4309,10 +4309,12 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) shost->max_cmd_len = 16; if (phba->sli_rev == LPFC_SLI_REV4) { - if (phba->cfg_fcp_io_sched == LPFC_FCP_SCHED_BY_HDWQ) - shost->nr_hw_queues = phba->cfg_hdw_queue; - else - shost->nr_hw_queues = phba->sli4_hba.num_present_cpu; + if (!phba->cfg_fcp_mq_threshold || + phba->cfg_fcp_mq_threshold > phba->cfg_hdw_queue) + phba->cfg_fcp_mq_threshold = phba->cfg_hdw_queue; + + shost->nr_hw_queues = min_t(int, 2 * num_possible_nodes(), + phba->cfg_fcp_mq_threshold); shost->dma_boundary = phba->sli4_hba.pc_sli4_params.sge_supp_len-1; diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 3aeca387b22a..329f7aa7e169 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -44,6 +44,11 @@ #define LPFC_HBA_HDWQ_MAX 128 #define LPFC_HBA_HDWQ_DEF 0 +/* FCP MQ queue count limiting */ +#define LPFC_FCP_MQ_THRESHOLD_MIN 0 +#define LPFC_FCP_MQ_THRESHOLD_MAX 128 +#define LPFC_FCP_MQ_THRESHOLD_DEF 8 + /* Common buffer size to accomidate SCSI and NVME IO buffers */ #define LPFC_COMMON_IO_BUF_SZ 768 From 936376f88ff1845b384b3a82b9cd167e53039229 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Aug 2019 10:08:38 +0900 Subject: [PATCH 095/266] arm: select the dma-noncoherent symbols for all swiotlb builds We need to provide the arch hooks for non-coherent dma-direct and swiotlb for all swiotlb builds, not just when LPAS is enabled. Without that the Xen build that selects SWIOTLB indirectly through SWIOTLB_XEN fails to build. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Reported-by: Stefan Wahren Signed-off-by: Christoph Hellwig Tested-by: Stefan Wahren --- arch/arm/Kconfig | 4 ++++ arch/arm/mm/Kconfig | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 33b00579beff..24360211534a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -7,6 +7,8 @@ config ARM select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB + select ARCH_HAS_DMA_MMAP_PGPROT if SWIOTLB select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_KEEPINITRD @@ -18,6 +20,8 @@ config ARM select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL select ARCH_HAS_STRICT_MODULE_RWX if MMU + select ARCH_HAS_SYNC_DMA_FOR_DEVICE if SWIOTLB + select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index c54cd7ed90ba..c1222c0e9fd3 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -664,10 +664,6 @@ config ARM_LPAE !CPU_32v4 && !CPU_32v3 select PHYS_ADDR_T_64BIT select SWIOTLB - select ARCH_HAS_DMA_COHERENT_TO_PFN - select ARCH_HAS_DMA_MMAP_PGPROT - select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_SYNC_DMA_FOR_CPU help Say Y if you have an ARMv7 processor supporting the LPAE page table format and you would like to access memory beyond the From 1a15718b41df026cffd0e42cfdc38a1384ce19f9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 20 Aug 2019 08:58:12 +0200 Subject: [PATCH 096/266] ALSA: usb-audio: Add implicit fb quirk for Behringer UFX1604 Behringer UFX1604 requires the similar quirk to apply implicit fb like another Behringer model UFX1204 in order to fix the noisy playback. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=204631 Cc: Signed-off-by: Takashi Iwai --- sound/usb/pcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 75b96929f76c..e4bbf79de956 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -339,6 +339,7 @@ static int set_sync_ep_implicit_fb_quirk(struct snd_usb_substream *subs, ep = 0x81; ifnum = 2; goto add_sync_ep_from_ifnum; + case USB_ID(0x1397, 0x0001): /* Behringer UFX1604 */ case USB_ID(0x1397, 0x0002): /* Behringer UFX1204 */ ep = 0x81; ifnum = 1; From 2ca371d847511f97ef991ef612a2ce805489840e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rekowski?= Date: Mon, 19 Aug 2019 22:40:07 +0200 Subject: [PATCH 097/266] ALSA: hda/ca0132 - Add new SBZ quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a new PCI subsys ID for the SBZ, as found and tested by me and some reddit users. Link: https://lore.kernel.org/lkml/20190819204008.14426-1-p.rekowski@gmail.com Signed-off-by: Paweł Rekowski Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_ca0132.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c index 0d51823d7270..6d1fb7c11f17 100644 --- a/sound/pci/hda/patch_ca0132.c +++ b/sound/pci/hda/patch_ca0132.c @@ -1175,6 +1175,7 @@ static const struct snd_pci_quirk ca0132_quirks[] = { SND_PCI_QUIRK(0x1028, 0x0708, "Alienware 15 R2 2016", QUIRK_ALIENWARE), SND_PCI_QUIRK(0x1102, 0x0010, "Sound Blaster Z", QUIRK_SBZ), SND_PCI_QUIRK(0x1102, 0x0023, "Sound Blaster Z", QUIRK_SBZ), + SND_PCI_QUIRK(0x1102, 0x0027, "Sound Blaster Z", QUIRK_SBZ), SND_PCI_QUIRK(0x1102, 0x0033, "Sound Blaster ZxR", QUIRK_SBZ), SND_PCI_QUIRK(0x1458, 0xA016, "Recon3Di", QUIRK_R3DI), SND_PCI_QUIRK(0x1458, 0xA026, "Gigabyte G1.Sniper Z97", QUIRK_R3DI), From fcf887e7caaa813eea821d11bf2b7619a37df37a Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Fri, 16 Aug 2019 12:00:54 -0700 Subject: [PATCH 098/266] HID: wacom: correct misreported EKR ring values The EKR ring claims a range of 0 to 71 but actually reports values 1 to 72. The ring is used in relative mode so this change should not affect users. Signed-off-by: Aaron Armstrong Skomra Fixes: 72b236d60218f ("HID: wacom: Add support for Express Key Remote.") Cc: # v4.3+ Reviewed-by: Ping Cheng Reviewed-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 7a9e229e6253..1713235d28cb 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1061,7 +1061,7 @@ static int wacom_remote_irq(struct wacom_wac *wacom_wac, size_t len) input_report_key(input, BTN_BASE2, (data[11] & 0x02)); if (data[12] & 0x80) - input_report_abs(input, ABS_WHEEL, (data[12] & 0x7f)); + input_report_abs(input, ABS_WHEEL, (data[12] & 0x7f) - 1); else input_report_abs(input, ABS_WHEEL, 0); From a180d023ec7ba0e43b2385876950d9ce7ab618f1 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Mon, 19 Aug 2019 13:21:26 +0530 Subject: [PATCH 099/266] auxdisplay: ht16k33: Make ht16k33_fb_fix and ht16k33_fb_var constant The static structures ht16k33_fb_fix and ht16k33_fb_var, of types fb_fix_screeninfo and fb_var_screeninfo respectively, are not used except to be copied into other variables. Hence make both of them constant to prevent unintended modification. Issue found with Coccinelle. Acked-by: Robin van der Gracht Signed-off-by: Nishka Dasgupta Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/ht16k33.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/auxdisplay/ht16k33.c b/drivers/auxdisplay/ht16k33.c index 9c0bb771751d..a2fcde582e2a 100644 --- a/drivers/auxdisplay/ht16k33.c +++ b/drivers/auxdisplay/ht16k33.c @@ -74,7 +74,7 @@ struct ht16k33_priv { struct ht16k33_fbdev fbdev; }; -static struct fb_fix_screeninfo ht16k33_fb_fix = { +static const struct fb_fix_screeninfo ht16k33_fb_fix = { .id = DRIVER_NAME, .type = FB_TYPE_PACKED_PIXELS, .visual = FB_VISUAL_MONO10, @@ -85,7 +85,7 @@ static struct fb_fix_screeninfo ht16k33_fb_fix = { .accel = FB_ACCEL_NONE, }; -static struct fb_var_screeninfo ht16k33_fb_var = { +static const struct fb_var_screeninfo ht16k33_fb_var = { .xres = HT16K33_MATRIX_LED_MAX_ROWS, .yres = HT16K33_MATRIX_LED_MAX_COLS, .xres_virtual = HT16K33_MATRIX_LED_MAX_ROWS, From 8f2d163cb26da87e7d8e1677368b8ba1ba4d30b3 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 18 Jul 2019 12:38:10 +0200 Subject: [PATCH 100/266] mt76: mt76x0u: do not reset radio on resume On some machines mt76x0u firmware can hung during resume, what result on messages like below: [ 475.480062] mt76x0 1-8:1.0: Error: MCU response pre-completed! [ 475.990066] mt76x0 1-8:1.0: Error: send MCU cmd failed:-110 [ 475.990075] mt76x0 1-8:1.0: Error: MCU response pre-completed! [ 476.500003] mt76x0 1-8:1.0: Error: send MCU cmd failed:-110 [ 476.500012] mt76x0 1-8:1.0: Error: MCU response pre-completed! [ 477.010046] mt76x0 1-8:1.0: Error: send MCU cmd failed:-110 [ 477.010055] mt76x0 1-8:1.0: Error: MCU response pre-completed! [ 477.529997] mt76x0 1-8:1.0: Error: send MCU cmd failed:-110 [ 477.530006] mt76x0 1-8:1.0: Error: MCU response pre-completed! [ 477.824907] mt76x0 1-8:1.0: Error: send MCU cmd failed:-71 [ 477.824916] mt76x0 1-8:1.0: Error: MCU response pre-completed! [ 477.825029] usb 1-8: USB disconnect, device number 6 and possible whole system freeze. This can be avoided, if we do not perform mt76x0_chip_onoff() reset. Cc: stable@vger.kernel.org Fixes: 134b2d0d1fcf ("mt76x0: init files") Signed-off-by: Stanislaw Gruszka Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt76x0/usb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c index 627ed1fc7b15..645f4d15fb61 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c @@ -136,11 +136,11 @@ static const struct ieee80211_ops mt76x0u_ops = { .release_buffered_frames = mt76_release_buffered_frames, }; -static int mt76x0u_init_hardware(struct mt76x02_dev *dev) +static int mt76x0u_init_hardware(struct mt76x02_dev *dev, bool reset) { int err; - mt76x0_chip_onoff(dev, true, true); + mt76x0_chip_onoff(dev, true, reset); if (!mt76x02_wait_for_mac(&dev->mt76)) return -ETIMEDOUT; @@ -173,7 +173,7 @@ static int mt76x0u_register_device(struct mt76x02_dev *dev) if (err < 0) goto out_err; - err = mt76x0u_init_hardware(dev); + err = mt76x0u_init_hardware(dev, true); if (err < 0) goto out_err; @@ -309,7 +309,7 @@ static int __maybe_unused mt76x0_resume(struct usb_interface *usb_intf) if (ret < 0) goto err; - ret = mt76x0u_init_hardware(dev); + ret = mt76x0u_init_hardware(dev, false); if (ret) goto err; From 95844124385eae4bd9ca5f9514a0fc33d561ac3c Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 19 Aug 2019 13:20:07 +0200 Subject: [PATCH 101/266] rt2x00: clear IV's on start to fix AP mode regression To do not brake HW restart we should keep initialization vectors data. I assumed that on start the data is already initialized to zeros, but that not true on some scenarios and we should clear it. So add additional flag to check if we are under HW restart and clear IV's data if we are not. Patch fixes AP mode regression. Reported-and-tested-by: Emil Karlson Fixes: 710e6cc1595e ("rt2800: do not nullify initialization vector data") Signed-off-by: Stanislaw Gruszka Signed-off-by: Kalle Valo --- drivers/net/wireless/ralink/rt2x00/rt2800lib.c | 9 +++++++++ drivers/net/wireless/ralink/rt2x00/rt2x00.h | 1 + drivers/net/wireless/ralink/rt2x00/rt2x00dev.c | 13 ++++++++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c index c9b957ac5733..ecbe78b8027b 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c @@ -6094,6 +6094,15 @@ static int rt2800_init_registers(struct rt2x00_dev *rt2x00dev) rt2800_delete_wcid_attr(rt2x00dev, i); } + /* + * Clear encryption initialization vectors on start, but keep them + * for watchdog reset. Otherwise we will have wrong IVs and not be + * able to keep connections after reset. + */ + if (!test_bit(DEVICE_STATE_RESET, &rt2x00dev->flags)) + for (i = 0; i < 256; i++) + rt2800_register_write(rt2x00dev, MAC_IVEIV_ENTRY(i), 0); + /* * Clear all beacons */ diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h index 7e43690a861c..2b216edd0c7d 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h @@ -658,6 +658,7 @@ enum rt2x00_state_flags { DEVICE_STATE_ENABLED_RADIO, DEVICE_STATE_SCANNING, DEVICE_STATE_FLUSHING, + DEVICE_STATE_RESET, /* * Driver configuration diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c index 35414f97a978..9d158237ac67 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c @@ -1256,13 +1256,14 @@ static int rt2x00lib_initialize(struct rt2x00_dev *rt2x00dev) int rt2x00lib_start(struct rt2x00_dev *rt2x00dev) { - int retval; + int retval = 0; if (test_bit(DEVICE_STATE_STARTED, &rt2x00dev->flags)) { /* * This is special case for ieee80211_restart_hw(), otherwise * mac80211 never call start() two times in row without stop(); */ + set_bit(DEVICE_STATE_RESET, &rt2x00dev->flags); rt2x00dev->ops->lib->pre_reset_hw(rt2x00dev); rt2x00lib_stop(rt2x00dev); } @@ -1273,14 +1274,14 @@ int rt2x00lib_start(struct rt2x00_dev *rt2x00dev) */ retval = rt2x00lib_load_firmware(rt2x00dev); if (retval) - return retval; + goto out; /* * Initialize the device. */ retval = rt2x00lib_initialize(rt2x00dev); if (retval) - return retval; + goto out; rt2x00dev->intf_ap_count = 0; rt2x00dev->intf_sta_count = 0; @@ -1289,11 +1290,13 @@ int rt2x00lib_start(struct rt2x00_dev *rt2x00dev) /* Enable the radio */ retval = rt2x00lib_enable_radio(rt2x00dev); if (retval) - return retval; + goto out; set_bit(DEVICE_STATE_STARTED, &rt2x00dev->flags); - return 0; +out: + clear_bit(DEVICE_STATE_RESET, &rt2x00dev->flags); + return retval; } void rt2x00lib_stop(struct rt2x00_dev *rt2x00dev) From 50f5604476b2bd728910b2e1803a6eafd0eeaf3d Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Fri, 16 Aug 2019 15:55:51 +0300 Subject: [PATCH 102/266] iwlwifi: mvm: Allow multicast data frames only when associated The MAC context configuration always allowed multicast data frames to pass to the driver for all MAC context types, and in the case of station MAC context both when associated and when not associated. One of the outcomes of this configuration is having the FW forward encrypted multicast frames to the driver with Rx status indicating that the frame was not decrypted (as expected, since no keys were configured yet) which in turn results with unnecessary error messages. Change this behavior to allow multicast data frames only when they are actually expected, e.g., station MAC context is associated etc. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- .../net/wireless/intel/iwlwifi/mvm/mac-ctxt.c | 33 +++++++++++++++++-- .../net/wireless/intel/iwlwifi/mvm/mac80211.c | 10 ++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c index cb22d447fcb8..fe776e35b9d0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c @@ -554,7 +554,7 @@ static void iwl_mvm_mac_ctxt_cmd_common(struct iwl_mvm *mvm, cpu_to_le32(vif->bss_conf.use_short_slot ? MAC_FLG_SHORT_SLOT : 0); - cmd->filter_flags = cpu_to_le32(MAC_FILTER_ACCEPT_GRP); + cmd->filter_flags = 0; for (i = 0; i < IEEE80211_NUM_ACS; i++) { u8 txf = iwl_mvm_mac_ac_to_tx_fifo(mvm, i); @@ -623,6 +623,8 @@ static int iwl_mvm_mac_ctxt_cmd_sta(struct iwl_mvm *mvm, /* We need the dtim_period to set the MAC as associated */ if (vif->bss_conf.assoc && vif->bss_conf.dtim_period && !force_assoc_off) { + struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); + u8 ap_sta_id = mvmvif->ap_sta_id; u32 dtim_offs; /* @@ -658,6 +660,29 @@ static int iwl_mvm_mac_ctxt_cmd_sta(struct iwl_mvm *mvm, dtim_offs); ctxt_sta->is_assoc = cpu_to_le32(1); + + /* + * allow multicast data frames only as long as the station is + * authorized, i.e., GTK keys are already installed (if needed) + */ + if (ap_sta_id < IWL_MVM_STATION_COUNT) { + struct ieee80211_sta *sta; + + rcu_read_lock(); + + sta = rcu_dereference(mvm->fw_id_to_mac_id[ap_sta_id]); + if (!IS_ERR_OR_NULL(sta)) { + struct iwl_mvm_sta *mvmsta = + iwl_mvm_sta_from_mac80211(sta); + + if (mvmsta->sta_state == + IEEE80211_STA_AUTHORIZED) + cmd.filter_flags |= + cpu_to_le32(MAC_FILTER_ACCEPT_GRP); + } + + rcu_read_unlock(); + } } else { ctxt_sta->is_assoc = cpu_to_le32(0); @@ -703,7 +728,8 @@ static int iwl_mvm_mac_ctxt_cmd_listener(struct iwl_mvm *mvm, MAC_FILTER_IN_CONTROL_AND_MGMT | MAC_FILTER_IN_BEACON | MAC_FILTER_IN_PROBE_REQUEST | - MAC_FILTER_IN_CRC32); + MAC_FILTER_IN_CRC32 | + MAC_FILTER_ACCEPT_GRP); ieee80211_hw_set(mvm->hw, RX_INCLUDES_FCS); /* Allocate sniffer station */ @@ -727,7 +753,8 @@ static int iwl_mvm_mac_ctxt_cmd_ibss(struct iwl_mvm *mvm, iwl_mvm_mac_ctxt_cmd_common(mvm, vif, &cmd, NULL, action); cmd.filter_flags = cpu_to_le32(MAC_FILTER_IN_BEACON | - MAC_FILTER_IN_PROBE_REQUEST); + MAC_FILTER_IN_PROBE_REQUEST | + MAC_FILTER_ACCEPT_GRP); /* cmd.ibss.beacon_time/cmd.ibss.beacon_tsf are curently ignored */ cmd.ibss.bi = cpu_to_le32(vif->bss_conf.beacon_int); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 1c904b5226aa..a7bc00d1296f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -3327,10 +3327,20 @@ static int iwl_mvm_mac_sta_state(struct ieee80211_hw *hw, /* enable beacon filtering */ WARN_ON(iwl_mvm_enable_beacon_filter(mvm, vif, 0)); + /* + * Now that the station is authorized, i.e., keys were already + * installed, need to indicate to the FW that + * multicast data frames can be forwarded to the driver + */ + iwl_mvm_mac_ctxt_changed(mvm, vif, false, NULL); + iwl_mvm_rs_rate_init(mvm, sta, mvmvif->phy_ctxt->channel->band, true); } else if (old_state == IEEE80211_STA_AUTHORIZED && new_state == IEEE80211_STA_ASSOC) { + /* Multicast data frames are no longer allowed */ + iwl_mvm_mac_ctxt_changed(mvm, vif, false, NULL); + /* disable beacon filtering */ ret = iwl_mvm_disable_beacon_filter(mvm, vif, 0); WARN_ON(ret && From 884b75696873f5338c57a2613763ea8f37b4e26b Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 16 Aug 2019 15:55:52 +0300 Subject: [PATCH 103/266] iwlwifi: pcie: fix the byte count table format for 22560 devices Starting from 22560, the byte count is expected to be in bytes and we have now 14 bits. Ajust the code to this. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- .../net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 38d110338987..9ef6b8fe03c1 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -99,10 +99,7 @@ void iwl_pcie_gen2_update_byte_tbl(struct iwl_trans_pcie *trans_pcie, u16 len = byte_cnt; __le16 bc_ent; - if (trans_pcie->bc_table_dword) - len = DIV_ROUND_UP(len, 4); - - if (WARN_ON(len > 0xFFF || idx >= txq->n_window)) + if (WARN(idx >= txq->n_window, "%d >= %d\n", idx, txq->n_window)) return; filled_tfd_size = offsetof(struct iwl_tfh_tfd, tbs) + @@ -117,11 +114,20 @@ void iwl_pcie_gen2_update_byte_tbl(struct iwl_trans_pcie *trans_pcie, */ num_fetch_chunks = DIV_ROUND_UP(filled_tfd_size, 64) - 1; - bc_ent = cpu_to_le16(len | (num_fetch_chunks << 12)); - if (trans->cfg->device_family >= IWL_DEVICE_FAMILY_22560) + if (trans->cfg->device_family >= IWL_DEVICE_FAMILY_22560) { + /* Starting from 22560, the HW expects bytes */ + WARN_ON(trans_pcie->bc_table_dword); + WARN_ON(len > 0x3FFF); + bc_ent = cpu_to_le16(len | (num_fetch_chunks << 14)); scd_bc_tbl_gen3->tfd_offset[idx] = bc_ent; - else + } else { + /* Until 22560, the HW expects DW */ + WARN_ON(!trans_pcie->bc_table_dword); + len = DIV_ROUND_UP(len, 4); + WARN_ON(len > 0xFFF); + bc_ent = cpu_to_le16(len | (num_fetch_chunks << 12)); scd_bc_tbl->tfd_offset[idx] = bc_ent; + } } /* From 17e40e6979aaf60f356331bac129df20e1fd74a0 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 16 Aug 2019 15:55:53 +0300 Subject: [PATCH 104/266] iwlwifi: pcie: don't switch FW to qnj when ax201 is detected We have a too generic condition that switches from Qu configurations to QnJ configurations. We need to exclude some configurations so that they are not erroneously switched. Add the ax201 configuration to the list of exclusions. Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index f5df5b370d78..935e35dafce5 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -3603,6 +3603,7 @@ struct iwl_trans *iwl_trans_pcie_alloc(struct pci_dev *pdev, } else if (CSR_HW_RF_ID_TYPE_CHIP_ID(trans->hw_rf_id) == CSR_HW_RF_ID_TYPE_CHIP_ID(CSR_HW_RF_ID_TYPE_HR) && ((trans->cfg != &iwl_ax200_cfg_cc && + trans->cfg != &iwl_ax201_cfg_qu_hr && trans->cfg != &killer1650x_2ax_cfg && trans->cfg != &killer1650w_2ax_cfg && trans->cfg != &iwl_ax201_cfg_quz_hr) || From 5a8c31aa63578cb0ff390a57537f1cb4b312a1ed Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 16 Aug 2019 15:55:54 +0300 Subject: [PATCH 105/266] iwlwifi: pcie: fix recognition of QuZ devices If the HW revision of Qu devices we found is QuZ, then we need to switch the configuration accordingly in order to use the correct FW. Add a block of ifs in order do that. Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index de711c1160d3..7c5aaeaf7fe5 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1063,6 +1063,23 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) else if (iwl_trans->cfg == &iwl9560_2ac_160_cfg_qu_b0_jf_b0) iwl_trans->cfg = &iwl9560_2ac_160_cfg_qu_c0_jf_b0; } + + /* same thing for QuZ... */ + if (iwl_trans->hw_rev == CSR_HW_REV_TYPE_QUZ) { + if (cfg == &iwl_ax101_cfg_qu_hr) + cfg = &iwl_ax101_cfg_quz_hr; + else if (cfg == &iwl_ax201_cfg_qu_hr) + cfg = &iwl_ax201_cfg_quz_hr; + else if (cfg == &iwl9461_2ac_cfg_qu_b0_jf_b0) + cfg = &iwl9461_2ac_cfg_quz_a0_jf_b0_soc; + else if (cfg == &iwl9462_2ac_cfg_qu_b0_jf_b0) + cfg = &iwl9462_2ac_cfg_quz_a0_jf_b0_soc; + else if (cfg == &iwl9560_2ac_cfg_qu_b0_jf_b0) + cfg = &iwl9560_2ac_cfg_quz_a0_jf_b0_soc; + else if (cfg == &iwl9560_2ac_160_cfg_qu_b0_jf_b0) + cfg = &iwl9560_2ac_160_cfg_quz_a0_jf_b0_soc; + } + #endif pci_set_drvdata(pdev, iwl_trans); From fb89c39455e4b49881c5a42761bd71f03d3ef888 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Thu, 15 Aug 2019 23:56:35 +0300 Subject: [PATCH 106/266] xdp: unpin xdp umem pages in error path Fix mem leak caused by missed unpin routine for umem pages. Fixes: 8aef7340ae9695 ("xsk: introduce xdp_umem_page") Signed-off-by: Ivan Khoronzhuk Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 83de74ca729a..688aac7a6943 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -365,7 +365,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL); if (!umem->pages) { err = -ENOMEM; - goto out_account; + goto out_pin; } for (i = 0; i < umem->npgs; i++) @@ -373,6 +373,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return 0; +out_pin: + xdp_umem_unpin_pages(umem); out_account: xdp_umem_unaccount_pages(umem); return err; From 16c75963723dfd8d7ca719527052f16be7258a23 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 20 Aug 2019 03:06:40 +0000 Subject: [PATCH 107/266] Drivers: hv: vmbus: Remove the unused "tsc_page" from struct hv_context This field is no longer used after the commit 63ed4e0c67df ("Drivers: hv: vmbus: Consolidate all Hyper-V specific clocksource code") , because it's replaced by the global variable "struct ms_hyperv_tsc_page *tsc_pg;" (now, the variable is in drivers/clocksource/hyperv_timer.c). Fixes: 63ed4e0c67df ("Drivers: hv: vmbus: Consolidate all Hyper-V specific clocksource code") Signed-off-by: Dexuan Cui Signed-off-by: Sasha Levin --- drivers/hv/hyperv_vmbus.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 362e70e9d145..fb16a622e8ab 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -146,8 +146,6 @@ struct hv_context { */ u64 guestid; - void *tsc_page; - struct hv_per_cpu_context __percpu *cpu_context; /* From d09bc83640d524b8467a660db7b1d15e6562a1de Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 20 Aug 2019 03:01:23 +0000 Subject: [PATCH 108/266] Input: hyperv-keyboard: Use in-place iterator API in the channel callback Simplify the ring buffer handling with the in-place API. Also avoid the dynamic allocation and the memory leak in the channel callback function. Signed-off-by: Dexuan Cui Acked-by: Dmitry Torokhov Signed-off-by: Sasha Levin --- drivers/input/serio/hyperv-keyboard.c | 35 +++++---------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/drivers/input/serio/hyperv-keyboard.c b/drivers/input/serio/hyperv-keyboard.c index 88ae7c2ac3c8..e486a8a74c40 100644 --- a/drivers/input/serio/hyperv-keyboard.c +++ b/drivers/input/serio/hyperv-keyboard.c @@ -237,40 +237,17 @@ static void hv_kbd_handle_received_packet(struct hv_device *hv_dev, static void hv_kbd_on_channel_callback(void *context) { + struct vmpacket_descriptor *desc; struct hv_device *hv_dev = context; - void *buffer; - int bufferlen = 0x100; /* Start with sensible size */ u32 bytes_recvd; u64 req_id; - int error; - buffer = kmalloc(bufferlen, GFP_ATOMIC); - if (!buffer) - return; + foreach_vmbus_pkt(desc, hv_dev->channel) { + bytes_recvd = desc->len8 * 8; + req_id = desc->trans_id; - while (1) { - error = vmbus_recvpacket_raw(hv_dev->channel, buffer, bufferlen, - &bytes_recvd, &req_id); - switch (error) { - case 0: - if (bytes_recvd == 0) { - kfree(buffer); - return; - } - - hv_kbd_handle_received_packet(hv_dev, buffer, - bytes_recvd, req_id); - break; - - case -ENOBUFS: - kfree(buffer); - /* Handle large packet */ - bufferlen = bytes_recvd; - buffer = kmalloc(bytes_recvd, GFP_ATOMIC); - if (!buffer) - return; - break; - } + hv_kbd_handle_received_packet(hv_dev, desc, bytes_recvd, + req_id); } } From 89eb4d8d25722a0a0194cf7fa47ba602e32a6da7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 19 Aug 2019 16:44:09 +0200 Subject: [PATCH 109/266] Tools: hv: kvp: eliminate 'may be used uninitialized' warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building hv_kvp_daemon GCC-8.3 complains: hv_kvp_daemon.c: In function ‘kvp_get_ip_info.constprop’: hv_kvp_daemon.c:812:30: warning: ‘ip_buffer’ may be used uninitialized in this function [-Wmaybe-uninitialized] struct hv_kvp_ipaddr_value *ip_buffer; this seems to be a false positive: we only use ip_buffer when op == KVP_OP_GET_IP_INFO and it is only unset when op == KVP_OP_ENUMERATE. Silence the warning by initializing ip_buffer to NULL. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Sasha Levin --- tools/hv/hv_kvp_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index f5597503c771..e9ef4ca6a655 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -809,7 +809,7 @@ kvp_get_ip_info(int family, char *if_name, int op, int sn_offset = 0; int error = 0; char *buffer; - struct hv_kvp_ipaddr_value *ip_buffer; + struct hv_kvp_ipaddr_value *ip_buffer = NULL; char cidr_mask[5]; /* /xyz */ int weight; int i; From a9fc4340aee041dd186d1fb8f1b5d1e9caf28212 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 7 May 2019 07:46:55 +0000 Subject: [PATCH 110/266] Drivers: hv: vmbus: Fix virt_to_hvpfn() for X86_PAE In the case of X86_PAE, unsigned long is u32, but the physical address type should be u64. Due to the bug here, the netvsc driver can not load successfully, and sometimes the VM can panic due to memory corruption (the hypervisor writes data to the wrong location). Fixes: 6ba34171bcbd ("Drivers: hv: vmbus: Remove use of slow_virt_to_phys()") Cc: stable@vger.kernel.org Cc: Michael Kelley Reported-and-tested-by: Juliana Rodrigueiro Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/channel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 5f9505a087f6..23f358cb7f49 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -26,7 +26,7 @@ static unsigned long virt_to_hvpfn(void *addr) { - unsigned long paddr; + phys_addr_t paddr; if (is_vmalloc_addr(addr)) paddr = page_to_phys(vmalloc_to_page(addr)) + From 500f9fbadef86466a435726192f4ca4df7d94236 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Aug 2019 12:15:59 -0600 Subject: [PATCH 111/266] io_uring: fix potential hang with polled IO If a request issue ends up being punted to async context to avoid blocking, we can get into a situation where the original application enters the poll loop for that very request before it has been issued. This should not be an issue, except that the polling will hold the io_uring uring_ctx mutex for the duration of the poll. When the async worker has actually issued the request, it needs to acquire this mutex to add the request to the poll issued list. Since the application polling is already holding this mutex, the workqueue sleeps on the mutex forever, and the application thus never gets a chance to poll for the very request it was interested in. Fix this by ensuring that the polling drops the uring_ctx occasionally if it's not making any progress. Reported-by: Jeffrey M. Birnbaum Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 24bbe3cb7ad4..36f04d0b197b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -805,11 +805,34 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, long min) { - int ret = 0; + int iters, ret = 0; + /* + * We disallow the app entering submit/complete with polling, but we + * still need to lock the ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + + iters = 0; do { int tmin = 0; + /* + * If a submit got punted to a workqueue, we can have the + * application entering polling for a command before it gets + * issued. That app will hold the uring_lock for the duration + * of the poll right here, so we need to take a breather every + * now and then to ensure that the issue has a chance to add + * the poll to the issued list. Otherwise we can spin here + * forever, while the workqueue is stuck trying to acquire the + * very same mutex. + */ + if (!(++iters & 7)) { + mutex_unlock(&ctx->uring_lock); + mutex_lock(&ctx->uring_lock); + } + if (*nr_events < min) tmin = min - *nr_events; @@ -819,6 +842,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, ret = 0; } while (min && !*nr_events && !need_resched()); + mutex_unlock(&ctx->uring_lock); return ret; } @@ -2280,15 +2304,7 @@ static int io_sq_thread(void *data) unsigned nr_events = 0; if (ctx->flags & IORING_SETUP_IOPOLL) { - /* - * We disallow the app entering submit/complete - * with polling, but we still need to lock the - * ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); io_iopoll_check(ctx, &nr_events, 0); - mutex_unlock(&ctx->uring_lock); } else { /* * Normal IO, just pretend everything completed. @@ -3190,9 +3206,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, min_complete = min(min_complete, ctx->cq_entries); if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); ret = io_iopoll_check(ctx, &nr_events, min_complete); - mutex_unlock(&ctx->uring_lock); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); } From 504db087aaccdb32af61539916409f7dca31ceb5 Mon Sep 17 00:00:00 2001 From: Anton Eidelman Date: Mon, 12 Aug 2019 23:00:36 +0300 Subject: [PATCH 112/266] nvme-multipath: fix possible I/O hang when paths are updated nvme_state_set_live() making a path available triggers requeue_work in order to resubmit requests that ended up on requeue_list when no paths were available. This requeue_work may race with concurrent nvme_ns_head_make_request() that do not observe the live path yet. Such concurrent requests may by made by either: - New IO submission. - Requeue_work triggered by nvme_failover_req() or another ana_work. A race may cause requeue_work capture the state of requeue_list before more requests get onto the list. These requests will stay on the list forever unless requeue_work is triggered again. In order to prevent such race, nvme_state_set_live() should synchronize_srcu(&head->srcu) before triggering the requeue_work and prevent nvme_ns_head_make_request referencing an old snapshot of the path list. Reviewed-by: Christoph Hellwig Signed-off-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 888d4543894e..af831d3d15d0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -428,6 +428,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) srcu_read_unlock(&head->srcu, srcu_idx); } + synchronize_srcu(&ns->head->srcu); kblockd_schedule_work(&ns->head->requeue_work); } From a89fcca8185633993018dc081d6b021d005e6d0b Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 14 Aug 2019 11:26:10 -0300 Subject: [PATCH 113/266] nvme: Fix cntlid validation when not using NVMEoF Commit 1b1031ca63b2 ("nvme: validate cntlid during controller initialisation") introduced a validation for controllers with duplicate cntlid that runs on nvme_init_subsystem(). The problem is that the validation relies on ctrl->cntlid, and this value is assigned (from id_ctrl value) after the call for nvme_init_subsystem() in nvme_init_identify() for non-fabrics scenario. That leads to ctrl->cntlid always being 0 in case we have a physical set of controllers in the same subsystem. This patch fixes that by loading the discovered cntlid id_ctrl value into ctrl->cntlid before the subsystem initialization, only for the non-fabrics case. The patch was tested with emulated nvme devices (qemu) having two controllers in a single subsystem. Without the patch, we couldn't make it work failing in the duplicate check; when running with the patch, we could see the subsystem holding both controllers. For the fabrics case we see ctrl->cntlid has a more intricate relation with the admin connect, so we didn't change that. Fixes: 1b1031ca63b2 ("nvme: validate cntlid during controller initialisation") Signed-off-by: Guilherme G. Piccoli Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c258a1ce4b28..fea83fd95252 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2597,6 +2597,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) goto out_free; } + if (!(ctrl->ops->flags & NVME_F_FABRICS)) + ctrl->cntlid = le16_to_cpu(id->cntlid); + if (!ctrl->identified) { int i; @@ -2697,7 +2700,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) goto out_free; } } else { - ctrl->cntlid = le16_to_cpu(id->cntlid); ctrl->hmpre = le32_to_cpu(id->hmpre); ctrl->hmmin = le32_to_cpu(id->hmmin); ctrl->hmminds = le32_to_cpu(id->hmminds); From cb32de1b7e2591f844f18a5513fde8e2bd49bce0 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 16 Aug 2019 15:16:19 -0500 Subject: [PATCH 114/266] nvme: Add quirk for LiteON CL1 devices running FW 22301111 One of the components in LiteON CL1 device has limitations that can be encountered based upon boundary race conditions using the nvme bus specific suspend to idle flow. When this situation occurs the drive doesn't resume properly from suspend-to-idle. LiteON has confirmed this problem and fixed in the next firmware version. As this firmware is already in the field, avoid running nvme specific suspend to idle flow. Fixes: d916b1be94b6 ("nvme-pci: use host managed power state for suspend") Link: http://lists.infradead.org/pipermail/linux-nvme/2019-July/thread.html Signed-off-by: Mario Limonciello Signed-off-by: Charles Hyde Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 10 ++++++++++ drivers/nvme/host/nvme.h | 5 +++++ drivers/nvme/host/pci.c | 3 ++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fea83fd95252..d3d6b7bd6903 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2257,6 +2257,16 @@ static const struct nvme_core_quirk_entry core_quirks[] = { .vid = 0x1179, .mn = "THNSF5256GPUK TOSHIBA", .quirks = NVME_QUIRK_NO_APST, + }, + { + /* + * This LiteON CL1-3D*-Q11 firmware version has a race + * condition associated with actions related to suspend to idle + * LiteON has resolved the problem in future firmware + */ + .vid = 0x14a4, + .fr = "22301111", + .quirks = NVME_QUIRK_SIMPLE_SUSPEND, } }; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 778b3a0b6adb..2d678fb968c7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -92,6 +92,11 @@ enum nvme_quirks { * Broken Write Zeroes. */ NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), + + /* + * Force simple suspend/resume path. + */ + NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6bd9b1033965..732d5b63ec05 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2876,7 +2876,8 @@ static int nvme_suspend(struct device *dev) * state (which may not be possible if the link is up). */ if (pm_suspend_via_firmware() || !ctrl->npss || - !pcie_aspm_enabled(pdev)) { + !pcie_aspm_enabled(pdev) || + (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) { nvme_dev_disable(ndev, true); return 0; } From a3a0e43fd77013819e4b6f55e37e0efe8e35d805 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 Aug 2019 11:03:11 -0600 Subject: [PATCH 115/266] io_uring: don't enter poll loop if we have CQEs pending We need to check if we have CQEs pending before starting a poll loop, as those could be the events we will be spinning for (and hence we'll find none). This can happen if a CQE triggers an error, or if it is found by eg an IRQ before we get a chance to find it through polling. Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 36f04d0b197b..e7a43a354d91 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -679,6 +679,13 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static unsigned io_cqring_events(struct io_cq_ring *ring) +{ + /* See comment at the top of this file */ + smp_rmb(); + return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); +} + /* * Find and free completed poll iocbs */ @@ -818,6 +825,14 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, do { int tmin = 0; + /* + * Don't enter poll loop if we already have events pending. + * If we do, we can potentially be spinning for commands that + * already triggered a CQE (eg in error). + */ + if (io_cqring_events(ctx->cq_ring)) + break; + /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets @@ -2449,13 +2464,6 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) return submit; } -static unsigned io_cqring_events(struct io_cq_ring *ring) -{ - /* See comment at the top of this file */ - smp_rmb(); - return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); -} - /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. From 27b7fb1ab7bfad45f5702ff0c78a4822a41b1456 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 15 Aug 2019 11:38:30 +0300 Subject: [PATCH 116/266] RDMA/mlx5: Fix MR npages calculation for IB_ACCESS_HUGETLB When ODP is enabled with IB_ACCESS_HUGETLB then the required pages should be calculated based on the extent of the MR, which is rounded to the nearest huge page alignment. Fixes: d2183c6f1958 ("RDMA/umem: Move page_shift from ib_umem to ib_odp_umem") Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20190815083834.9245-5-leon@kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 7 +------ drivers/infiniband/hw/mlx5/mem.c | 5 +++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 08da840ed7ee..56553668256f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -379,14 +379,9 @@ EXPORT_SYMBOL(ib_umem_release); int ib_umem_page_count(struct ib_umem *umem) { - int i; - int n; + int i, n = 0; struct scatterlist *sg; - if (umem->is_odp) - return ib_umem_num_pages(umem); - - n = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) n += sg_dma_len(sg) >> PAGE_SHIFT; diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index fe1a76d8531c..a40e0abf2338 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -57,9 +57,10 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int entry; if (umem->is_odp) { - unsigned int page_shift = to_ib_umem_odp(umem)->page_shift; + struct ib_umem_odp *odp = to_ib_umem_odp(umem); + unsigned int page_shift = odp->page_shift; - *ncont = ib_umem_page_count(umem); + *ncont = ib_umem_odp_num_pages(odp); *count = *ncont << (page_shift - PAGE_SHIFT); *shift = page_shift; if (order) From 9b440078017f194e56eaae3ac32f333f420c5c4e Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Mon, 19 Aug 2019 16:02:57 +0200 Subject: [PATCH 117/266] RDMA/siw: Fix potential NULL de-ref In siw_connect() we have an error flow where there is no valid qp pointer. Make sure we don't try to de-ref in that situation. Fixes: 6c52fdc244b5 ("rdma/siw: connection management") Reported-by: Dan Carpenter Signed-off-by: Bernard Metzler Link: https://lore.kernel.org/r/20190819140257.19319-1-bmt@zurich.ibm.com Signed-off-by: Doug Ledford --- drivers/infiniband/sw/siw/siw_cm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 9ce8a1b925d2..fc97571a640b 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1515,7 +1515,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) } } error: - siw_dbg_qp(qp, "failed: %d\n", rv); + siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_socket_disassoc(s); @@ -1540,7 +1540,8 @@ error: } else if (s) { sock_release(s); } - siw_qp_put(qp); + if (qp) + siw_qp_put(qp); return rv; } From d58c1834bf0d218a0bc00f8fb44874551b21da84 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 15 Aug 2019 15:20:33 -0400 Subject: [PATCH 118/266] IB/hfi1: Drop stale TID RDMA packets In a congested fabric with adaptive routing enabled, traces show that the sender could receive stale TID RDMA NAK packets that contain newer KDETH PSNs and older Verbs PSNs. If not dropped, these packets could cause the incorrect rewinding of the software flows and the incorrect completion of TID RDMA WRITE requests, and eventually leading to memory corruption and kernel crash. The current code drops stale TID RDMA ACK/NAK packets solely based on KDETH PSNs, which may lead to erroneous processing. This patch fixes the issue by also checking the Verbs PSN. Addition checks are added before rewinding the TID RDMA WRITE DATA packets. Fixes: 9e93e967f7b4 ("IB/hfi1: Add a function to receive TID RDMA ACK packet") Cc: Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/20190815192033.105923.44192.stgit@awfm-01.aw.intel.com Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 996fc298207e..94070144fef5 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4509,7 +4509,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) struct rvt_swqe *wqe; struct tid_rdma_request *req; struct tid_rdma_flow *flow; - u32 aeth, psn, req_psn, ack_psn, resync_psn, ack_kpsn; + u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn; unsigned long flags; u16 fidx; @@ -4538,6 +4538,9 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) ack_kpsn--; } + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_op_err; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) @@ -4550,7 +4553,8 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); /* Drop stale ACK/NAK */ - if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0) + if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 || + cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0) goto ack_op_err; while (cmp_psn(ack_kpsn, @@ -4712,7 +4716,12 @@ done: switch ((aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ + if (!req->flows) + break; flow = &req->flows[req->acked_tail]; + flpsn = full_flow_psn(flow, flow->flow_state.lpsn); + if (cmp_psn(psn, flpsn) > 0) + break; trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); From 35d5c8b82e2c32e8e29ca195bb4dac60ba7d97fc Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 15 Aug 2019 15:20:39 -0400 Subject: [PATCH 119/266] IB/hfi1: Unsafe PSN checking for TID RDMA READ Resp packet When processing a TID RDMA READ RESP packet that causes KDETH EFLAGS errors, the packet's IB PSN is checked against qp->s_last_psn and qp->s_psn without the protection of qp->s_lock, which is not safe. This patch fixes the issue by acquiring qp->s_lock first. Fixes: 9905bf06e890 ("IB/hfi1: Add functions to receive TID RDMA READ response") Cc: Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/20190815192039.105923.7852.stgit@awfm-01.aw.intel.com Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 94070144fef5..01c8b0280700 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2687,12 +2687,12 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, u32 fpsn; lockdep_assert_held(&qp->r_lock); + spin_lock(&qp->s_lock); /* If the psn is out of valid range, drop the packet */ if (cmp_psn(ibpsn, qp->s_last_psn) < 0 || cmp_psn(ibpsn, qp->s_psn) > 0) - return ret; + goto s_unlock; - spin_lock(&qp->s_lock); /* * Note that NAKs implicitly ACK outstanding SEND and RDMA write * requests and implicitly NAK RDMA read and atomic requests issued From a8adbf7d0d0a6e3bf7f99da461a06039364e028b Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 15 Aug 2019 15:20:45 -0400 Subject: [PATCH 120/266] IB/hfi1: Add additional checks when handling TID RDMA READ RESP packet In a congested fabric with adaptive routing enabled, traces show that packets could be delivered out of order, which could cause incorrect processing of stale packets. For stale TID RDMA READ RESP packets that cause KDETH EFLAGS errors, this patch adds additional checks before processing the packets. Fixes: 9905bf06e890 ("IB/hfi1: Add functions to receive TID RDMA READ response") Cc: Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/20190815192045.105923.59813.stgit@awfm-01.aw.intel.com Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 01c8b0280700..23bb2498e5b4 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2740,9 +2740,12 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, wqe = do_rc_completion(qp, wqe, ibp); if (qp->s_acked == qp->s_tail) - break; + goto s_unlock; } + if (qp->s_acked == qp->s_tail) + goto s_unlock; + /* Handle the eflags for the request */ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) goto s_unlock; From 90fdae66e72bf0381d168f12dca0259617927895 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 15 Aug 2019 15:20:51 -0400 Subject: [PATCH 121/266] IB/hfi1: Add additional checks when handling TID RDMA WRITE DATA packet In a congested fabric with adaptive routing enabled, traces show that packets could be delivered out of order, which could cause incorrect processing of stale packets. For stale TID RDMA WRITE DATA packets that cause KDETH EFLAGS errors, this patch adds additional checks before processing the packets. Fixes: d72fe7d5008b ("IB/hfi1: Add a function to receive TID RDMA WRITE DATA packet") Cc: Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/20190815192051.105923.69979.stgit@awfm-01.aw.intel.com Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 23bb2498e5b4..7bccb59d8a30 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2945,8 +2945,15 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, */ spin_lock(&qp->s_lock); qpriv = qp->priv; + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID || + qpriv->r_tid_tail == qpriv->r_tid_head) + goto unlock; e = &qp->s_ack_queue[qpriv->r_tid_tail]; + if (e->opcode != TID_OP(WRITE_REQ)) + goto unlock; req = ack_to_tid_req(e); + if (req->comp_seg == req->cur_seg) + goto unlock; flow = &req->flows[req->clear_tail]; trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); From d9d1f5e7bb82415591e8b62b222cbb88c4797ef3 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 15 Aug 2019 15:20:58 -0400 Subject: [PATCH 122/266] IB/hfi1: Drop stale TID RDMA packets that cause TIDErr In a congested fabric with adaptive routing enabled, traces show that packets could be delivered out of order. A stale TID RDMA data packet could lead to TidErr if the TID entries have been released by duplicate data packets generated from retries, and subsequently erroneously force the qp into error state in the current implementation. Since the payload has already been dropped by hardware, the packet can be simply dropped and it is no longer necessary to put the qp into error state. Fixes: 9905bf06e890 ("IB/hfi1: Add functions to receive TID RDMA READ response") Cc: Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/20190815192058.105923.72324.stgit@awfm-01.aw.intel.com Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 47 ++------------------------- 1 file changed, 3 insertions(+), 44 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 7bccb59d8a30..6141f4edc6bf 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2574,18 +2574,9 @@ void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp) hfi1_kern_clear_hw_flow(priv->rcd, qp); } -static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd, - struct hfi1_packet *packet, u8 rcv_type, - u8 opcode) +static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type) { struct rvt_qp *qp = packet->qp; - struct hfi1_qp_priv *qpriv = qp->priv; - u32 ipsn; - struct ib_other_headers *ohdr = packet->ohdr; - struct rvt_ack_entry *e; - struct tid_rdma_request *req; - struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); - u32 i; if (rcv_type >= RHF_RCV_TYPE_IB) goto done; @@ -2602,41 +2593,9 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd, if (rcv_type == RHF_RCV_TYPE_EAGER) { hfi1_restart_rc(qp, qp->s_last_psn + 1, 1); hfi1_schedule_send(qp); - goto done_unlock; } - /* - * For TID READ response, error out QP after freeing the tid - * resources. - */ - if (opcode == TID_OP(READ_RESP)) { - ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); - if (cmp_psn(ipsn, qp->s_last_psn) > 0 && - cmp_psn(ipsn, qp->s_psn) < 0) { - hfi1_kern_read_tid_flow_free(qp); - spin_unlock(&qp->s_lock); - rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); - goto done; - } - goto done_unlock; - } - - /* - * Error out the qp for TID RDMA WRITE - */ - hfi1_kern_clear_hw_flow(qpriv->rcd, qp); - for (i = 0; i < rvt_max_atomic(rdi); i++) { - e = &qp->s_ack_queue[i]; - if (e->opcode == TID_OP(WRITE_REQ)) { - req = ack_to_tid_req(e); - hfi1_kern_exp_rcv_clear_all(req); - } - } - spin_unlock(&qp->s_lock); - rvt_rc_error(qp, IB_WC_LOC_LEN_ERR); - goto done; - -done_unlock: + /* Since no payload is delivered, just drop the packet */ spin_unlock(&qp->s_lock); done: return true; @@ -2925,7 +2884,7 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, if (lnh == HFI1_LRH_GRH) goto r_unlock; - if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode)) + if (tid_rdma_tid_err(packet, rcv_type)) goto r_unlock; } From 948a7287b29e06b8c629f5e70235d857a175ceaf Mon Sep 17 00:00:00 2001 From: Ido Kalir Date: Thu, 15 Aug 2019 11:38:27 +0300 Subject: [PATCH 123/266] IB/core: Fix NULL pointer dereference when bind QP to counter If QP is not visible to the pid, then we try to decrease its reference count and return from the function before the QP pointer is initialized. This lead to NULL pointer dereference. Fix it by pass directly the res to the rdma_restract_put as arg instead of &qp->res. This fixes below call trace: [ 5845.110329] BUG: kernel NULL pointer dereference, address: 00000000000000dc [ 5845.120482] Oops: 0002 [#1] SMP PTI [ 5845.129119] RIP: 0010:rdma_restrack_put+0x5/0x30 [ib_core] [ 5845.169450] Call Trace: [ 5845.170544] rdma_counter_get_qp+0x5c/0x70 [ib_core] [ 5845.172074] rdma_counter_bind_qpn_alloc+0x6f/0x1a0 [ib_core] [ 5845.173731] nldev_stat_set_doit+0x314/0x330 [ib_core] [ 5845.175279] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [ 5845.176772] ? __kmalloc_node_track_caller+0x20b/0x2b0 [ 5845.178321] rdma_nl_rcv+0xcb/0x120 [ib_core] [ 5845.179753] netlink_unicast+0x179/0x220 [ 5845.181066] netlink_sendmsg+0x2d8/0x3d0 [ 5845.182338] sock_sendmsg+0x30/0x40 [ 5845.183544] __sys_sendto+0xdc/0x160 [ 5845.184832] ? syscall_trace_enter+0x1f8/0x2e0 [ 5845.186209] ? __audit_syscall_exit+0x1d9/0x280 [ 5845.187584] __x64_sys_sendto+0x24/0x30 [ 5845.188867] do_syscall_64+0x48/0x120 [ 5845.190097] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 1bd8e0a9d0fd1 ("RDMA/counter: Allow manual mode configuration support") Signed-off-by: Ido Kalir Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20190815083834.9245-2-leon@kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/core/counters.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index b79890739a2c..955d061af06a 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -424,7 +424,7 @@ static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) return qp; err: - rdma_restrack_put(&qp->res); + rdma_restrack_put(res); return NULL; } From c8b32408b4074232d93e64b6c23b2aa96dde448e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 15 Aug 2019 11:38:28 +0300 Subject: [PATCH 124/266] RDMA/counters: Properly implement PID checks "Auto" configuration mode is called for visible in that PID namespace and it ensures that all counters and QPs are coexist in the same namespace and belong to same PID. Fixes: 99fa331dc862 ("RDMA/counter: Add "auto" configuration mode support") Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20190815083834.9245-3-leon@kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/core/counters.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 955d061af06a..af8c85d18e62 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -149,13 +149,11 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, struct auto_mode_param *param = &counter->mode.param; bool match = true; - if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) + if (!rdma_is_visible_in_pid_ns(&qp->res)) return false; - /* Ensure that counter belong to right PID */ - if (!rdma_is_kernel_res(&counter->res) && - !rdma_is_kernel_res(&qp->res) && - (task_pid_vnr(counter->res.task) != current->pid)) + /* Ensure that counter belongs to the right PID */ + if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task)) return false; if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) From 60c78668ae50d6b815ead4a62216822a92097125 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 15 Aug 2019 11:38:29 +0300 Subject: [PATCH 125/266] RDMA/restrack: Rewrite PID namespace check to be reliable task_active_pid_ns() is wrong API to check PID namespace because it posses some restrictions and return PID namespace where the process was allocated. It created mismatches with current namespace, which can be different. Rewrite whole rdma_is_visible_in_pid_ns() logic to provide reliable results without any relation to allocated PID namespace. Fixes: 8be565e65fa9 ("RDMA/nldev: Factor out the PID namespace check") Fixes: 6a6c306a09b5 ("RDMA/restrack: Make is_visible_in_pid_ns() as an API") Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20190815083834.9245-4-leon@kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 3 +-- drivers/infiniband/core/restrack.c | 15 +++++++-------- include/rdma/restrack.h | 3 +-- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 87d40d1ecdde..020c26976558 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -382,8 +382,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; - curr = rdma_restrack_count(device, i, - task_active_pid_ns(current)); + curr = rdma_restrack_count(device, i); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index bddff426ee0f..a07665f7ef8c 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -107,10 +107,8 @@ void rdma_restrack_clean(struct ib_device *dev) * rdma_restrack_count() - the current usage of specific object * @dev: IB device * @type: actual type of object to operate - * @ns: PID namespace */ -int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, - struct pid_namespace *ns) +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) { struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *e; @@ -119,10 +117,9 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, xa_lock(&rt->xa); xas_for_each(&xas, e, U32_MAX) { - if (ns == &init_pid_ns || - (!rdma_is_kernel_res(e) && - ns == task_active_pid_ns(e->task))) - cnt++; + if (!rdma_is_visible_in_pid_ns(e)) + continue; + cnt++; } xa_unlock(&rt->xa); return cnt; @@ -360,5 +357,7 @@ bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res) */ if (rdma_is_kernel_res(res)) return task_active_pid_ns(current) == &init_pid_ns; - return task_active_pid_ns(current) == task_active_pid_ns(res->task); + + /* PID 0 means that resource is not found in current namespace */ + return task_pid_vnr(res->task); } diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index b0fc6b26bdf5..83df1ec6664e 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -105,8 +105,7 @@ struct rdma_restrack_entry { }; int rdma_restrack_count(struct ib_device *dev, - enum rdma_restrack_type type, - struct pid_namespace *ns); + enum rdma_restrack_type type); void rdma_restrack_kadd(struct rdma_restrack_entry *res); void rdma_restrack_uadd(struct rdma_restrack_entry *res); From 0e6613b41edd2f55a4b33234c5f31410c1ed3783 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 15 Aug 2019 11:38:31 +0300 Subject: [PATCH 126/266] IB/mlx5: Consolidate use_umr checks into single function Introduce helper function to unify various use_umr checks. Signed-off-by: Moni Shoua Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20190815083834.9245-6-leon@kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 14 ++++++++++++++ drivers/infiniband/hw/mlx5/mr.c | 4 +--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index f6a53455bf8b..9ae587b74b12 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1475,4 +1475,18 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev, bool dyn_bfreg); int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); + +static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev, + bool do_modify_atomic) +{ + if (MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) + return false; + + if (do_modify_atomic && + MLX5_CAP_GEN(dev->mdev, atomic) && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) + return false; + + return true; +} #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index b74fad08412f..8bce65c03b84 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1293,9 +1293,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (err < 0) return ERR_PTR(err); - use_umr = !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled) && - (!MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled) || - !MLX5_CAP_GEN(dev->mdev, atomic)); + use_umr = mlx5_ib_can_use_umr(dev, true); if (order <= mr_cache_max_order(dev) && use_umr) { mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont, From 008157528ac5658502c0f87e872778c56c41109c Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 15 Aug 2019 11:38:32 +0300 Subject: [PATCH 127/266] IB/mlx5: Report and handle ODP support properly ODP depends on the several device capabilities, among them is the ability to send UMR WQEs with that modify atomic and entity size of the MR. Therefore, only if all conditions to send such a UMR WQE are met then driver can report that ODP is supported. Use this check of conditions in all places where driver needs to know about ODP support. Also, implicit ODP support depends on ability of driver to send UMR WQEs for an indirect mkey. Therefore, verify that all conditions to do so are met when reporting support. Fixes: c8d75a980fab ("IB/mlx5: Respect new UMR capabilities") Signed-off-by: Moni Shoua Reviewed-by: Guy Levi Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20190815083834.9245-7-leon@kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 6 +++--- drivers/infiniband/hw/mlx5/odp.c | 17 +++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e12a4404096b..0569bcab02d4 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1023,7 +1023,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { - if (MLX5_CAP_GEN(mdev, pg)) + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; props->odp_caps = dev->odp_caps; } @@ -6139,6 +6139,8 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->port[i].roce.last_port_state = IB_PORT_DOWN; } + mlx5_ib_internal_fill_odp_caps(dev); + err = mlx5_ib_init_multiport_master(dev); if (err) return err; @@ -6563,8 +6565,6 @@ static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev) static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev) { - mlx5_ib_internal_fill_odp_caps(dev); - return mlx5_ib_odp_init_one(dev); } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 1d257d1b3b0d..0a59912a4cef 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -301,7 +301,8 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) memset(caps, 0, sizeof(*caps)); - if (!MLX5_CAP_GEN(dev->mdev, pg)) + if (!MLX5_CAP_GEN(dev->mdev, pg) || + !mlx5_ib_can_use_umr(dev, true)) return; caps->general_caps = IB_ODP_SUPPORT; @@ -355,7 +356,8 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && MLX5_CAP_GEN(dev->mdev, null_mkey) && - MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && + !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled)) caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; return; @@ -1622,8 +1624,10 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) { int ret = 0; - if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) - ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops); + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) + return ret; + + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops); if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); @@ -1633,9 +1637,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) } } - if (!MLX5_CAP_GEN(dev->mdev, pg)) - return ret; - ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq); return ret; @@ -1643,7 +1644,7 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) { - if (!MLX5_CAP_GEN(dev->mdev, pg)) + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) return; mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq); From 25a4517214ffa217a443181f7f885b914e6b328f Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 15 Aug 2019 11:38:33 +0300 Subject: [PATCH 128/266] IB/mlx5: Fix MR re-registration flow to use UMR properly The UMR WQE in the MR re-registration flow requires that modify_atomic and modify_entity_size capabilities are enabled. Therefore, check that the these capabilities are present before going to umr flow and go through slow path if not. Fixes: c8d75a980fab ("IB/mlx5: Respect new UMR capabilities") Signed-off-by: Moni Shoua Reviewed-by: Guy Levi Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20190815083834.9245-8-leon@kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 8bce65c03b84..3401f5f6792e 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1446,7 +1446,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, goto err; } - if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { + if (!mlx5_ib_can_use_umr(dev, true) || + (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) { /* * UMR can't be used - MKey needs to be replaced. */ From 841b07f99a4766d66f50d8a2ab941bce94cd4e70 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 15 Aug 2019 11:38:34 +0300 Subject: [PATCH 129/266] IB/mlx5: Block MR WR if UMR is not possible Check conditions that are mandatory to post_send UMR WQEs. 1. Modifying page size. 2. Modifying remote atomic permissions if atomic access is required. If either condition is not fulfilled then fail to post_send() flow. Fixes: c8d75a980fab ("IB/mlx5: Respect new UMR capabilities") Signed-off-by: Moni Shoua Reviewed-by: Guy Levi Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20190815083834.9245-9-leon@kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 379328b2598f..72869ff4a334 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4162,7 +4162,7 @@ static u64 get_xlt_octo(u64 bytes) MLX5_IB_UMR_OCTOWORD; } -static __be64 frwr_mkey_mask(void) +static __be64 frwr_mkey_mask(bool atomic) { u64 result; @@ -4175,10 +4175,12 @@ static __be64 frwr_mkey_mask(void) MLX5_MKEY_MASK_LW | MLX5_MKEY_MASK_RR | MLX5_MKEY_MASK_RW | - MLX5_MKEY_MASK_A | MLX5_MKEY_MASK_SMALL_FENCE | MLX5_MKEY_MASK_FREE; + if (atomic) + result |= MLX5_MKEY_MASK_A; + return cpu_to_be64(result); } @@ -4204,7 +4206,7 @@ static __be64 sig_mkey_mask(void) } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr, u8 flags) + struct mlx5_ib_mr *mr, u8 flags, bool atomic) { int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; @@ -4212,7 +4214,7 @@ static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, umr->flags = flags; umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); - umr->mkey_mask = frwr_mkey_mask(); + umr->mkey_mask = frwr_mkey_mask(atomic); } static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) @@ -4811,10 +4813,22 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; + bool atomic = wr->access & IB_ACCESS_REMOTE_ATOMIC; u8 flags = 0; + if (!mlx5_ib_can_use_umr(dev, atomic)) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), + "Fast update of %s for MR is disabled\n", + (MLX5_CAP_GEN(dev->mdev, + umr_modify_entity_size_disabled)) ? + "entity size" : + "atomic access"); + return -EINVAL; + } + if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { mlx5_ib_warn(to_mdev(qp->ibqp.device), "Invalid IB_SEND_INLINE send flag\n"); @@ -4826,7 +4840,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, if (umr_inline) flags |= MLX5_UMR_INLINE; - set_reg_umr_seg(*seg, mr, flags); + set_reg_umr_seg(*seg, mr, flags, atomic); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); From a7bfb93f0211b4a2f1ffeeb259ed6206bac30460 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 19 Aug 2019 12:27:39 +0800 Subject: [PATCH 130/266] RDMA/cma: fix null-ptr-deref Read in cma_cleanup In cma_init, if cma_configfs_init fails, need to free the previously memory and return fail, otherwise will trigger null-ptr-deref Read in cma_cleanup. cma_cleanup cma_configfs_exit configfs_unregister_subsystem Fixes: 045959db65c6 ("IB/cma: Add configfs for rdma_cm") Reported-by: Hulk Robot Signed-off-by: zhengbin Reviewed-by: Parav Pandit Link: https://lore.kernel.org/r/1566188859-103051-1-git-send-email-zhengbin13@huawei.com Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 19f1730a4f24..a68d0ccf67a4 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4724,10 +4724,14 @@ static int __init cma_init(void) if (ret) goto err; - cma_configfs_init(); + ret = cma_configfs_init(); + if (ret) + goto err_ib; return 0; +err_ib: + ib_unregister_client(&cma_client); err: unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); From 5c1baaa82cea2c815a5180ded402a7cd455d1810 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sun, 18 Aug 2019 15:23:01 -0500 Subject: [PATCH 131/266] IB/mlx4: Fix memory leaks In mlx4_ib_alloc_pv_bufs(), 'tun_qp->tx_ring' is allocated through kcalloc(). However, it is not always deallocated in the following execution if an error occurs, leading to memory leaks. To fix this issue, free 'tun_qp->tx_ring' whenever an error occurs. Signed-off-by: Wenwen Wang Acked-by: Leon Romanovsky Link: https://lore.kernel.org/r/1566159781-4642-1-git-send-email-wenwen@cs.uga.edu Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/mad.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 68c951491a08..57079110af9b 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1677,8 +1677,6 @@ tx_err: tx_buf_size, DMA_TO_DEVICE); kfree(tun_qp->tx_ring[i].buf.addr); } - kfree(tun_qp->tx_ring); - tun_qp->tx_ring = NULL; i = MLX4_NUM_TUNNEL_BUFS; err: while (i > 0) { @@ -1687,6 +1685,8 @@ err: rx_buf_size, DMA_FROM_DEVICE); kfree(tun_qp->ring[i].addr); } + kfree(tun_qp->tx_ring); + tun_qp->tx_ring = NULL; kfree(tun_qp->ring); tun_qp->ring = NULL; return -ENOMEM; From b08afa064c320e5d85cdc27228426b696c4c8dae Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sun, 18 Aug 2019 14:29:31 -0500 Subject: [PATCH 132/266] infiniband: hfi1: fix a memory leak bug In fault_opcodes_read(), 'data' is not deallocated if debugfs_file_get() fails, leading to a memory leak. To fix this bug, introduce the 'free_data' label to free 'data' before returning the error. Signed-off-by: Wenwen Wang Reviewed-by: Leon Romanovsky Acked-by: Dennis Dalessandro Link: https://lore.kernel.org/r/1566156571-4335-1-git-send-email-wenwen@cs.uga.edu Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c index 93613e5def9b..814324d17295 100644 --- a/drivers/infiniband/hw/hfi1/fault.c +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -214,7 +214,7 @@ static ssize_t fault_opcodes_read(struct file *file, char __user *buf, return -ENOMEM; ret = debugfs_file_get(file->f_path.dentry); if (unlikely(ret)) - return ret; + goto free_data; bit = find_first_bit(fault->opcodes, bitsize); while (bit < bitsize) { zero = find_next_zero_bit(fault->opcodes, bitsize, bit); @@ -232,6 +232,7 @@ static ssize_t fault_opcodes_read(struct file *file, char __user *buf, data[size - 1] = '\n'; data[size] = '\0'; ret = simple_read_from_buffer(buf, len, pos, data, size); +free_data: kfree(data); return ret; } From 2323d7baab2b18d87d9bc267452e387aa9f0060a Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sun, 18 Aug 2019 13:54:46 -0500 Subject: [PATCH 133/266] infiniband: hfi1: fix memory leaks In fault_opcodes_write(), 'data' is allocated through kcalloc(). However, it is not deallocated in the following execution if an error occurs, leading to memory leaks. To fix this issue, introduce the 'free_data' label to free 'data' before returning the error. Signed-off-by: Wenwen Wang Reviewed-by: Leon Romanovsky Acked-by: Dennis Dalessandro Link: https://lore.kernel.org/r/1566154486-3713-1-git-send-email-wenwen@cs.uga.edu Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/fault.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c index 814324d17295..986c12153e62 100644 --- a/drivers/infiniband/hw/hfi1/fault.c +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -141,12 +141,14 @@ static ssize_t fault_opcodes_write(struct file *file, const char __user *buf, if (!data) return -ENOMEM; copy = min(len, datalen - 1); - if (copy_from_user(data, buf, copy)) - return -EFAULT; + if (copy_from_user(data, buf, copy)) { + ret = -EFAULT; + goto free_data; + } ret = debugfs_file_get(file->f_path.dentry); if (unlikely(ret)) - return ret; + goto free_data; ptr = data; token = ptr; for (ptr = data; *ptr; ptr = end + 1, token = ptr) { @@ -195,6 +197,7 @@ static ssize_t fault_opcodes_write(struct file *file, const char __user *buf, ret = len; debugfs_file_put(file->f_path.dentry); +free_data: kfree(data); return ret; } From 4651d1802f7063e4d8c0bcad957f46ece0c04024 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 19 Aug 2019 14:36:01 -0400 Subject: [PATCH 134/266] net/smc: make sure EPOLLOUT is raised Currently, we are only explicitly setting SOCK_NOSPACE on a write timeout for non-blocking sockets. Epoll() edge-trigger mode relies on SOCK_NOSPACE being set when -EAGAIN is returned to ensure that EPOLLOUT is raised. Expand the setting of SOCK_NOSPACE to non-blocking sockets as well that can use SO_SNDTIMEO to adjust their write timeout. This mirrors the behavior that Eric Dumazet introduced for tcp sockets. Signed-off-by: Jason Baron Cc: Eric Dumazet Cc: Ursula Braun Cc: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_tx.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f0de323d15d6..6c8f09c1ce51 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -76,13 +76,11 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; - bool noblock; long timeo; int rc = 0; /* similar to sk_stream_wait_memory */ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - noblock = timeo ? false : true; add_wait_queue(sk_sleep(sk), &wait); while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -97,8 +95,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) break; } if (!timeo) { - if (noblock) - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + /* ensure EPOLLOUT is subsequently generated */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); rc = -EAGAIN; break; } From 96a1b033ac24ccc58156f05c183b2cba0b9412d5 Mon Sep 17 00:00:00 2001 From: "Terry S. Duncan" Date: Mon, 19 Aug 2019 17:24:02 -0700 Subject: [PATCH 135/266] net/ncsi: Ensure 32-bit boundary for data cksum The NCSI spec indicates that if the data does not end on a 32 bit boundary, one to three padding bytes equal to 0x00 shall be present to align the checksum field to a 32-bit boundary. Signed-off-by: Terry S. Duncan Signed-off-by: David S. Miller --- net/ncsi/ncsi-cmd.c | 2 +- net/ncsi/ncsi-rsp.c | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 5c3fad8cba57..eab4346b0a39 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -54,7 +54,7 @@ static void ncsi_cmd_build_header(struct ncsi_pkt_hdr *h, checksum = ncsi_calculate_checksum((unsigned char *)h, sizeof(*h) + nca->payload); pchecksum = (__be32 *)((void *)h + sizeof(struct ncsi_pkt_hdr) + - nca->payload); + ALIGN(nca->payload, 4)); *pchecksum = htonl(checksum); } diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 7581bf919885..d876bd55f356 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -47,7 +47,8 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED || ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) { netdev_dbg(nr->ndp->ndev.dev, - "NCSI: non zero response/reason code\n"); + "NCSI: non zero response/reason code %04xh, %04xh\n", + ntohs(h->code), ntohs(h->reason)); return -EPERM; } @@ -55,7 +56,7 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, * sender doesn't support checksum according to NCSI * specification. */ - pchecksum = (__be32 *)((void *)(h + 1) + payload - 4); + pchecksum = (__be32 *)((void *)(h + 1) + ALIGN(payload, 4) - 4); if (ntohl(*pchecksum) == 0) return 0; @@ -63,7 +64,9 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, sizeof(*h) + payload - 4); if (*pchecksum != htonl(checksum)) { - netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n"); + netdev_dbg(nr->ndp->ndev.dev, + "NCSI: checksum mismatched; recd: %08x calc: %08x\n", + *pchecksum, htonl(checksum)); return -EINVAL; } From a1c4cd67840ef80f6ca5f73326fa9a6719303a95 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 20 Aug 2019 13:52:47 +0800 Subject: [PATCH 136/266] net: fix __ip_mc_inc_group usage in ip_mc_inc_group, memory allocation flag, not mcast mode, is expected by __ip_mc_inc_group similar issue in __ip_mc_join_group, both mcase mode and gfp_t are needed here, so use ____ip_mc_inc_group(...) Fixes: 9fb20801dab4 ("net: Fix ip_mc_{dec,inc}_group allocation context") Signed-off-by: Li RongQing Signed-off-by: Florian Fainelli Signed-off-by: Zhang Yu Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 180f6896b98b..480d0b22db1a 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(__ip_mc_inc_group); void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { - __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); + __ip_mc_inc_group(in_dev, addr, GFP_KERNEL); } EXPORT_SYMBOL(ip_mc_inc_group); @@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, iml->sflist = NULL; iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); - __ip_mc_inc_group(in_dev, addr, mode); + ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL); err = 0; done: return err; From 90ae409f9eb3bcaf38688f9ec22375816053a08e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Aug 2019 11:45:49 +0900 Subject: [PATCH 137/266] dma-direct: fix zone selection after an unaddressable CMA allocation The new dma_alloc_contiguous hides if we allocate CMA or regular pages, and thus fails to retry a ZONE_NORMAL allocation if the CMA allocation succeeds but isn't addressable. That means we either fail outright or dip into a small zone that might not succeed either. Thanks to Hillf Danton for debugging this issue. Fixes: b1d2dc009dec ("dma-contiguous: add dma_{alloc,free}_contiguous() helpers") Reported-by: Tobias Klausmann Signed-off-by: Christoph Hellwig Tested-by: Tobias Klausmann --- drivers/iommu/dma-iommu.c | 3 +++ include/linux/dma-contiguous.h | 5 +---- kernel/dma/contiguous.c | 8 ++------ kernel/dma/direct.c | 10 +++++++++- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index d991d40f797f..f68a62c3c32b 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -965,10 +965,13 @@ static void *iommu_dma_alloc_pages(struct device *dev, size_t size, { bool coherent = dev_is_dma_coherent(dev); size_t alloc_size = PAGE_ALIGN(size); + int node = dev_to_node(dev); struct page *page = NULL; void *cpu_addr; page = dma_alloc_contiguous(dev, alloc_size, gfp); + if (!page) + page = alloc_pages_node(node, gfp, get_order(alloc_size)); if (!page) return NULL; diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index c05d4e661489..03f8e98e3bcc 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -160,10 +160,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, static inline struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - size_t align = get_order(PAGE_ALIGN(size)); - - return alloc_pages_node(node, gfp, align); + return NULL; } static inline void dma_free_contiguous(struct device *dev, struct page *page, diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 2bd410f934b3..69cfb4345388 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; - size_t align = get_order(PAGE_ALIGN(size)); + size_t count = size >> PAGE_SHIFT; struct page *page = NULL; struct cma *cma = NULL; @@ -243,14 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) /* CMA can be used only in the context which permits sleeping */ if (cma && gfpflags_allow_blocking(gfp)) { + size_t align = get_order(size); size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); } - /* Fallback allocation of normal pages */ - if (!page) - page = alloc_pages_node(node, gfp, align); return page; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 795c9b095d75..706113c6bebc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -85,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { + size_t alloc_size = PAGE_ALIGN(size); + int node = dev_to_node(dev); struct page *page = NULL; u64 phys_mask; @@ -95,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); + page = dma_alloc_contiguous(dev, alloc_size, gfp); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + dma_free_contiguous(dev, page, alloc_size); + page = NULL; + } again: - page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(node, gfp, get_order(alloc_size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; From 377ec83643efcae869528b4b26a5070fdeba3abd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 11 Aug 2019 19:18:03 -0500 Subject: [PATCH 138/266] dmaengine: fsldma: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warnings (Building: powerpc-ppa8548_defconfig powerpc): drivers/dma/fsldma.c: In function ‘fsl_dma_chan_probe’: drivers/dma/fsldma.c:1165:26: warning: this statement may fall through [-Wimplicit-fallthrough=] chan->toggle_ext_pause = fsl_chan_toggle_ext_pause; ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/dma/fsldma.c:1166:2: note: here case FSL_DMA_IP_83XX: ^~~~ Reported-by: kbuild test robot Acked-by: Li Yang Signed-off-by: Gustavo A. R. Silva --- drivers/dma/fsldma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 23e0a356f167..ad72b3f42ffa 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -1163,6 +1163,7 @@ static int fsl_dma_chan_probe(struct fsldma_device *fdev, switch (chan->feature & FSL_DMA_IP_MASK) { case FSL_DMA_IP_85XX: chan->toggle_ext_pause = fsl_chan_toggle_ext_pause; + /* Fall through */ case FSL_DMA_IP_83XX: chan->toggle_ext_start = fsl_chan_toggle_ext_start; chan->set_src_loop_size = fsl_chan_set_src_loop_size; From 06264adfa2bcc8abb556dec9af0e86150a67faf0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 19:29:16 -0500 Subject: [PATCH 139/266] ARM: riscpc: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warning (Building: rpc_defconfig arm): arch/arm/mach-rpc/riscpc.c: In function ‘parse_tag_acorn’: arch/arm/mach-rpc/riscpc.c:48:13: warning: this statement may fall through [-Wimplicit-fallthrough=] vram_size += PAGE_SIZE * 256; ~~~~~~~~~~^~~~~~~~~~~~~~~~~~ arch/arm/mach-rpc/riscpc.c:49:2: note: here case 256: ^~~~ Signed-off-by: Gustavo A. R. Silva --- arch/arm/mach-rpc/riscpc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-rpc/riscpc.c b/arch/arm/mach-rpc/riscpc.c index 0ce56ad754ce..ea2c84214bac 100644 --- a/arch/arm/mach-rpc/riscpc.c +++ b/arch/arm/mach-rpc/riscpc.c @@ -46,6 +46,7 @@ static int __init parse_tag_acorn(const struct tag *tag) switch (tag->u.acorn.vram_pages) { case 512: vram_size += PAGE_SIZE * 256; + /* Fall through - ??? */ case 256: vram_size += PAGE_SIZE * 256; default: From edf6a05976980b5c21f19a60fde175f736e4ab61 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 13:01:03 -0500 Subject: [PATCH 140/266] drm/sun4i: sun6i_mipi_dsi: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warning (Building: multi_v7_defconfig arm): drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c: In function ‘sun6i_dsi_transfer’: drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c:993:6: warning: this statement may fall through [-Wimplicit-fallthrough=] if (msg->rx_len == 1) { ^ drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c:998:2: note: here default: ^~~~~~~ Signed-off-by: Gustavo A. R. Silva --- drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c b/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c index a1fc8b520985..b889ad3e86e1 100644 --- a/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c +++ b/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c @@ -993,6 +993,7 @@ static ssize_t sun6i_dsi_transfer(struct mipi_dsi_host *host, ret = sun6i_dsi_dcs_read(dsi, msg); break; } + /* Else, fall through */ default: ret = -EINVAL; From 5334653d4ff29f5e1f216a2f5d3a86c19a479b1a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 12:47:06 -0500 Subject: [PATCH 141/266] drm/sun4i: tcon: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warning (Building: sunxi_defconfig arm): drivers/gpu/drm/sun4i/sun4i_tcon.c: In function ‘sun4i_tcon0_mode_set_dithering’: drivers/gpu/drm/sun4i/sun4i_tcon.c:318:7: warning: this statement may fall through [-Wimplicit-fallthrough=] val |= SUN4I_TCON0_FRM_CTL_MODE_B; drivers/gpu/drm/sun4i/sun4i_tcon.c:319:2: note: here case MEDIA_BUS_FMT_RGB666_1X18: ^~~~ Signed-off-by: Gustavo A. R. Silva --- drivers/gpu/drm/sun4i/sun4i_tcon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/sun4i/sun4i_tcon.c b/drivers/gpu/drm/sun4i/sun4i_tcon.c index 64c43ee6bd92..df0cc8f46d7b 100644 --- a/drivers/gpu/drm/sun4i/sun4i_tcon.c +++ b/drivers/gpu/drm/sun4i/sun4i_tcon.c @@ -314,6 +314,7 @@ static void sun4i_tcon0_mode_set_dithering(struct sun4i_tcon *tcon, /* R and B components are only 5 bits deep */ val |= SUN4I_TCON0_FRM_CTL_MODE_R; val |= SUN4I_TCON0_FRM_CTL_MODE_B; + /* Fall through */ case MEDIA_BUS_FMT_RGB666_1X18: case MEDIA_BUS_FMT_RGB666_1X7X3_SPWG: /* Fall through: enable dithering */ From 3f0289cb9e0ee38e0075328e59b9cd88bf5ea474 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 12:54:32 -0500 Subject: [PATCH 142/266] mtd: sa1100: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warning (Building: assabet_defconfig arm): drivers/mtd/maps/sa1100-flash.c: In function ‘sa1100_probe_subdev’: drivers/mtd/maps/sa1100-flash.c:82:3: warning: this statement may fall through [-Wimplicit-fallthrough=] printk(KERN_WARNING "SA1100 flash: unknown base address " ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ "0x%08lx, assuming CS0\n", phys); ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/mtd/maps/sa1100-flash.c:85:2: note: here case SA1100_CS0_PHYS: ^~~~ Signed-off-by: Gustavo A. R. Silva --- drivers/mtd/maps/sa1100-flash.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c index 895510d40ce4..47602af4ee34 100644 --- a/drivers/mtd/maps/sa1100-flash.c +++ b/drivers/mtd/maps/sa1100-flash.c @@ -81,6 +81,7 @@ static int sa1100_probe_subdev(struct sa_subdev_info *subdev, struct resource *r default: printk(KERN_WARNING "SA1100 flash: unknown base address " "0x%08lx, assuming CS0\n", phys); + /* Fall through */ case SA1100_CS0_PHYS: subdev->map.bankwidth = (MSC0 & MSC_RBW) ? 2 : 4; From c9cbbb9f04f3ee27970f08d3aa6e6742a43d4ca5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 13:07:46 -0500 Subject: [PATCH 143/266] watchdog: wdt285: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warning (Building: footbridge_defconfig arm): drivers/watchdog/wdt285.c: In function ‘watchdog_ioctl’: drivers/watchdog/wdt285.c:170:3: warning: this statement may fall through [-Wimplicit-fallthrough=] watchdog_ping(); ^~~~~~~~~~~~~~~ drivers/watchdog/wdt285.c:172:2: note: here case WDIOC_GETTIMEOUT: ^~~~ Signed-off-by: Gustavo A. R. Silva --- drivers/watchdog/wdt285.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/wdt285.c b/drivers/watchdog/wdt285.c index 4eacfb1ce1ac..eb729d704836 100644 --- a/drivers/watchdog/wdt285.c +++ b/drivers/watchdog/wdt285.c @@ -168,7 +168,7 @@ static long watchdog_ioctl(struct file *file, unsigned int cmd, soft_margin = new_margin; reload = soft_margin * (mem_fclk_21285 / 256); watchdog_ping(); - /* Fall */ + /* Fall through */ case WDIOC_GETTIMEOUT: ret = put_user(soft_margin, int_arg); break; From 5274fdba8e3c04e9ac1ba457379afc8835f9aa0f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 15:55:26 -0500 Subject: [PATCH 144/266] power: supply: ab8500_charger: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warning (Building: allmodconfig arm): drivers/power/supply/ab8500_charger.c: In function ‘ab8500_charger_max_usb_curr’: drivers/power/supply/ab8500_charger.c:738:6: warning: this statement may fall through [-Wimplicit-fallthrough=] if (di->vbus_detected) { ^ drivers/power/supply/ab8500_charger.c:745:2: note: here case USB_STAT_HM_IDGND: ^~~~ Signed-off-by: Gustavo A. R. Silva --- drivers/power/supply/ab8500_charger.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/power/supply/ab8500_charger.c b/drivers/power/supply/ab8500_charger.c index 30de448de802..86d88aec94a1 100644 --- a/drivers/power/supply/ab8500_charger.c +++ b/drivers/power/supply/ab8500_charger.c @@ -742,6 +742,7 @@ static int ab8500_charger_max_usb_curr(struct ab8500_charger *di, USB_CH_IP_CUR_LVL_1P5; break; } + /* Else, fall through */ case USB_STAT_HM_IDGND: dev_err(di->dev, "USB Type - Charging not allowed\n"); di->max_usb_in_curr.usb_type_max = USB_CH_IP_CUR_LVL_0P05; From 93cbcf5d22bbe6f2ecf64765d5f6085beceb3ee8 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 16:03:09 -0500 Subject: [PATCH 145/266] MIPS: Octeon: Mark expected switch fall-through Mark switch cases where we are expecting to fall through. Fix the following warning (Building: cavium_octeon_defconfig mips): arch/mips/include/asm/octeon/cvmx-sli-defs.h:47:6: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Gustavo A. R. Silva --- arch/mips/include/asm/octeon/cvmx-sli-defs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/include/asm/octeon/cvmx-sli-defs.h b/arch/mips/include/asm/octeon/cvmx-sli-defs.h index 52cf96ea43e5..cbc7cdae1c6a 100644 --- a/arch/mips/include/asm/octeon/cvmx-sli-defs.h +++ b/arch/mips/include/asm/octeon/cvmx-sli-defs.h @@ -46,6 +46,7 @@ static inline uint64_t CVMX_SLI_PCIE_MSI_RCV_FUNC(void) case OCTEON_CN78XX & OCTEON_FAMILY_MASK: if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) return 0x0000000000003CB0ull; + /* Else, fall through */ default: return 0x0000000000023CB0ull; } From da1fb2909e701ffbae8c5d6111f475603355e6e2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 16:20:05 -0500 Subject: [PATCH 146/266] scsi: libsas: sas_discover: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warning (Building: mtx1_defconfig mips): drivers/scsi/libsas/sas_discover.c: In function ‘sas_discover_domain’: ./include/linux/printk.h:309:2: warning: this statement may fall through [-Wimplicit-fallthrough=] printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/scsi/libsas/sas_discover.c:459:3: note: in expansion of macro ‘pr_notice’ pr_notice("ATA device seen but CONFIG_SCSI_SAS_ATA=N so cannot attach\n"); ^~~~~~~~~ drivers/scsi/libsas/sas_discover.c:462:2: note: here default: ^~~~~~~ Signed-off-by: Gustavo A. R. Silva --- drivers/scsi/libsas/sas_discover.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index abcad097ff2f..f47b4b281b14 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -459,6 +459,7 @@ static void sas_discover_domain(struct work_struct *work) pr_notice("ATA device seen but CONFIG_SCSI_SAS_ATA=N so cannot attach\n"); /* Fall through */ #endif + /* Fall through - only for the #else condition above. */ default: error = -ENXIO; pr_err("unhandled device %d\n", dev->dev_type); From c3cb6674df4c4a70f949e412dfe2230483092523 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2019 19:07:46 -0500 Subject: [PATCH 147/266] video: fbdev: acornfb: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. Fix the following warning (Building: rpc_defconfig arm): drivers/video/fbdev/acornfb.c: In function ‘acornfb_parse_dram’: drivers/video/fbdev/acornfb.c:860:9: warning: this statement may fall through [-Wimplicit-fallthrough=] size *= 1024; ~~~~~^~~~~~~ drivers/video/fbdev/acornfb.c:861:3: note: here case 'K': ^~~~ Signed-off-by: Gustavo A. R. Silva --- drivers/video/fbdev/acornfb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/fbdev/acornfb.c b/drivers/video/fbdev/acornfb.c index 92f23e3bc27a..7cacae5a8797 100644 --- a/drivers/video/fbdev/acornfb.c +++ b/drivers/video/fbdev/acornfb.c @@ -858,6 +858,7 @@ static void acornfb_parse_dram(char *opt) case 'M': case 'm': size *= 1024; + /* Fall through */ case 'K': case 'k': size *= 1024; From 6de3c9e3f6b3eaf66859e1379b3f35dda781416b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 15 Aug 2019 11:41:06 +0200 Subject: [PATCH 148/266] ALSA: usb-audio: Fix invalid NULL check in snd_emuusb_set_samplerate() The quirk function snd_emuusb_set_samplerate() has a NULL check for the mixer element, but this is useless in the current code. It used to be a check against mixer->id_elems[unitid] but it was changed later to the value after mixer_eleme_list_to_info() which is always non-NULL due to the container_of() usage. This patch fixes the check before the conversion. While we're at it, correct a typo in the comment in the function, too. Fixes: 8c558076c740 ("ALSA: usb-audio: Clean up mixer element list traverse") Cc: Signed-off-by: Takashi Iwai --- sound/usb/mixer_quirks.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index 199fa157a411..27dcb3743690 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -1155,17 +1155,17 @@ void snd_emuusb_set_samplerate(struct snd_usb_audio *chip, { struct usb_mixer_interface *mixer; struct usb_mixer_elem_info *cval; - int unitid = 12; /* SamleRate ExtensionUnit ID */ + int unitid = 12; /* SampleRate ExtensionUnit ID */ list_for_each_entry(mixer, &chip->mixer_list, list) { - cval = mixer_elem_list_to_info(mixer->id_elems[unitid]); - if (cval) { + if (mixer->id_elems[unitid]) { + cval = mixer_elem_list_to_info(mixer->id_elems[unitid]); snd_usb_mixer_set_ctl_value(cval, UAC_SET_CUR, cval->control << 8, samplerate_id); snd_usb_mixer_notify_id(mixer, unitid); + break; } - break; } } From 5fd2f91ad483baffdbe798f8a08f1b41442d1e24 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 1 Aug 2019 09:30:33 +0200 Subject: [PATCH 149/266] mac80211: fix possible sta leak If TDLS station addition is rejected, the sta memory is leaked. Avoid this by moving the check before the allocation. Cc: stable@vger.kernel.org Fixes: 7ed5285396c2 ("mac80211: don't initiate TDLS connection if station is not associated to AP") Link: https://lore.kernel.org/r/20190801073033.7892-1-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4d458067d80d..111c400199ec 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1546,6 +1546,11 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (is_multicast_ether_addr(mac)) return -EINVAL; + if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER) && + sdata->vif.type == NL80211_IFTYPE_STATION && + !sdata->u.mgd.associated) + return -EINVAL; + sta = sta_info_alloc(sdata, mac, GFP_KERNEL); if (!sta) return -ENOMEM; @@ -1553,10 +1558,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) sta->sta.tdls = true; - if (sta->sta.tdls && sdata->vif.type == NL80211_IFTYPE_STATION && - !sdata->u.mgd.associated) - return -EINVAL; - err = sta_apply_parameters(local, sta, params); if (err) { sta_info_free(local, sta); From b67fd72e84a88cae64cea8ab47ccdaab3bb3094d Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Mon, 5 Aug 2019 14:34:00 +0200 Subject: [PATCH 150/266] cfg80211: Fix Extended Key ID key install checks Fix two shortcomings in the Extended Key ID API: 1) Allow the userspace to install pairwise keys using keyid 1 without NL80211_KEY_NO_TX set. This allows the userspace to install and activate pairwise keys with keyid 1 in the same way as for keyid 0, simplifying the API usage for e.g. FILS and FT key installs. 2) IEEE 802.11 - 2016 restricts Extended Key ID usage to CCMP/GCMP ciphers in IEEE 802.11 - 2016 "9.4.2.25.4 RSN capabilities". Enforce that when installing a key. Cc: stable@vger.kernel.org # 5.2 Fixes: 6cdd3979a2bd ("nl80211/cfg80211: Extended Key ID support") Signed-off-by: Alexander Wetzel Link: https://lore.kernel.org/r/20190805123400.51567-1-alexander@wetzel-home.de Signed-off-by: Johannes Berg --- net/wireless/util.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/wireless/util.c b/net/wireless/util.c index d0e35b7b9e35..e74837824cea 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -233,25 +233,30 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: + /* Extended Key ID can only be used with CCMP/GCMP ciphers */ + if ((pairwise && key_idx) || + params->mode != NL80211_KEY_RX_TX) + return -EINVAL; + break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - /* IEEE802.11-2016 allows only 0 and - when using Extended Key - * ID - 1 as index for pairwise keys. + /* IEEE802.11-2016 allows only 0 and - when supporting + * Extended Key ID - 1 as index for pairwise keys. * @NL80211_KEY_NO_TX is only allowed for pairwise keys when * the driver supports Extended Key ID. * @NL80211_KEY_SET_TX can't be set when installing and * validating a key. */ - if (params->mode == NL80211_KEY_NO_TX) { - if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_EXT_KEY_ID)) + if ((params->mode == NL80211_KEY_NO_TX && !pairwise) || + params->mode == NL80211_KEY_SET_TX) + return -EINVAL; + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID)) { + if (pairwise && (key_idx < 0 || key_idx > 1)) return -EINVAL; - else if (!pairwise || key_idx < 0 || key_idx > 1) - return -EINVAL; - } else if ((pairwise && key_idx) || - params->mode == NL80211_KEY_SET_TX) { + } else if (pairwise && key_idx) { return -EINVAL; } break; From 54577e5018a8c0cb79c9a0fa118a55c68715d398 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 20 Aug 2019 17:35:52 +0200 Subject: [PATCH 151/266] selftests: kvm: fix state save/load on processors without XSAVE state_test and smm_test are failing on older processors that do not have xcr0. This is because on those processor KVM does provide support for KVM_GET/SET_XSAVE (to avoid having to rely on the older KVM_GET/SET_FPU) but not for KVM_GET/SET_XCRS. Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/lib/x86_64/processor.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 6cb34a0fa200..0a5e487dbc50 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1060,9 +1060,11 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", r); - r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", - r); + if (kvm_check_cap(KVM_CAP_XCRS)) { + r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", + r); + } r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", @@ -1103,9 +1105,11 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", r); - r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", - r); + if (kvm_check_cap(KVM_CAP_XCRS)) { + r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", + r); + } r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", From d012a06ab1d23178fc6856d8d2161fbcc4dd8ebd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 15 Aug 2019 09:43:32 +0200 Subject: [PATCH 152/266] Revert "KVM: x86/mmu: Zap only the relevant pages when removing a memslot" This reverts commit 4e103134b862314dc2f2f18f2fb0ab972adc3f5f. Alex Williamson reported regressions with device assignment with this patch. Even though the bug is probably elsewhere and still latent, this is needed to fix the regression. Fixes: 4e103134b862 ("KVM: x86/mmu: Zap only the relevant pages when removing a memslot", 2019-02-05) Reported-by: Alex Willamson Cc: stable@vger.kernel.org Cc: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 24843cf49579..218b277bfda3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5653,38 +5653,7 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) { - struct kvm_mmu_page *sp; - LIST_HEAD(invalid_list); - unsigned long i; - bool flush; - gfn_t gfn; - - spin_lock(&kvm->mmu_lock); - - if (list_empty(&kvm->arch.active_mmu_pages)) - goto out_unlock; - - flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false); - - for (i = 0; i < slot->npages; i++) { - gfn = slot->base_gfn + i; - - for_each_valid_sp(kvm, sp, gfn) { - if (sp->gfn != gfn) - continue; - - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - flush = false; - cond_resched_lock(&kvm->mmu_lock); - } - } - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - -out_unlock: - spin_unlock(&kvm->mmu_lock); + kvm_mmu_zap_all(kvm); } void kvm_mmu_init_vm(struct kvm *kvm) From 0d31d4dbf38412f5b8b11b4511d07b840eebe8cb Mon Sep 17 00:00:00 2001 From: "Hodaszi, Robert" Date: Fri, 14 Jun 2019 13:16:01 +0000 Subject: [PATCH 153/266] Revert "cfg80211: fix processing world regdomain when non modular" This reverts commit 96cce12ff6e0 ("cfg80211: fix processing world regdomain when non modular"). Re-triggering a reg_process_hint with the last request on all events, can make the regulatory domain fail in case of multiple WiFi modules. On slower boards (espacially with mdev), enumeration of the WiFi modules can end up in an intersected regulatory domain, and user cannot set it with 'iw reg set' anymore. This is happening, because: - 1st module enumerates, queues up a regulatory request - request gets processed by __reg_process_hint_driver(): - checks if previous was set by CORE -> yes - checks if regulator domain changed -> yes, from '00' to e.g. 'US' -> sends request to the 'crda' - 2nd module enumerates, queues up a regulator request (which triggers the reg_todo() work) - reg_todo() -> reg_process_pending_hints() sees, that the last request is not processed yet, so it tries to process it again. __reg_process_hint driver() will run again, and: - checks if the last request's initiator was the core -> no, it was the driver (1st WiFi module) - checks, if the previous initiator was the driver -> yes - checks if the regulator domain changed -> yes, it was '00' (set by core, and crda call did not return yet), and should be changed to 'US' ------> __reg_process_hint_driver calls an intersect Besides, the reg_process_hint call with the last request is meaningless since the crda call has a timeout work. If that timeout expires, the first module's request will lost. Cc: stable@vger.kernel.org Fixes: 96cce12ff6e0 ("cfg80211: fix processing world regdomain when non modular") Signed-off-by: Robert Hodaszi Link: https://lore.kernel.org/r/20190614131600.GA13897@a1-hr Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 4831ad745f91..327479ce69f5 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2788,7 +2788,7 @@ static void reg_process_pending_hints(void) /* When last_request->processed becomes true this will be rescheduled */ if (lr && !lr->processed) { - reg_process_hint(lr); + pr_debug("Pending regulatory request, waiting for it to be processed...\n"); return; } From 3b5be16c7e90a69c93349d210766250fffcb54bd Mon Sep 17 00:00:00 2001 From: He Zhe Date: Tue, 20 Aug 2019 22:53:10 +0800 Subject: [PATCH 154/266] modules: page-align module section allocations only for arches supporting strict module rwx We should keep the case of "#define debug_align(X) (X)" for all arches without CONFIG_HAS_STRICT_MODULE_RWX ability, which would save people, who are sensitive to system size, a lot of memory when using modules, especially for embedded systems. This is also the intention of the original #ifdef... statement and still valid for now. Note that this still keeps the effect of the fix of the following commit, 38f054d549a8 ("modules: always page-align module section allocations"), since when CONFIG_ARCH_HAS_STRICT_MODULE_RWX is enabled, module pages are aligned. Signed-off-by: He Zhe Signed-off-by: Jessica Yu --- kernel/module.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/module.c b/kernel/module.c index cd8df516666d..9ee93421269c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -64,9 +64,14 @@ /* * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data + * to ensure complete separation of code and data, but + * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) +#else +# define debug_align(X) (X) +#endif /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) From ed19e3035c5a16034e896eed28c5e72e02e2ff58 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Thu, 8 Aug 2019 19:25:47 +0300 Subject: [PATCH 155/266] drm/i915: Fix HW readout for crtc_clock in HDMI mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The conversion during HDMI HW readout from port_clock to crtc_clock was missed when HDMI 10bpc support was added, so fix that. v2: - Unscrew the non-HDMI case. Fixes: cd9e11a8bf25 ("drm/i915/icl: Add 10-bit support for hdmi") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109593 Cc: Radhakrishna Sripada Cc: Ville Syrjälä Signed-off-by: Imre Deak Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20190808162547.7009-1-imre.deak@intel.com (cherry picked from commit 2969a78aead38b49e80c821a5c683544ab16160d) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_ddi.c | 4 ++-- drivers/gpu/drm/i915/intel_drv.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 7925a176f900..1cb1fa74cfbc 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -1465,8 +1465,8 @@ static void ddi_dotclock_get(struct intel_crtc_state *pipe_config) else if (intel_crtc_has_dp_encoder(pipe_config)) dotclock = intel_dotclock_calculate(pipe_config->port_clock, &pipe_config->dp_m_n); - else if (pipe_config->has_hdmi_sink && pipe_config->pipe_bpp == 36) - dotclock = pipe_config->port_clock * 2 / 3; + else if (pipe_config->has_hdmi_sink && pipe_config->pipe_bpp > 24) + dotclock = pipe_config->port_clock * 24 / pipe_config->pipe_bpp; else dotclock = pipe_config->port_clock; diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 1d58f7ec5d84..f11979879e7b 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -829,7 +829,7 @@ struct intel_crtc_state { /* * Frequence the dpll for the port should run at. Differs from the - * adjusted dotclock e.g. for DP or 12bpc hdmi mode. This is also + * adjusted dotclock e.g. for DP or 10/12bpc hdmi mode. This is also * already multiplied by pixel_multiplier. */ int port_clock; From 806ce6e2117a42528e7bb979e04e28229b34a612 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 20 Aug 2019 16:18:04 +0200 Subject: [PATCH 156/266] selftests/bpf: fix test_cgroup_storage on s390 test_cgroup_storage fails on s390 with an assertion failure: packets are dropped when they shouldn't. The problem is that BPF_DW packet count is accessed as BPF_W with an offset of 0, which is not correct on big-endian machines. Since the point of this test is not to verify narrow loads/stores, simply use BPF_DW when working with packet counts. Fixes: 68cfa3ac6b8d ("selftests/bpf: add a cgroup storage test") Fixes: 919646d2a3a9 ("selftests/bpf: extend the storage test to test per-cpu cgroup storage") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_cgroup_storage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index 2fc4625c1a15..655729004391 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -20,9 +20,9 @@ int main(int argc, char **argv) BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), - BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), - BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), BPF_LD_MAP_FD(BPF_REG_1, 0), /* map fd */ BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ @@ -30,7 +30,7 @@ int main(int argc, char **argv) BPF_FUNC_get_local_storage), BPF_MOV64_IMM(BPF_REG_1, 1), BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0), - BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1), BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), From e91dcb536ae263ecff07118e36bf820c229a6ecd Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 19 Aug 2019 14:38:47 +0200 Subject: [PATCH 157/266] selftests/bpf: fix test_btf_dump with O= test_btf_dump fails when run with O=, because it needs to access source files and assumes they live in ./progs/, which is not the case in this scenario. Fix by instructing kselftest to copy btf_dump_test_case_*.c files to the test directory. Since kselftest does not preserve directory structure, adjust the test to look in ./progs/ and then in ./. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 3 +++ tools/testing/selftests/bpf/test_btf_dump.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c085964e1d05..69b98d8d3b5b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -34,6 +34,9 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c))) TEST_GEN_FILES = $(BPF_OBJ_FILES) +BTF_C_FILES = $(wildcard progs/btf_dump_test_case_*.c) +TEST_FILES = $(BTF_C_FILES) + # Also test sub-register code-gen if LLVM has eBPF v3 processor support which # contains both ALU32 and JMP32 instructions. SUBREG_CODEGEN := $(shell echo "int cal(int a) { return a > 0; }" | \ diff --git a/tools/testing/selftests/bpf/test_btf_dump.c b/tools/testing/selftests/bpf/test_btf_dump.c index 8f850823d35f..6e75dd3cb14f 100644 --- a/tools/testing/selftests/bpf/test_btf_dump.c +++ b/tools/testing/selftests/bpf/test_btf_dump.c @@ -97,6 +97,13 @@ int test_btf_dump_case(int n, struct btf_dump_test_case *test_case) } snprintf(test_file, sizeof(test_file), "progs/%s.c", test_case->name); + if (access(test_file, R_OK) == -1) + /* + * When the test is run with O=, kselftest copies TEST_FILES + * without preserving the directory structure. + */ + snprintf(test_file, sizeof(test_file), "%s.c", + test_case->name); /* * Diff test output and expected test output, contained between * START-EXPECTED-OUTPUT and END-EXPECTED-OUTPUT lines in test case. From 0604409df9e04cdec7b08d471c8c1c0c10b5554d Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 20 Aug 2019 15:41:34 +0200 Subject: [PATCH 158/266] selftests/bpf: add config fragment BPF_JIT When running test_kmod.sh the following shows up # sysctl cannot stat /proc/sys/net/core/bpf_jit_enable No such file or directory cannot: stat_/proc/sys/net/core/bpf_jit_enable # # sysctl cannot stat /proc/sys/net/core/bpf_jit_harden No such file or directory cannot: stat_/proc/sys/net/core/bpf_jit_harden # Rework to enable CONFIG_BPF_JIT to solve "No such file or directory" Signed-off-by: Anders Roxell Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index f7a0744db31e..5dc109f4c097 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -34,3 +34,4 @@ CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m CONFIG_IPV6_SIT=m +CONFIG_BPF_JIT=y From 3035bb72ee47d494c041465b4add9c6407c832ed Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 20 Aug 2019 15:41:21 +0200 Subject: [PATCH 159/266] selftests/bpf: install files test_xdp_vlan.sh When ./test_xdp_vlan_mode_generic.sh runs it complains that it can't find file test_xdp_vlan.sh. # selftests: bpf: test_xdp_vlan_mode_generic.sh # ./test_xdp_vlan_mode_generic.sh: line 9: ./test_xdp_vlan.sh: No such file or directory Rework so that test_xdp_vlan.sh gets installed, added to the variable TEST_PROGS_EXTENDED. Fixes: d35661fcf95d ("selftests/bpf: add wrapper scripts for test_xdp_vlan.sh") Signed-off-by: Anders Roxell Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 69b98d8d3b5b..96752ebd938f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -71,7 +71,8 @@ TEST_PROGS := test_kmod.sh \ TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ tcp_client.py \ - tcp_server.py + tcp_server.py \ + test_xdp_vlan.sh # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \ From 08c04c84a5cde3af9baac0645a7496d6dcd76822 Mon Sep 17 00:00:00 2001 From: Bryan Gurney Date: Fri, 16 Aug 2019 10:09:53 -0400 Subject: [PATCH 160/266] dm dust: use dust block size for badblocklist index Change the "frontend" dust_remove_block, dust_add_block, and dust_query_block functions to store the "dust block number", instead of the sector number corresponding to the "dust block number". For the "backend" functions dust_map_read and dust_map_write, right-shift by sect_per_block_shift. This fixes the inability to emulate failure beyond the first sector of each "dust block" (for devices with a "dust block size" larger than 512 bytes). Fixes: e4f3fabd67480bf ("dm: add dust target") Cc: stable@vger.kernel.org Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- drivers/md/dm-dust.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 845f376a72d9..8288887b7f94 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -25,6 +25,7 @@ struct dust_device { unsigned long long badblock_count; spinlock_t dust_lock; unsigned int blksz; + int sect_per_block_shift; unsigned int sect_per_block; sector_t start; bool fail_read_on_bb:1; @@ -79,7 +80,7 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block) unsigned long flags; spin_lock_irqsave(&dd->dust_lock, flags); - bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + bblock = dust_rb_search(&dd->badblocklist, block); if (bblock == NULL) { if (!dd->quiet_mode) { @@ -113,7 +114,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) } spin_lock_irqsave(&dd->dust_lock, flags); - bblock->bb = block * dd->sect_per_block; + bblock->bb = block; if (!dust_rb_insert(&dd->badblocklist, bblock)) { if (!dd->quiet_mode) { DMERR("%s: block %llu already in badblocklist", @@ -138,7 +139,7 @@ static int dust_query_block(struct dust_device *dd, unsigned long long block) unsigned long flags; spin_lock_irqsave(&dd->dust_lock, flags); - bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + bblock = dust_rb_search(&dd->badblocklist, block); if (bblock != NULL) DMINFO("%s: block %llu found in badblocklist", __func__, block); else @@ -165,6 +166,7 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock, int ret = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { + thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); ret = __dust_map_read(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); @@ -195,6 +197,7 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock, unsigned long flags; if (fail_read_on_bb) { + thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); __dust_map_write(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); @@ -331,6 +334,8 @@ static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv) dd->blksz = blksz; dd->start = tmp; + dd->sect_per_block_shift = __ffs(sect_per_block); + /* * Whether to fail a read on a "bad" block. * Defaults to false; enabled later by message. From e0702d90b79d430b0ccc276ead4f88440bb51352 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 19 Aug 2019 12:58:14 +0300 Subject: [PATCH 161/266] dm zoned: fix potential NULL dereference in dmz_do_reclaim() This function is supposed to return error pointers so it matches the dmz_get_rnd_zone_for_reclaim() function. The current code could lead to a NULL dereference in dmz_do_reclaim() Fixes: b234c6d7a703 ("dm zoned: improve error handling in reclaim") Signed-off-by: Dan Carpenter Reviewed-by: Dmitry Fomichev Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 2a5bc51fd6d5..595a73110e17 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1588,7 +1588,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) struct dm_zone *zone; if (list_empty(&zmd->map_seq_list)) - return NULL; + return ERR_PTR(-EBUSY); list_for_each_entry(zone, &zmd->map_seq_list, link) { if (!zone->bzone) @@ -1597,7 +1597,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) return zone; } - return NULL; + return ERR_PTR(-EBUSY); } /* From dc1a3e8e0cc6b2293b48c044710e63395aeb4fb4 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sun, 18 Aug 2019 19:18:34 -0500 Subject: [PATCH 162/266] dm raid: add missing cleanup in raid_ctr() If rs_prepare_reshape() fails, no cleanup is executed, leading to leak of the raid_set structure allocated at the beginning of raid_ctr(). To fix this issue, go to the label 'bad' if the error occurs. Fixes: 11e4723206683 ("dm raid: stop keeping raid set frozen altogether") Cc: stable@vger.kernel.org Signed-off-by: Wenwen Wang Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 8a60a4a070ac..1f933dd197cd 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3194,7 +3194,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) */ r = rs_prepare_reshape(rs); if (r) - return r; + goto bad; /* Reshaping ain't recovery, so disable recovery */ rs_setup_recovery(rs, MaxSector); From e4427372398c31f57450565de277f861a4db5b3b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 10 Jun 2019 19:22:55 +0200 Subject: [PATCH 163/266] selftests/kvm: make platform_info_test pass on AMD test_msr_platform_info_disabled() generates EXIT_SHUTDOWN but VMCB state is undefined after that so an attempt to launch this guest again from test_msr_platform_info_enabled() fails. Reorder the tests to make test pass. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/platform_info_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 40050e44ec0a..f9334bd3cce9 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -99,8 +99,8 @@ int main(int argc, char *argv[]) msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - test_msr_platform_info_disabled(vm); test_msr_platform_info_enabled(vm); + test_msr_platform_info_disabled(vm); vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); kvm_vm_free(vm); From 1bc8d18c75fef3b478dbdfef722aae09e2a9fde7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 21 Aug 2019 20:00:02 +0200 Subject: [PATCH 164/266] ALSA: line6: Fix memory leak at line6_init_pcm() error path I forgot to release the allocated object at the early error path in line6_init_pcm(). For addressing it, slightly shuffle the code so that the PCM destructor (pcm->private_free) is assigned properly before all error paths. Fixes: 3450121997ce ("ALSA: line6: Fix write on zero-sized buffer") Cc: Signed-off-by: Takashi Iwai --- sound/usb/line6/pcm.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c index 2c03e0f6bf72..f70211e6b174 100644 --- a/sound/usb/line6/pcm.c +++ b/sound/usb/line6/pcm.c @@ -550,6 +550,15 @@ int line6_init_pcm(struct usb_line6 *line6, line6pcm->volume_monitor = 255; line6pcm->line6 = line6; + spin_lock_init(&line6pcm->out.lock); + spin_lock_init(&line6pcm->in.lock); + line6pcm->impulse_period = LINE6_IMPULSE_DEFAULT_PERIOD; + + line6->line6pcm = line6pcm; + + pcm->private_data = line6pcm; + pcm->private_free = line6_cleanup_pcm; + line6pcm->max_packet_size_in = usb_maxpacket(line6->usbdev, usb_rcvisocpipe(line6->usbdev, ep_read), 0); @@ -562,15 +571,6 @@ int line6_init_pcm(struct usb_line6 *line6, return -EINVAL; } - spin_lock_init(&line6pcm->out.lock); - spin_lock_init(&line6pcm->in.lock); - line6pcm->impulse_period = LINE6_IMPULSE_DEFAULT_PERIOD; - - line6->line6pcm = line6pcm; - - pcm->private_data = line6pcm; - pcm->private_free = line6_cleanup_pcm; - err = line6_create_audio_out_urbs(line6pcm); if (err < 0) return err; From aad12c2394189f606ce0308ab65505fdd9081a10 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 21 Aug 2019 14:29:29 +0300 Subject: [PATCH 165/266] trivial: netns: fix typo in 'struct net.passive' description Replace 'decided' with 'decide' so that comment would be /* To decide when the network namespace should be freed. */ Signed-off-by: Mike Rapoport Signed-off-by: David S. Miller --- include/net/net_namespace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index cb668bc2692d..ab40d7afdc54 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -52,7 +52,7 @@ struct bpf_prog; #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) struct net { - refcount_t passive; /* To decided when the network + refcount_t passive; /* To decide when the network * namespace should be freed. */ refcount_t count; /* To decided when the network From 7846f58fba964af7cb8cf77d4d13c33254725211 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 21 Aug 2019 12:25:13 -0700 Subject: [PATCH 166/266] x86/boot: Fix boot regression caused by bootparam sanitizing commit a90118c445cc ("x86/boot: Save fields explicitly, zero out everything else") had two errors: * It preserved boot_params.acpi_rsdp_addr, and * It failed to preserve boot_params.hdr Therefore, zero out acpi_rsdp_addr, and preserve hdr. Fixes: a90118c445cc ("x86/boot: Save fields explicitly, zero out everything else") Reported-by: Neil MacLeod Suggested-by: Thomas Gleixner Signed-off-by: John Hubbard Signed-off-by: Thomas Gleixner Tested-by: Neil MacLeod Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190821192513.20126-1-jhubbard@nvidia.com --- arch/x86/include/asm/bootparam_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index f5e90a849bca..9e5f3c722c33 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -59,7 +59,6 @@ static void sanitize_boot_params(struct boot_params *boot_params) BOOT_PARAM_PRESERVE(apm_bios_info), BOOT_PARAM_PRESERVE(tboot_addr), BOOT_PARAM_PRESERVE(ist_info), - BOOT_PARAM_PRESERVE(acpi_rsdp_addr), BOOT_PARAM_PRESERVE(hd0_info), BOOT_PARAM_PRESERVE(hd1_info), BOOT_PARAM_PRESERVE(sys_desc_table), @@ -71,6 +70,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) BOOT_PARAM_PRESERVE(eddbuf_entries), BOOT_PARAM_PRESERVE(edd_mbr_sig_buf_entries), BOOT_PARAM_PRESERVE(edd_mbr_sig_buffer), + BOOT_PARAM_PRESERVE(hdr), BOOT_PARAM_PRESERVE(e820_table), BOOT_PARAM_PRESERVE(eddbuf), }; From 2d683eaaeeb9d33d23674ae635e0ef1448523d18 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 21 Aug 2019 16:41:23 +0200 Subject: [PATCH 167/266] net: cpsw: fix NULL pointer exception in the probe error path In certain cases when the probe function fails the error path calls cpsw_remove_dt() before calling platform_set_drvdata(). This is an issue as cpsw_remove_dt() uses platform_get_drvdata() to retrieve the cpsw_common data and leds to a NULL pointer exception. This patches fixes it by calling platform_set_drvdata() earlier in the probe. Fixes: 83a8471ba255 ("net: ethernet: ti: cpsw: refactor probe to group common hw initialization") Reported-by: Maxime Chevallier Signed-off-by: Antoine Tenart Reviewed-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 32a89744972d..a46b8b2e44e1 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2775,6 +2775,7 @@ static int cpsw_probe(struct platform_device *pdev) if (!cpsw) return -ENOMEM; + platform_set_drvdata(pdev, cpsw); cpsw->dev = dev; mode = devm_gpiod_get_array_optional(dev, "mode", GPIOD_OUT_LOW); @@ -2879,7 +2880,6 @@ static int cpsw_probe(struct platform_device *pdev) goto clean_cpts; } - platform_set_drvdata(pdev, cpsw); priv = netdev_priv(ndev); priv->cpsw = cpsw; priv->ndev = ndev; From 98f58ada2d37e68125c056f1fc005748251879c2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 15 Aug 2019 08:27:09 -0500 Subject: [PATCH 168/266] drm/amdgpu/gfx9: update pg_flags after determining if gfx off is possible We need to set certain power gating flags after we determine if the firmware version is sufficient to support gfxoff. Previously we set the pg flags in early init, but we later we might have disabled gfxoff if the firmware versions didn't support it. Move adding the additional pg flags after we determine whether or not to support gfxoff. Fixes: 005440066f92 ("drm/amdgpu: enable gfxoff again on raven series (v2)") Tested-by: Kai-Heng Feng Tested-by: Tom St Denis Signed-off-by: Alex Deucher Cc: Kai-Heng Feng Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/soc15.c | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 04b8ac4432c7..4ea67f94cae2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -604,6 +604,10 @@ static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev) (adev->gfx.rlc_feature_version < 1) || !adev->gfx.rlc.is_rlc_v2_1) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; + if (adev->pm.pp_feature & PP_GFXOFF_MASK) + adev->pg_flags |= AMD_PG_SUPPORT_GFX_PG | + AMD_PG_SUPPORT_CP | + AMD_PG_SUPPORT_RLC_SMU_HS; break; default: break; diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 23265414d448..04fbf05d7176 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -992,11 +992,6 @@ static int soc15_common_early_init(void *handle) adev->pg_flags = AMD_PG_SUPPORT_SDMA | AMD_PG_SUPPORT_VCN; } - - if (adev->pm.pp_feature & PP_GFXOFF_MASK) - adev->pg_flags |= AMD_PG_SUPPORT_GFX_PG | - AMD_PG_SUPPORT_CP | - AMD_PG_SUPPORT_RLC_SMU_HS; break; default: /* FIXME: not supported yet */ From 00430144ff7343369222a110985aaa6726fb26e0 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 19 Aug 2019 23:38:02 +0800 Subject: [PATCH 169/266] drm/amd/powerplay: fix variable type errors in smu_v11_0_setup_pptable fix size type errors, from uint32_t to uint16_t. it will cause only initializes the highest 16 bits in smu_get_atom_data_table function. bug report: This fixes the following static checker warning. drivers/gpu/drm/amd/amdgpu/../powerplay/smu_v11_0.c:390 smu_v11_0_setup_pptable() warn: passing casted pointer '&size' to 'smu_get_atom_data_table()' 32 vs 16. Signed-off-by: Kevin Wang Reported-by: Dan Carpenter Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index 5fde5cf65b42..3ac061a3c3c5 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -327,6 +327,7 @@ static int smu_v11_0_setup_pptable(struct smu_context *smu) const struct smc_firmware_header_v1_0 *hdr; int ret, index; uint32_t size; + uint16_t atom_table_size; uint8_t frev, crev; void *table; uint16_t version_major, version_minor; @@ -354,10 +355,11 @@ static int smu_v11_0_setup_pptable(struct smu_context *smu) index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1, powerplayinfo); - ret = smu_get_atom_data_table(smu, index, (uint16_t *)&size, &frev, &crev, + ret = smu_get_atom_data_table(smu, index, &atom_table_size, &frev, &crev, (uint8_t **)&table); if (ret) return ret; + size = atom_table_size; } if (!smu->smu_table.power_play_table) From 155f85c0d56896552439fd4fb5f43dfc4e9f842a Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 20 Aug 2019 13:28:51 +0800 Subject: [PATCH 170/266] drm/amd/powerplay: remove duplicate macro smu_get_uclk_dpm_states in amdgpu_smu.h remove duplicate macro smu_get_uclk_dpm_states in amdgpu_smu.h " #define smu_get_uclk_dpm_states(smu, clocks_in_khz, num_states) \ ((smu)->ppt_funcs->get_uclk_dpm_states ? (smu)->ppt_funcs->get_uclk_dpm_states((smu), (clocks_in_khz), (num_states)) : 0) #define smu_get_max_sustainable_clocks_by_dc(smu, max_clocks) \ ((smu)->funcs->get_max_sustainable_clocks_by_dc ? (smu)->funcs->get_max_sustainable_clocks_by_dc((smu), (max_clocks)) : 0) #define smu_get_uclk_dpm_states(smu, clocks_in_khz, num_states) \ ((smu)->ppt_funcs->get_uclk_dpm_states ? (smu)->ppt_funcs->get_uclk_dpm_states((smu), (clocks_in_khz), (num_states)) : 0) " Signed-off-by: Kevin Wang Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index a0f52c86d8c7..a78b2e295895 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -907,8 +907,6 @@ struct smu_funcs ((smu)->funcs->register_irq_handler ? (smu)->funcs->register_irq_handler(smu) : 0) #define smu_set_azalia_d3_pme(smu) \ ((smu)->funcs->set_azalia_d3_pme ? (smu)->funcs->set_azalia_d3_pme((smu)) : 0) -#define smu_get_uclk_dpm_states(smu, clocks_in_khz, num_states) \ - ((smu)->ppt_funcs->get_uclk_dpm_states ? (smu)->ppt_funcs->get_uclk_dpm_states((smu), (clocks_in_khz), (num_states)) : 0) #define smu_get_max_sustainable_clocks_by_dc(smu, max_clocks) \ ((smu)->funcs->get_max_sustainable_clocks_by_dc ? (smu)->funcs->get_max_sustainable_clocks_by_dc((smu), (max_clocks)) : 0) #define smu_get_uclk_dpm_states(smu, clocks_in_khz, num_states) \ From 221a2bdbd5d3871a5f41d912b2f06cc02e8f8b38 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Tue, 20 Aug 2019 15:11:37 +0800 Subject: [PATCH 171/266] drm/amd/amdgpu: disable MMHUB PG for navi10 Disable MMHUB PG for navi10 according to the production requirement. Signed-off-by: Kenneth Feng Reviewed-by: Hawking Zhang Reviewed-by: Kevin Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 662612f89c70..9922bce3fd89 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -552,7 +552,6 @@ static int nv_common_early_init(void *handle) AMD_CG_SUPPORT_BIF_LS; adev->pg_flags = AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_VCN_DPG | - AMD_PG_SUPPORT_MMHUB | AMD_PG_SUPPORT_ATHUB; adev->external_rev_id = adev->rev_id + 0x1; break; From 1a701ea924815b0518733aa8d5d05c1f6fa87062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Tue, 20 Aug 2019 15:39:53 +0200 Subject: [PATCH 172/266] drm/amdgpu: prevent memory leaks in AMDGPU_CS ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Error out if the AMDGPU_CS ioctl is called with multiple SYNCOBJ_OUT and/or TIMELINE_SIGNAL chunks, since otherwise the last chunk wins while the allocated array as well as the reference counts of sync objects are leaked. Signed-off-by: Nicolai Hähnle Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 4e4094f842e7..8b26c970a3cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1143,6 +1143,9 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p, num_deps = chunk->length_dw * 4 / sizeof(struct drm_amdgpu_cs_chunk_sem); + if (p->post_deps) + return -EINVAL; + p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps), GFP_KERNEL); p->num_post_deps = 0; @@ -1166,8 +1169,7 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p, static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p, - struct amdgpu_cs_chunk - *chunk) + struct amdgpu_cs_chunk *chunk) { struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps; unsigned num_deps; @@ -1177,6 +1179,9 @@ static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p num_deps = chunk->length_dw * 4 / sizeof(struct drm_amdgpu_cs_chunk_syncobj); + if (p->post_deps) + return -EINVAL; + p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps), GFP_KERNEL); p->num_post_deps = 0; From ec6e491353b9024d4b1a65c48b21e3bc0faeae4e Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Wed, 21 Aug 2019 11:27:13 -0400 Subject: [PATCH 173/266] drm/amd/display: Calculate bpc based on max_requested_bpc [Why] The only place where state->max_bpc is updated on the connector is at the start of atomic check during drm_atomic_connector_check. It isn't updated when adding the connectors to the atomic state after the fact. It also doesn't necessarily reflect the right value when called in amdgpu during mode validation outside of atomic check. This can cause the wrong bpc to be used even if the max_requested_bpc is the correct value. [How] Don't rely on state->max_bpc reflecting the real bpc value and just do the min(...) based on display info bpc and max_requested_bpc. Fixes: 01933ba42d3d ("drm/amd/display: Use current connector state if NULL when checking bpc") Signed-off-by: Nicholas Kazlauskas Reviewed-by: Leo Li Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 4a29f72334d0..45be7a2132bb 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3131,13 +3131,25 @@ static enum dc_color_depth convert_color_depth_from_display_info(const struct drm_connector *connector, const struct drm_connector_state *state) { - uint32_t bpc = connector->display_info.bpc; + uint8_t bpc = (uint8_t)connector->display_info.bpc; + + /* Assume 8 bpc by default if no bpc is specified. */ + bpc = bpc ? bpc : 8; if (!state) state = connector->state; if (state) { - bpc = state->max_bpc; + /* + * Cap display bpc based on the user requested value. + * + * The value for state->max_bpc may not correctly updated + * depending on when the connector gets added to the state + * or if this was called outside of atomic check, so it + * can't be used directly. + */ + bpc = min(bpc, state->max_requested_bpc); + /* Round down to the nearest even number. */ bpc = bpc - (bpc & 1); } From 738a2e4b1774fe0d20d6c027a7cbafb6a1619675 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 21 Aug 2019 17:07:46 -0700 Subject: [PATCH 174/266] net: dsa: bcm_sf2: Do not configure PHYLINK on CPU port The SF2 binding does not specify that the CPU port should have properties mandatory for successfully instantiating a PHYLINK object. As such, there will be missing properties (including fixed-link) and when attempting to validate and later configure link modes, we will have an incorrect set of parameters (interface, speed, duplex). Simply prevent the CPU port from being configured through PHYLINK since bcm_sf2_imp_setup() takes care of that already. Fixes: 0e27921816ad ("net: dsa: Use PHYLINK for the CPU/DSA ports") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 3811fdbda13e..28c963a21dac 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -478,6 +478,7 @@ static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port, unsigned long *supported, struct phylink_link_state *state) { + struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; if (!phy_interface_mode_is_rgmii(state->interface) && @@ -487,8 +488,10 @@ static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port, state->interface != PHY_INTERFACE_MODE_INTERNAL && state->interface != PHY_INTERFACE_MODE_MOCA) { bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); - dev_err(ds->dev, - "Unsupported interface: %d\n", state->interface); + if (port != core_readl(priv, CORE_IMP0_PRT_ID)) + dev_err(ds->dev, + "Unsupported interface: %d for port %d\n", + state->interface, port); return; } @@ -526,6 +529,9 @@ static void bcm_sf2_sw_mac_config(struct dsa_switch *ds, int port, u32 id_mode_dis = 0, port_mode; u32 reg, offset; + if (port == core_readl(priv, CORE_IMP0_PRT_ID)) + return; + if (priv->type == BCM7445_DEVICE_ID) offset = CORE_STS_OVERRIDE_GMIIP_PORT(port); else From f17f7648a49aa6728649ddf79bdbcac4f1970ce4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 20 Aug 2019 10:19:47 +0800 Subject: [PATCH 175/266] ipv6/addrconf: allow adding multicast addr if IFA_F_MCAUTOJOIN is set In commit 93a714d6b53d ("multicast: Extend ip address command to enable multicast group join/leave on") we added a new flag IFA_F_MCAUTOJOIN to make user able to add multicast address on ethernet interface. This works for IPv4, but not for IPv6. See the inet6_addr_add code. static int inet6_addr_add() { ... if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, true...) } ifp = ipv6_add_addr(idev, cfg, true, extack); <- always fail with maddr if (!IS_ERR(ifp)) { ... } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, false...) } } But in ipv6_add_addr() it will check the address type and reject multicast address directly. So this feature is never worked for IPv6. We should not remove the multicast address check totally in ipv6_add_addr(), but could accept multicast address only when IFA_F_MCAUTOJOIN flag supplied. v2: update commit description Fixes: 93a714d6b53d ("multicast: Extend ip address command to enable multicast group join/leave on") Reported-by: Jianlin Shi Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dc73888c7859..ced995f3fec4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1045,7 +1045,8 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, int err = 0; if (addr_type == IPV6_ADDR_ANY || - addr_type & IPV6_ADDR_MULTICAST || + (addr_type & IPV6_ADDR_MULTICAST && + !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) || (!(idev->dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(idev->dev) && addr_type & IPV6_ADDR_LOOPBACK)) From 0f404bbdaf1624f4d25dd67da7ff85eab005beac Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 20 Aug 2019 10:46:00 +0800 Subject: [PATCH 176/266] net: fix icmp_socket_deliver argument 2 input it expects a unsigned int, but got a __be32 Signed-off-by: Li RongQing Signed-off-by: Zhang Yu Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1510e951f451..bf7b5d45de99 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -902,7 +902,7 @@ static bool icmp_redirect(struct sk_buff *skb) return false; } - icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); + icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway)); return true; } From cc07db5a5b100bc8eaab5097a23d72f858979750 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 20 Aug 2019 12:11:44 +0300 Subject: [PATCH 177/266] gve: Copy and paste bug in gve_get_stats() There is a copy and paste error so we have "rx" where "tx" was intended in the priv->tx[] array. Fixes: f5cedc84a30d ("gve: Add transmit and receive support") Signed-off-by: Dan Carpenter Reviewed-by: Catherine Sullivan Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 497298752381..aca95f64bde8 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -50,7 +50,7 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) u64_stats_fetch_begin(&priv->tx[ring].statss); s->tx_packets += priv->tx[ring].pkt_done; s->tx_bytes += priv->tx[ring].bytes_done; - } while (u64_stats_fetch_retry(&priv->rx[ring].statss, + } while (u64_stats_fetch_retry(&priv->tx[ring].statss, start)); } } From 7035eef4496d95b69b0bc18e0bced09304e0afdf Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 21 Aug 2019 11:45:25 -0700 Subject: [PATCH 178/266] md: update MAINTAINERS info I have been reviewing patches for md in the past few months. Mark me as the MD maintainer, as I have effectively been filling that role. Cc: NeilBrown Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 43604d6ab96c..eae4e0d1117a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14882,9 +14882,9 @@ F: include/linux/arm_sdei.h F: include/uapi/linux/arm_sdei.h SOFTWARE RAID (Multiple Disks) SUPPORT -M: Shaohua Li +M: Song Liu L: linux-raid@vger.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git S: Supported F: drivers/md/Makefile F: drivers/md/Kconfig From f9f0e9ed350e15d51ad07364b4cf910de50c472a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 20 Aug 2019 21:43:42 +0200 Subject: [PATCH 179/266] ALSA: usb-audio: Check mixer unit bitmap yet more strictly The bmControls (for UAC1) or bmMixerControls (for UAC2/3) bitmap has a variable size depending on both input and output pins. Its size is to fit with input * output bits. The problem is that the input size can't be determined simply from the unit descriptor itself but it needs to parse the whole connected sources. Although the uac_mixer_unit_get_channels() tries to check some possible overflow of this bitmap, it's incomplete due to the lack of the evaluation of input pins. For covering possible overflows, this patch adds the bitmap overflow check in the loop of input pins in parse_audio_mixer_unit(). Fixes: 0bfe5e434e66 ("ALSA: usb-audio: Check mixer unit descriptors more strictly") Cc: Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index b5927c3d5bc0..eceab19766db 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -739,7 +739,6 @@ static int uac_mixer_unit_get_channels(struct mixer_build *state, struct uac_mixer_unit_descriptor *desc) { int mu_channels; - void *c; if (desc->bLength < sizeof(*desc)) return -EINVAL; @@ -762,13 +761,6 @@ static int uac_mixer_unit_get_channels(struct mixer_build *state, break; } - if (!mu_channels) - return 0; - - c = uac_mixer_unit_bmControls(desc, state->mixer->protocol); - if (c - (void *)desc + (mu_channels - 1) / 8 >= desc->bLength) - return 0; /* no bmControls -> skip */ - return mu_channels; } @@ -2009,6 +2001,31 @@ static int parse_audio_feature_unit(struct mixer_build *state, int unitid, * Mixer Unit */ +/* check whether the given in/out overflows bmMixerControls matrix */ +static bool mixer_bitmap_overflow(struct uac_mixer_unit_descriptor *desc, + int protocol, int num_ins, int num_outs) +{ + u8 *hdr = (u8 *)desc; + u8 *c = uac_mixer_unit_bmControls(desc, protocol); + size_t rest; /* remaining bytes after bmMixerControls */ + + switch (protocol) { + case UAC_VERSION_1: + default: + rest = 1; /* iMixer */ + break; + case UAC_VERSION_2: + rest = 2; /* bmControls + iMixer */ + break; + case UAC_VERSION_3: + rest = 6; /* bmControls + wMixerDescrStr */ + break; + } + + /* overflow? */ + return c + (num_ins * num_outs + 7) / 8 + rest > hdr + hdr[0]; +} + /* * build a mixer unit control * @@ -2137,6 +2154,9 @@ static int parse_audio_mixer_unit(struct mixer_build *state, int unitid, if (err < 0) return err; num_ins += iterm.channels; + if (mixer_bitmap_overflow(desc, state->mixer->protocol, + num_ins, num_outs)) + break; for (; ich < num_ins; ich++) { int och, ich_has_controls = 0; From 5c498950f730aa17c5f8a2cdcb903524e4002ed2 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 19 Jul 2019 15:32:19 +0100 Subject: [PATCH 180/266] libceph: allow ceph_buffer_put() to receive a NULL ceph_buffer Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/buffer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h index 5e58bb29b1a3..11cdc7c60480 100644 --- a/include/linux/ceph/buffer.h +++ b/include/linux/ceph/buffer.h @@ -30,7 +30,8 @@ static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) static inline void ceph_buffer_put(struct ceph_buffer *b) { - kref_put(&b->kref, ceph_buffer_release); + if (b) + kref_put(&b->kref, ceph_buffer_release); } extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end); From 86968ef21596515958d5f0a40233d02be78ecec0 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 19 Jul 2019 15:32:20 +0100 Subject: [PATCH 181/266] ceph: fix buffer free while holding i_ceph_lock in __ceph_setxattr() Calling ceph_buffer_put() in __ceph_setxattr() may end up freeing the i_xattrs.prealloc_blob buffer while holding the i_ceph_lock. This can be fixed by postponing the call until later, when the lock is released. The following backtrace was triggered by fstests generic/117. BUG: sleeping function called from invalid context at mm/vmalloc.c:2283 in_atomic(): 1, irqs_disabled(): 0, pid: 650, name: fsstress 3 locks held by fsstress/650: #0: 00000000870a0fe8 (sb_writers#8){.+.+}, at: mnt_want_write+0x20/0x50 #1: 00000000ba0c4c74 (&type->i_mutex_dir_key#6){++++}, at: vfs_setxattr+0x55/0xa0 #2: 000000008dfbb3f2 (&(&ci->i_ceph_lock)->rlock){+.+.}, at: __ceph_setxattr+0x297/0x810 CPU: 1 PID: 650 Comm: fsstress Not tainted 5.2.0+ #437 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x67/0x90 ___might_sleep.cold+0x9f/0xb1 vfree+0x4b/0x60 ceph_buffer_release+0x1b/0x60 __ceph_setxattr+0x2b4/0x810 __vfs_setxattr+0x66/0x80 __vfs_setxattr_noperm+0x59/0xf0 vfs_setxattr+0x81/0xa0 setxattr+0x115/0x230 ? filename_lookup+0xc9/0x140 ? rcu_read_lock_sched_held+0x74/0x80 ? rcu_sync_lockdep_assert+0x2e/0x60 ? __sb_start_write+0x142/0x1a0 ? mnt_want_write+0x20/0x50 path_setxattr+0xba/0xd0 __x64_sys_lsetxattr+0x24/0x30 do_syscall_64+0x50/0x1c0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7ff23514359a Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 37b458a9af3a..c083557b3657 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1036,6 +1036,7 @@ int __ceph_setxattr(struct inode *inode, const char *name, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf = NULL; + struct ceph_buffer *old_blob = NULL; int issued; int err; int dirty = 0; @@ -1109,13 +1110,15 @@ retry: struct ceph_buffer *blob; spin_unlock(&ci->i_ceph_lock); - dout(" preaallocating new blob size=%d\n", required_blob_size); + ceph_buffer_put(old_blob); /* Shouldn't be required */ + dout(" pre-allocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); + /* prealloc_blob can't be released while holding i_ceph_lock */ if (ci->i_xattrs.prealloc_blob) - ceph_buffer_put(ci->i_xattrs.prealloc_blob); + old_blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = blob; goto retry; } @@ -1131,6 +1134,7 @@ retry: } spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); if (lock_snap_rwsem) up_read(&mdsc->snap_rwsem); if (dirty) From 12fe3dda7ed89c95cc0ef7abc001ad1ad3e092f8 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 19 Jul 2019 15:32:21 +0100 Subject: [PATCH 182/266] ceph: fix buffer free while holding i_ceph_lock in __ceph_build_xattrs_blob() Calling ceph_buffer_put() in __ceph_build_xattrs_blob() may result in freeing the i_xattrs.blob buffer while holding the i_ceph_lock. This can be fixed by having this function returning the old blob buffer and have the callers of this function freeing it when the lock is released. The following backtrace was triggered by fstests generic/117. BUG: sleeping function called from invalid context at mm/vmalloc.c:2283 in_atomic(): 1, irqs_disabled(): 0, pid: 649, name: fsstress 4 locks held by fsstress/649: #0: 00000000a7478e7e (&type->s_umount_key#19){++++}, at: iterate_supers+0x77/0xf0 #1: 00000000f8de1423 (&(&ci->i_ceph_lock)->rlock){+.+.}, at: ceph_check_caps+0x7b/0xc60 #2: 00000000562f2b27 (&s->s_mutex){+.+.}, at: ceph_check_caps+0x3bd/0xc60 #3: 00000000f83ce16a (&mdsc->snap_rwsem){++++}, at: ceph_check_caps+0x3ed/0xc60 CPU: 1 PID: 649 Comm: fsstress Not tainted 5.2.0+ #439 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x67/0x90 ___might_sleep.cold+0x9f/0xb1 vfree+0x4b/0x60 ceph_buffer_release+0x1b/0x60 __ceph_build_xattrs_blob+0x12b/0x170 __send_cap+0x302/0x540 ? __lock_acquire+0x23c/0x1e40 ? __mark_caps_flushing+0x15c/0x280 ? _raw_spin_unlock+0x24/0x30 ceph_check_caps+0x5f0/0xc60 ceph_flush_dirty_caps+0x7c/0x150 ? __ia32_sys_fdatasync+0x20/0x20 ceph_sync_fs+0x5a/0x130 iterate_supers+0x8f/0xf0 ksys_sync+0x4f/0xb0 __ia32_sys_sync+0xa/0x10 do_syscall_64+0x50/0x1c0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fc6409ab617 Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 5 ++++- fs/ceph/snap.c | 4 +++- fs/ceph/super.h | 2 +- fs/ceph/xattr.c | 11 ++++++++--- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d98dcd976c80..ce0f5658720a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1301,6 +1301,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; + struct ceph_buffer *old_blob = NULL; struct cap_msg_args arg; int held, revoking; int wake = 0; @@ -1365,7 +1366,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ci->i_requested_max_size = arg.max_size; if (flushing & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); + old_blob = __ceph_build_xattrs_blob(ci); arg.xattr_version = ci->i_xattrs.version; arg.xattr_buf = ci->i_xattrs.blob; } else { @@ -1409,6 +1410,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); + ret = send_cap_msg(&arg); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4c6494eb02b5..ccfcc66aaf44 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -465,6 +465,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; struct ceph_snap_context *old_snapc, *new_snapc; + struct ceph_buffer *old_blob = NULL; int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); @@ -541,7 +542,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->gid = inode->i_gid; if (dirty & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); + old_blob = __ceph_build_xattrs_blob(ci); capsnap->xattr_blob = ceph_buffer_get(ci->i_xattrs.blob); capsnap->xattr_version = ci->i_xattrs.version; @@ -584,6 +585,7 @@ update_snapc: } spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); kfree(capsnap); ceph_put_snap_context(old_snapc); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d2352fd95dbc..6b9f1ee7de85 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -926,7 +926,7 @@ extern int ceph_getattr(const struct path *path, struct kstat *stat, int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); -extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); +extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern const struct xattr_handler *ceph_xattr_handlers[]; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index c083557b3657..939eab7aa219 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -754,12 +754,15 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, /* * If there are dirty xattrs, reencode xattrs into the prealloc_blob - * and swap into place. + * and swap into place. It returns the old i_xattrs.blob (or NULL) so + * that it can be freed by the caller as the i_ceph_lock is likely to be + * held. */ -void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) +struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) { struct rb_node *p; struct ceph_inode_xattr *xattr = NULL; + struct ceph_buffer *old_blob = NULL; void *dest; dout("__build_xattrs_blob %p\n", &ci->vfs_inode); @@ -790,12 +793,14 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) dest - ci->i_xattrs.prealloc_blob->vec.iov_base; if (ci->i_xattrs.blob) - ceph_buffer_put(ci->i_xattrs.blob); + old_blob = ci->i_xattrs.blob; ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = NULL; ci->i_xattrs.dirty = false; ci->i_xattrs.version++; } + + return old_blob; } static inline int __get_request_mask(struct inode *in) { From af8a85a41734f37b67ba8ce69d56b685bee4ac48 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 19 Jul 2019 15:32:22 +0100 Subject: [PATCH 183/266] ceph: fix buffer free while holding i_ceph_lock in fill_inode() Calling ceph_buffer_put() in fill_inode() may result in freeing the i_xattrs.blob buffer while holding the i_ceph_lock. This can be fixed by postponing the call until later, when the lock is released. The following backtrace was triggered by fstests generic/070. BUG: sleeping function called from invalid context at mm/vmalloc.c:2283 in_atomic(): 1, irqs_disabled(): 0, pid: 3852, name: kworker/0:4 6 locks held by kworker/0:4/3852: #0: 000000004270f6bb ((wq_completion)ceph-msgr){+.+.}, at: process_one_work+0x1b8/0x5f0 #1: 00000000eb420803 ((work_completion)(&(&con->work)->work)){+.+.}, at: process_one_work+0x1b8/0x5f0 #2: 00000000be1c53a4 (&s->s_mutex){+.+.}, at: dispatch+0x288/0x1476 #3: 00000000559cb958 (&mdsc->snap_rwsem){++++}, at: dispatch+0x2eb/0x1476 #4: 000000000d5ebbae (&req->r_fill_mutex){+.+.}, at: dispatch+0x2fc/0x1476 #5: 00000000a83d0514 (&(&ci->i_ceph_lock)->rlock){+.+.}, at: fill_inode.isra.0+0xf8/0xf70 CPU: 0 PID: 3852 Comm: kworker/0:4 Not tainted 5.2.0+ #441 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014 Workqueue: ceph-msgr ceph_con_workfn Call Trace: dump_stack+0x67/0x90 ___might_sleep.cold+0x9f/0xb1 vfree+0x4b/0x60 ceph_buffer_release+0x1b/0x60 fill_inode.isra.0+0xa9b/0xf70 ceph_fill_trace+0x13b/0xc70 ? dispatch+0x2eb/0x1476 dispatch+0x320/0x1476 ? __mutex_unlock_slowpath+0x4d/0x2a0 ceph_con_workfn+0xc97/0x2ec0 ? process_one_work+0x1b8/0x5f0 process_one_work+0x244/0x5f0 worker_thread+0x4d/0x3e0 kthread+0x105/0x140 ? process_one_work+0x5f0/0x5f0 ? kthread_park+0x90/0x90 ret_from_fork+0x3a/0x50 Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 791f84a13bb8..18500edefc56 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -736,6 +736,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, int issued, new_issued, info_caps; struct timespec64 mtime, atime, ctime; struct ceph_buffer *xattr_blob = NULL; + struct ceph_buffer *old_blob = NULL; struct ceph_string *pool_ns = NULL; struct ceph_cap *new_cap = NULL; int err = 0; @@ -881,7 +882,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { if (ci->i_xattrs.blob) - ceph_buffer_put(ci->i_xattrs.blob); + old_blob = ci->i_xattrs.blob; ci->i_xattrs.blob = xattr_blob; if (xattr_blob) memcpy(ci->i_xattrs.blob->vec.iov_base, @@ -1022,8 +1023,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page, out: if (new_cap) ceph_put_cap(mdsc, new_cap); - if (xattr_blob) - ceph_buffer_put(xattr_blob); + ceph_buffer_put(old_blob); + ceph_buffer_put(xattr_blob); ceph_put_string(pool_ns); return err; } From c95f1c5f436badb9bb87e9b30fd573f6b3d59423 Mon Sep 17 00:00:00 2001 From: Erqi Chen Date: Wed, 24 Jul 2019 10:26:09 +0800 Subject: [PATCH 184/266] ceph: clear page dirty before invalidate page clear_page_dirty_for_io(page) before mapping->a_ops->invalidatepage(). invalidatepage() clears page's private flag, if dirty flag is not cleared, the page may cause BUG_ON failure in ceph_set_page_dirty(). Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/40862 Signed-off-by: Erqi Chen Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e078cc55b989..b3c8b886bf64 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -913,8 +913,9 @@ get_more_pages: if (page_offset(page) >= ceph_wbc.i_size) { dout("%p page eof %llu\n", page, ceph_wbc.i_size); - if (ceph_wbc.size_stable || - page_offset(page) >= i_size_read(inode)) + if ((ceph_wbc.size_stable || + page_offset(page) >= i_size_read(inode)) && + clear_page_dirty_for_io(page)) mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); From 28a282616f56990547b9dcd5c6fbd2001344664c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 15 Aug 2019 06:23:38 -0400 Subject: [PATCH 185/266] ceph: don't try fill file_lock on unsuccessful GETFILELOCK reply When ceph_mdsc_do_request returns an error, we can't assume that the filelock_reply pointer will be set. Only try to fetch fields out of the r_reply_info when it returns success. Cc: stable@vger.kernel.org Reported-by: Hector Martin Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/locks.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ac9b53b89365..5083e238ad15 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -111,8 +111,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, req->r_wait_for_completion = ceph_lock_wait_for_completion; err = ceph_mdsc_do_request(mdsc, inode, req); - - if (operation == CEPH_MDS_OP_GETFILELOCK) { + if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; From a561372405cf6bc6f14239b3a9e57bb39f2788b0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 20 Aug 2019 16:40:33 +0200 Subject: [PATCH 186/266] libceph: fix PG split vs OSD (re)connect race We can't rely on ->peer_features in calc_target() because it may be called both when the OSD session is established and open and when it's not. ->peer_features is not valid unless the OSD session is open. If this happens on a PG split (pg_num increase), that could mean we don't resend a request that should have been resent, hanging the client indefinitely. In userspace this was fixed by looking at require_osd_release and get_xinfo[osd].features fields of the osdmap. However these fields belong to the OSD section of the osdmap, which the kernel doesn't decode (only the client section is decoded). Instead, let's drop this feature check. It effectively checks for luminous, so only pre-luminous OSDs would be affected in that on a PG split the kernel might resend a request that should not have been resent. Duplicates can occur in other scenarios, so both sides should already be prepared for them: see dup/replay logic on the OSD side and retry_attempt check on the client side. Cc: stable@vger.kernel.org Fixes: 7de030d6b10a ("libceph: resend on PG splits if OSD has RESEND_ON_SPLIT") Link: https://tracker.ceph.com/issues/41162 Reported-by: Jerry Lee Signed-off-by: Ilya Dryomov Tested-by: Jerry Lee Reviewed-by: Jeff Layton --- net/ceph/osd_client.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0b2df09b2554..78ae6e8c953d 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1496,7 +1496,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osds up, acting; bool force_resend = false; bool unpaused = false; - bool legacy_change; + bool legacy_change = false; bool split = false; bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); bool recovery_deletes = ceph_osdmap_flag(osdc, @@ -1584,15 +1584,14 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->osd = acting.primary; } - if (unpaused || legacy_change || force_resend || - (split && con && CEPH_HAVE_FEATURE(con->peer_features, - RESEND_ON_SPLIT))) + if (unpaused || legacy_change || force_resend || split) ct_res = CALC_TARGET_NEED_RESEND; else ct_res = CALC_TARGET_NO_ACTION; out: - dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); + dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused, + legacy_change, force_resend, split, ct_res, t->osd); return ct_res; } From 2113c5f62b7423e4a72b890bd479704aa85c81ba Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 22 Aug 2019 13:03:05 +0200 Subject: [PATCH 187/266] KVM: arm/arm64: Only skip MMIO insn once If after an MMIO exit to userspace a VCPU is immediately run with an immediate_exit request, such as when a signal is delivered or an MMIO emulation completion is needed, then the VCPU completes the MMIO emulation and immediately returns to userspace. As the exit_reason does not get changed from KVM_EXIT_MMIO in these cases we have to be careful not to complete the MMIO emulation again, when the VCPU is eventually run again, because the emulation does an instruction skip (and doing too many skips would be a waste of guest code :-) We need to use additional VCPU state to track if the emulation is complete. As luck would have it, we already have 'mmio_needed', which even appears to be used in this way by other architectures already. Fixes: 0d640732dbeb ("arm64: KVM: Skip MMIO insn after emulation") Acked-by: Mark Rutland Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmio.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index a8a6a0c883f1..6af5c91337f2 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -86,6 +86,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) unsigned int len; int mask; + /* Detect an already handled MMIO return */ + if (unlikely(!vcpu->mmio_needed)) + return 0; + + vcpu->mmio_needed = 0; + if (!run->mmio.is_write) { len = run->mmio.len; if (len > sizeof(unsigned long)) @@ -188,6 +194,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, run->mmio.is_write = is_write; run->mmio.phys_addr = fault_ipa; run->mmio.len = len; + vcpu->mmio_needed = 1; if (!ret) { /* We handled the access successfully in the kernel. */ From a5fb8e6c02d6a518fb2b1a2b8c2471fa77b69436 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Aug 2019 13:28:43 +0100 Subject: [PATCH 188/266] afs: Fix leak in afs_lookup_cell_rcu() Fix a leak on the cell refcount in afs_lookup_cell_rcu() due to non-clearance of the default error in the case a NULL cell name is passed and the workstation default cell is used. Also put a bit at the end to make sure we don't leak a cell ref if we're going to be returning an error. This leak results in an assertion like the following when the kafs module is unloaded: AFS: Assertion failed 2 == 1 is false 0x2 == 0x1 is false ------------[ cut here ]------------ kernel BUG at fs/afs/cell.c:770! ... RIP: 0010:afs_manage_cells+0x220/0x42f [kafs] ... process_one_work+0x4c2/0x82c ? pool_mayday_timeout+0x1e1/0x1e1 ? do_raw_spin_lock+0x134/0x175 worker_thread+0x336/0x4a6 ? rescuer_thread+0x4af/0x4af kthread+0x1de/0x1ee ? kthread_park+0xd4/0xd4 ret_from_fork+0x24/0x30 Fixes: 989782dcdc91 ("afs: Overhaul cell database management") Signed-off-by: David Howells --- fs/afs/cell.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index a2a87117d262..fd5133e26a38 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -74,6 +74,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, cell = rcu_dereference_raw(net->ws_cell); if (cell) { afs_get_cell(cell); + ret = 0; break; } ret = -EDESTADDRREQ; @@ -108,6 +109,9 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, done_seqretry(&net->cells_lock, seq); + if (ret != 0 && cell) + afs_put_cell(net, cell); + return ret == 0 ? cell : ERR_PTR(ret); } From c4c613ff08d92e72bf64a65ec35a2c3aa1cfcd06 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 22 Aug 2019 13:28:43 +0100 Subject: [PATCH 189/266] afs: Fix possible oops in afs_lookup trace event The afs_lookup trace event can cause the following: [ 216.576777] BUG: kernel NULL pointer dereference, address: 000000000000023b [ 216.576803] #PF: supervisor read access in kernel mode [ 216.576813] #PF: error_code(0x0000) - not-present page ... [ 216.576913] RIP: 0010:trace_event_raw_event_afs_lookup+0x9e/0x1c0 [kafs] If the inode from afs_do_lookup() is an error other than ENOENT, or if it is ENOENT and afs_try_auto_mntpt() returns an error, the trace event will try to dereference the error pointer as a valid pointer. Use IS_ERR_OR_NULL to only pass a valid pointer for the trace, or NULL. Ideally the trace would include the error value, but for now just avoid the oops. Fixes: 80548b03991f ("afs: Add more tracepoints") Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 81207dc3c997..139b4e3cc946 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -959,7 +959,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, inode ? AFS_FS_I(inode) : NULL); } else { trace_afs_lookup(dvnode, &dentry->d_name, - inode ? AFS_FS_I(inode) : NULL); + IS_ERR_OR_NULL(inode) ? NULL + : AFS_FS_I(inode)); } return d; } From 7533be858f5b9a036b9f91556a3ed70786abca8e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 19 Aug 2019 16:05:31 +0100 Subject: [PATCH 190/266] afs: use correct afs_call_type in yfs_fs_store_opaque_acl2 It seems that 'yfs_RXYFSStoreOpaqueACL2' should be use in yfs_fs_store_opaque_acl2(). Fixes: f5e4546347bc ("afs: Implement YFS ACL setting") Signed-off-by: YueHaibing Signed-off-by: David Howells --- fs/afs/yfsclient.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 2575503170fc..ca2452806ebf 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -2171,7 +2171,7 @@ int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *fc, const struct afs_acl *acl key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); size = round_up(acl->size, 4); - call = afs_alloc_flat_call(net, &yfs_RXYFSStoreStatus, + call = afs_alloc_flat_call(net, &yfs_RXYFSStoreOpaqueACL2, sizeof(__be32) * 2 + sizeof(struct yfs_xdr_YFSFid) + sizeof(__be32) + size, From d37b1e534071ab1983e7c85273234b132c77591a Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Thu, 22 Aug 2019 03:02:50 -0700 Subject: [PATCH 191/266] RDMA/bnxt_re: Fix stack-out-of-bounds in bnxt_qplib_rcfw_send_message Driver copies FW commands to the HW queue as units of 16 bytes. Some of the command structures are not exact multiple of 16. So while copying the data from those structures, the stack out of bounds messages are reported by KASAN. The following error is reported. [ 1337.530155] ================================================================== [ 1337.530277] BUG: KASAN: stack-out-of-bounds in bnxt_qplib_rcfw_send_message+0x40a/0x850 [bnxt_re] [ 1337.530413] Read of size 16 at addr ffff888725477a48 by task rmmod/2785 [ 1337.530540] CPU: 5 PID: 2785 Comm: rmmod Tainted: G OE 5.2.0-rc6+ #75 [ 1337.530541] Hardware name: Dell Inc. PowerEdge R730/0599V5, BIOS 1.0.4 08/28/2014 [ 1337.530542] Call Trace: [ 1337.530548] dump_stack+0x5b/0x90 [ 1337.530556] ? bnxt_qplib_rcfw_send_message+0x40a/0x850 [bnxt_re] [ 1337.530560] print_address_description+0x65/0x22e [ 1337.530568] ? bnxt_qplib_rcfw_send_message+0x40a/0x850 [bnxt_re] [ 1337.530575] ? bnxt_qplib_rcfw_send_message+0x40a/0x850 [bnxt_re] [ 1337.530577] __kasan_report.cold.3+0x37/0x77 [ 1337.530581] ? _raw_write_trylock+0x10/0xe0 [ 1337.530588] ? bnxt_qplib_rcfw_send_message+0x40a/0x850 [bnxt_re] [ 1337.530590] kasan_report+0xe/0x20 [ 1337.530592] memcpy+0x1f/0x50 [ 1337.530600] bnxt_qplib_rcfw_send_message+0x40a/0x850 [bnxt_re] [ 1337.530608] ? bnxt_qplib_creq_irq+0xa0/0xa0 [bnxt_re] [ 1337.530611] ? xas_create+0x3aa/0x5f0 [ 1337.530613] ? xas_start+0x77/0x110 [ 1337.530615] ? xas_clear_mark+0x34/0xd0 [ 1337.530623] bnxt_qplib_free_mrw+0x104/0x1a0 [bnxt_re] [ 1337.530631] ? bnxt_qplib_destroy_ah+0x110/0x110 [bnxt_re] [ 1337.530633] ? bit_wait_io_timeout+0xc0/0xc0 [ 1337.530641] bnxt_re_dealloc_mw+0x2c/0x60 [bnxt_re] [ 1337.530648] bnxt_re_destroy_fence_mr+0x77/0x1d0 [bnxt_re] [ 1337.530655] bnxt_re_dealloc_pd+0x25/0x60 [bnxt_re] [ 1337.530677] ib_dealloc_pd_user+0xbe/0xe0 [ib_core] [ 1337.530683] srpt_remove_one+0x5de/0x690 [ib_srpt] [ 1337.530689] ? __srpt_close_all_ch+0xc0/0xc0 [ib_srpt] [ 1337.530692] ? xa_load+0x87/0xe0 ... [ 1337.530840] do_syscall_64+0x6d/0x1f0 [ 1337.530843] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1337.530845] RIP: 0033:0x7ff5b389035b [ 1337.530848] Code: 73 01 c3 48 8b 0d 2d 0b 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fd 0a 2c 00 f7 d8 64 89 01 48 [ 1337.530849] RSP: 002b:00007fff83425c28 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [ 1337.530852] RAX: ffffffffffffffda RBX: 00005596443e6750 RCX: 00007ff5b389035b [ 1337.530853] RDX: 000000000000000a RSI: 0000000000000800 RDI: 00005596443e67b8 [ 1337.530854] RBP: 0000000000000000 R08: 00007fff83424ba1 R09: 0000000000000000 [ 1337.530856] R10: 00007ff5b3902960 R11: 0000000000000206 R12: 00007fff83425e50 [ 1337.530857] R13: 00007fff8342673c R14: 00005596443e6260 R15: 00005596443e6750 [ 1337.530885] The buggy address belongs to the page: [ 1337.530962] page:ffffea001c951dc0 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 1337.530964] flags: 0x57ffffc0000000() [ 1337.530967] raw: 0057ffffc0000000 0000000000000000 ffffffff1c950101 0000000000000000 [ 1337.530970] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 1337.530970] page dumped because: kasan: bad access detected [ 1337.530996] Memory state around the buggy address: [ 1337.531072] ffff888725477900: 00 00 00 00 f1 f1 f1 f1 00 00 00 00 00 f2 f2 f2 [ 1337.531180] ffff888725477980: 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 [ 1337.531288] >ffff888725477a00: 00 f2 f2 f2 f2 f2 f2 00 00 00 f2 00 00 00 00 00 [ 1337.531393] ^ [ 1337.531478] ffff888725477a80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 1337.531585] ffff888725477b00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 1337.531691] ================================================================== Fix this by passing the exact size of each FW command to bnxt_qplib_rcfw_send_message as req->cmd_size. Before sending the command to HW, modify the req->cmd_size to number of 16 byte units. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1566468170-489-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 8 +++++++- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 11 ++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 48b04d2f175f..60c8f76aab33 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -136,6 +136,13 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, spin_unlock_irqrestore(&cmdq->lock, flags); return -EBUSY; } + + size = req->cmd_size; + /* change the cmd_size to the number of 16byte cmdq unit. + * req->cmd_size is modified here + */ + bnxt_qplib_set_cmd_slots(req); + memset(resp, 0, sizeof(*resp)); crsqe->resp = (struct creq_qp_event *)resp; crsqe->resp->cookie = req->cookie; @@ -150,7 +157,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, cmdq_ptr = (struct bnxt_qplib_cmdqe **)cmdq->pbl_ptr; preq = (u8 *)req; - size = req->cmd_size * BNXT_QPLIB_CMDQE_UNITS; do { /* Locate the next cmdq slot */ sw_prod = HWQ_CMP(cmdq->prod, cmdq); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 2138533bb642..dfeadc192e17 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -55,9 +55,7 @@ do { \ memset(&(req), 0, sizeof((req))); \ (req).opcode = CMDQ_BASE_OPCODE_##CMD; \ - (req).cmd_size = (sizeof((req)) + \ - BNXT_QPLIB_CMDQE_UNITS - 1) / \ - BNXT_QPLIB_CMDQE_UNITS; \ + (req).cmd_size = sizeof((req)); \ (req).flags = cpu_to_le16(cmd_flags); \ } while (0) @@ -95,6 +93,13 @@ static inline u32 bnxt_qplib_cmdqe_cnt_per_pg(u32 depth) BNXT_QPLIB_CMDQE_UNITS); } +/* Set the cmd_size to a factor of CMDQE unit */ +static inline void bnxt_qplib_set_cmd_slots(struct cmdq_base *req) +{ + req->cmd_size = (req->cmd_size + BNXT_QPLIB_CMDQE_UNITS - 1) / + BNXT_QPLIB_CMDQE_UNITS; +} + #define MAX_CMDQ_IDX(depth) ((depth) - 1) static inline u32 bnxt_qplib_max_cmdq_idx_per_pg(u32 depth) From fab4f97e1fe33cf08e58c09cf9eee334857d9fe7 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 22 Aug 2019 17:07:41 +0200 Subject: [PATCH 192/266] RDMA/siw: Fix SGL mapping issues All user level and most in-kernel applications submit WQEs where the SG list entries are all of a single type. iSER in particular, however, will send us WQEs with mixed SG types: sge[0] = kernel buffer, sge[1] = PBL region. Check and set is_kva on each SG entry individually instead of assuming the first SGE type carries through to the last. This fixes iSER over siw. Fixes: b9be6f18cf9e ("rdma/siw: transmit path") Reported-by: Krishnamraju Eraparaju Tested-by: Krishnamraju Eraparaju Signed-off-by: Bernard Metzler Link: https://lore.kernel.org/r/20190822150741.21871-1-bmt@zurich.ibm.com Signed-off-by: Doug Ledford --- drivers/infiniband/sw/siw/siw_qp_tx.c | 37 +++++++++++---------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 43020d2040fc..42c63622c7bd 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -398,15 +398,13 @@ static int siw_0copy_tx(struct socket *s, struct page **page, #define MAX_TRAILER (MPA_CRC_SIZE + 4) -static void siw_unmap_pages(struct page **pages, int hdr_len, int num_maps) +static void siw_unmap_pages(struct page **pp, unsigned long kmap_mask) { - if (hdr_len) { - ++pages; - --num_maps; - } - while (num_maps-- > 0) { - kunmap(*pages); - pages++; + while (kmap_mask) { + if (kmap_mask & BIT(0)) + kunmap(*pp); + pp++; + kmap_mask >>= 1; } } @@ -437,6 +435,7 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0, sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx, pbl_idx = c_tx->pbl_idx; + unsigned long kmap_mask = 0L; if (c_tx->state == SIW_SEND_HDR) { if (c_tx->use_sendpage) { @@ -463,8 +462,7 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) if (!(tx_flags(wqe) & SIW_WQE_INLINE)) { mem = wqe->mem[sge_idx]; - if (!mem->mem_obj) - is_kva = 1; + is_kva = mem->mem_obj == NULL ? 1 : 0; } else { is_kva = 1; } @@ -500,12 +498,7 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) p = siw_get_upage(mem->umem, sge->laddr + sge_off); if (unlikely(!p)) { - if (hdr_len) - seg--; - if (!c_tx->use_sendpage && seg) { - siw_unmap_pages(page_array, - hdr_len, seg); - } + siw_unmap_pages(page_array, kmap_mask); wqe->processed -= c_tx->bytes_unsent; rv = -EFAULT; goto done_crc; @@ -515,6 +508,10 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) if (!c_tx->use_sendpage) { iov[seg].iov_base = kmap(p) + fp_off; iov[seg].iov_len = plen; + + /* Remember for later kunmap() */ + kmap_mask |= BIT(seg); + if (do_crc) crypto_shash_update( c_tx->mpa_crc_hd, @@ -543,10 +540,7 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) if (++seg > (int)MAX_ARRAY) { siw_dbg_qp(tx_qp(c_tx), "to many fragments\n"); - if (!is_kva && !c_tx->use_sendpage) { - siw_unmap_pages(page_array, hdr_len, - seg - 1); - } + siw_unmap_pages(page_array, kmap_mask); wqe->processed -= c_tx->bytes_unsent; rv = -EMSGSIZE; goto done_crc; @@ -597,8 +591,7 @@ sge_done: } else { rv = kernel_sendmsg(s, &msg, iov, seg + 1, hdr_len + data_len + trl_len); - if (!is_kva) - siw_unmap_pages(page_array, hdr_len, seg); + siw_unmap_pages(page_array, kmap_mask); } if (rv < (int)hdr_len) { /* Not even complete hdr pushed or negative rv */ From 7542c6dedbc1caa284ca4cbd6b64f99023ff1b97 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 18 Jun 2019 12:09:26 +0900 Subject: [PATCH 193/266] jffs2: Remove C++ style comments from uapi header Linux kernel tolerates C++ style comments these days. Actually, the SPDX License tags for .c files start with //. On the other hand, uapi headers are written in more strict C, where the C++ comment style is forbidden. I simply dropped these lines instead of fixing the comment style. This code has been always commented out since it was added around Linux 2.4.9 (i.e. commented out for more than 17 years). 'Maybe later...' will never happen. Signed-off-by: Masahiro Yamada Acked-by: Richard Weinberger Signed-off-by: Richard Weinberger --- include/uapi/linux/jffs2.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/uapi/linux/jffs2.h b/include/uapi/linux/jffs2.h index a18b719f49d4..784ba0b9690a 100644 --- a/include/uapi/linux/jffs2.h +++ b/include/uapi/linux/jffs2.h @@ -77,11 +77,6 @@ #define JFFS2_ACL_VERSION 0x0001 -// Maybe later... -//#define JFFS2_NODETYPE_CHECKPOINT (JFFS2_FEATURE_RWCOMPAT_DELETE | JFFS2_NODE_ACCURATE | 3) -//#define JFFS2_NODETYPE_OPTIONS (JFFS2_FEATURE_RWCOMPAT_COPY | JFFS2_NODE_ACCURATE | 4) - - #define JFFS2_INO_FLAG_PREREAD 1 /* Do read_inode() for this one at mount time, don't wait for it to happen later */ From 4dd75b335bc1f10fb1a01b5cd58870d47c13c4e7 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 13 Aug 2019 23:50:51 +0200 Subject: [PATCH 194/266] ubifs: Fix double unlock around orphan_delete() We unlock after orphan_delete(), so no need to unlock in the function too. Reported-by: Han Xu Fixes: 8009ce956c3d ("ubifs: Don't leak orphans on memory during commit") Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index b52624e28fa1..3b4b4114f208 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -129,7 +129,6 @@ static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o) static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) { if (orph->del) { - spin_unlock(&c->orphan_lock); dbg_gen("deleted twice ino %lu", orph->inum); return; } @@ -138,7 +137,6 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) orph->del = 1; orph->dnext = c->orph_dnext; c->orph_dnext = orph; - spin_unlock(&c->orphan_lock); dbg_gen("delete later ino %lu", orph->inum); return; } From 377e208f44784174f3002d9892d553715a3ab71b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 13 Aug 2019 23:55:48 +0200 Subject: [PATCH 195/266] ubifs: Correctly initialize c->min_log_bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently on a freshly mounted UBIFS, c->min_log_bytes is 0. This can lead to a log overrun and make commits fail. Recent kernels will report the following assert: UBIFS assert failed: c->lhead_lnum != c->ltail_lnum, in fs/ubifs/log.c:412 c->min_log_bytes can have two states, 0 and c->leb_size. It controls how much bytes of the log area are reserved for non-bud nodes such as commit nodes. After a commit it has to be set to c->leb_size such that we have always enough space for a commit. While a commit runs it can be 0 to make the remaining bytes of the log available to writers. Having it set to 0 right after mount is wrong since no space for commits is reserved. Fixes: 1e51764a3c2ac ("UBIFS: add new flash file system") Reported-and-tested-by: Uwe Kleine-König Signed-off-by: Richard Weinberger --- fs/ubifs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 2c0803b0ac3a..8c1d571334bc 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -609,6 +609,10 @@ static int init_constants_early(struct ubifs_info *c) c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; if (c->max_bu_buf_len > c->leb_size) c->max_bu_buf_len = c->leb_size; + + /* Log is ready, preserve one LEB for commits. */ + c->min_log_bytes = c->leb_size; + return 0; } From 0af83abbd4a6e36a4b209d8c57c26143e40eeec1 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Tue, 6 Aug 2019 22:21:40 +0800 Subject: [PATCH 196/266] ubifs: Limit the number of pages in shrink_liability If the number of dirty pages to be written back is large, then writeback_inodes_sb will block waiting for a long time, causing hung task detection alarm. Therefore, we should limit the maximum number of pages written back this time, which let the budget be completed faster. The remaining dirty pages tend to rely on the writeback mechanism to complete the synchronization. Fixes: b6e51316daed ("writeback: separate starting of sync vs opportunistic writeback") Signed-off-by: Liu Song Signed-off-by: Richard Weinberger --- fs/ubifs/budget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 80d7301ab76d..c0b84e960b20 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -51,7 +51,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { down_read(&c->vfs_sb->s_umount); - writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); + writeback_inodes_sb_nr(c->vfs_sb, nr_to_write, WB_REASON_FS_FREE_SPACE); up_read(&c->vfs_sb->s_umount); } From e4f9d6013820d1eba1432d51dd1c5795759aa77f Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Sat, 17 Aug 2019 13:32:40 +0800 Subject: [PATCH 197/266] dm btree: fix order of block initialization in btree_split_beneath When btree_split_beneath() splits a node to two new children, it will allocate two blocks: left and right. If right block's allocation failed, the left block will be unlocked and marked dirty. If this happened, the left block'ss content is zero, because it wasn't initialized with the btree struct before the attempot to allocate the right block. Upon return, when flushing the left block to disk, the validator will fail when check this block. Then a BUG_ON is raised. Fix this by completely initializing the left block before allocating and initializing the right block. Fixes: 4dcb8b57df359 ("dm btree: fix leak of bufio-backed block in btree_split_beneath error path") Cc: stable@vger.kernel.org Signed-off-by: ZhangXiaoxu Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-btree.c | 31 ++++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 58b319757b1e..8aae0624a297 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -628,39 +628,40 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) new_parent = shadow_current(s); + pn = dm_block_data(new_parent); + size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? + sizeof(__le64) : s->info->value_type.size; + + /* create & init the left block */ r = new_block(s->info, &left); if (r < 0) return r; + ln = dm_block_data(left); + nr_left = le32_to_cpu(pn->header.nr_entries) / 2; + + ln->header.flags = pn->header.flags; + ln->header.nr_entries = cpu_to_le32(nr_left); + ln->header.max_entries = pn->header.max_entries; + ln->header.value_size = pn->header.value_size; + memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); + memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); + + /* create & init the right block */ r = new_block(s->info, &right); if (r < 0) { unlock_block(s->info, left); return r; } - pn = dm_block_data(new_parent); - ln = dm_block_data(left); rn = dm_block_data(right); - - nr_left = le32_to_cpu(pn->header.nr_entries) / 2; nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left; - ln->header.flags = pn->header.flags; - ln->header.nr_entries = cpu_to_le32(nr_left); - ln->header.max_entries = pn->header.max_entries; - ln->header.value_size = pn->header.value_size; - rn->header.flags = pn->header.flags; rn->header.nr_entries = cpu_to_le32(nr_right); rn->header.max_entries = pn->header.max_entries; rn->header.value_size = pn->header.value_size; - - memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0])); - - size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? - sizeof(__le64) : s->info->value_type.size; - memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), nr_right * size); From ae148243d3f0816b37477106c05a2ec7d5f32614 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Mon, 19 Aug 2019 11:31:21 +0800 Subject: [PATCH 198/266] dm space map metadata: fix missing store of apply_bops() return value In commit 6096d91af0b6 ("dm space map metadata: fix occasional leak of a metadata block on resize"), we refactor the commit logic to a new function 'apply_bops'. But when that logic was replaced in out() the return value was not stored. This may lead out() returning a wrong value to the caller. Fixes: 6096d91af0b6 ("dm space map metadata: fix occasional leak of a metadata block on resize") Cc: stable@vger.kernel.org Signed-off-by: ZhangXiaoxu Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-space-map-metadata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index aec449243966..25328582cc48 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -249,7 +249,7 @@ static int out(struct sm_metadata *smm) } if (smm->recursion_count == 1) - apply_bops(smm); + r = apply_bops(smm); smm->recursion_count--; From 8465df4025dd4ab84fc24dad6a91cc2b9ec1604d Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 14 Jul 2019 14:06:40 +0300 Subject: [PATCH 199/266] net/mlx5: Fix crdump chunks print Crdump repeats itself every chunk of 256bytes. That is due to bug of missing progressing offset while copying the data from buffer to devlink_fmsg. Fixes: 9b1f29823605 ("net/mlx5: Add support for FW fatal reporter dump") Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 9314777d99e3..cc5887f52679 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -590,7 +590,8 @@ mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter, data_size = crdump_size - offset; else data_size = MLX5_CR_DUMP_CHUNK_SIZE; - err = devlink_fmsg_binary_put(fmsg, cr_data, data_size); + err = devlink_fmsg_binary_put(fmsg, (char *)cr_data + offset, + data_size); if (err) goto free_data; } From a6633e11e8732b9c000774746a2c1827a7e3c316 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 13 Aug 2019 12:49:13 +0300 Subject: [PATCH 200/266] net/mlx5: Fix delay in fw fatal report handling due to fw report When fw fatal error occurs, poll health() first detects and reports on a fw error. Afterwards, it detects and reports on the fw fatal error itself. That can cause a long delay in fw fatal error handling which waits in a queue for the fw error handling to be finished. The fw error handle will try asking for fw core dump command while fw in fatal state may not respond and driver will wait for command timeout. Changing the flow to detect and handle first fw fatal errors and only if no fatal error detected look for a fw error to handle. Fixes: d1bf0e2cc4a6 ("net/mlx5: Report devlink health on FW issues") Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/health.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index cc5887f52679..d685122d9ff7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -701,6 +701,16 @@ static void poll_health(struct timer_list *t) if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) goto out; + fatal_error = check_fatal_sensors(dev); + + if (fatal_error && !health->fatal_error) { + mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); + dev->priv.health.fatal_error = fatal_error; + print_health_info(dev); + mlx5_trigger_health_work(dev); + goto out; + } + count = ioread32be(health->health_counter); if (count == health->prev) ++health->miss_counter; @@ -719,15 +729,6 @@ static void poll_health(struct timer_list *t) if (health->synd && health->synd != prev_synd) queue_work(health->wq, &health->report_work); - fatal_error = check_fatal_sensors(dev); - - if (fatal_error && !health->fatal_error) { - mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); - dev->priv.health.fatal_error = fatal_error; - print_health_info(dev); - mlx5_trigger_health_work(dev); - } - out: mod_timer(&health->timer, get_next_poll_jiffies()); } From 5c6f40c61777e059ac3692c4505dff5eb880a12d Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 22 Aug 2019 15:03:27 +0300 Subject: [PATCH 201/266] net/mlx5e: Add num bytes metadata to WQE info For TLS WQEs, metadata info did not include num_bytes. Due to this issue, tx_tls_dump_bytes counter did not increment. Modify tx_fill_wi() to fill num bytes. When it is called for non-traffic WQE, zero is expected. Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support") Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 8b93101e1a09..0681735ea398 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -109,13 +109,15 @@ build_progress_params(struct mlx5e_tx_wqe *wqe, u16 pc, u32 sqn, static void tx_fill_wi(struct mlx5e_txqsq *sq, u16 pi, u8 num_wqebbs, - skb_frag_t *resync_dump_frag) + skb_frag_t *resync_dump_frag, + u32 num_bytes) { struct mlx5e_tx_wqe_info *wi = &sq->db.wqe_info[pi]; wi->skb = NULL; wi->num_wqebbs = num_wqebbs; wi->resync_dump_frag = resync_dump_frag; + wi->num_bytes = num_bytes; } void mlx5e_ktls_tx_offload_set_pending(struct mlx5e_ktls_offload_context_tx *priv_tx) @@ -143,7 +145,7 @@ post_static_params(struct mlx5e_txqsq *sq, umr_wqe = mlx5e_sq_fetch_wqe(sq, MLX5E_KTLS_STATIC_UMR_WQE_SZ, &pi); build_static_params(umr_wqe, sq->pc, sq->sqn, priv_tx, fence); - tx_fill_wi(sq, pi, MLX5E_KTLS_STATIC_WQEBBS, NULL); + tx_fill_wi(sq, pi, MLX5E_KTLS_STATIC_WQEBBS, NULL, 0); sq->pc += MLX5E_KTLS_STATIC_WQEBBS; } @@ -157,7 +159,7 @@ post_progress_params(struct mlx5e_txqsq *sq, wqe = mlx5e_sq_fetch_wqe(sq, MLX5E_KTLS_PROGRESS_WQE_SZ, &pi); build_progress_params(wqe, sq->pc, sq->sqn, priv_tx, fence); - tx_fill_wi(sq, pi, MLX5E_KTLS_PROGRESS_WQEBBS, NULL); + tx_fill_wi(sq, pi, MLX5E_KTLS_PROGRESS_WQEBBS, NULL, 0); sq->pc += MLX5E_KTLS_PROGRESS_WQEBBS; } @@ -296,7 +298,7 @@ tx_post_resync_dump(struct mlx5e_txqsq *sq, struct sk_buff *skb, dseg->byte_count = cpu_to_be32(fsz); mlx5e_dma_push(sq, dma_addr, fsz, MLX5E_DMA_MAP_PAGE); - tx_fill_wi(sq, pi, num_wqebbs, frag); + tx_fill_wi(sq, pi, num_wqebbs, frag, fsz); sq->pc += num_wqebbs; WARN(num_wqebbs > MLX5E_KTLS_MAX_DUMP_WQEBBS, @@ -323,7 +325,7 @@ static void tx_post_fence_nop(struct mlx5e_txqsq *sq) struct mlx5_wq_cyc *wq = &sq->wq; u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - tx_fill_wi(sq, pi, 1, NULL); + tx_fill_wi(sq, pi, 1, NULL, 0); mlx5e_post_nop_fence(wq, sq->sqn, &sq->pc); } From a195784c105b2907b45fd62307d9ce821da9dc20 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 21 Aug 2019 15:47:29 +0300 Subject: [PATCH 202/266] net/mlx5e: Remove ethernet segment from dump WQE Dump WQE shall not include Ethernet segment. Define mlx5e_dump_wqe to be used for "Dump WQEs" instead of sharing it with the general mlx5e_tx_wqe layout. Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support") Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_accel/ktls_tx.c | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 0681735ea398..7833ddef0427 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -250,43 +250,37 @@ tx_post_resync_params(struct mlx5e_txqsq *sq, mlx5e_ktls_tx_post_param_wqes(sq, priv_tx, skip_static_post, true); } +struct mlx5e_dump_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_data_seg data; +}; + static int tx_post_resync_dump(struct mlx5e_txqsq *sq, struct sk_buff *skb, skb_frag_t *frag, u32 tisn, bool first) { struct mlx5_wqe_ctrl_seg *cseg; - struct mlx5_wqe_eth_seg *eseg; struct mlx5_wqe_data_seg *dseg; - struct mlx5e_tx_wqe *wqe; + struct mlx5e_dump_wqe *wqe; dma_addr_t dma_addr = 0; - u16 ds_cnt, ds_cnt_inl; u8 num_wqebbs; - u16 pi, ihs; + u16 ds_cnt; int fsz; - - ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; - ihs = eth_get_headlen(skb->dev, skb->data, skb_headlen(skb)); - ds_cnt_inl = DIV_ROUND_UP(ihs - INL_HDR_START_SZ, MLX5_SEND_WQE_DS); - ds_cnt += ds_cnt_inl; - ds_cnt += 1; /* one frag */ + u16 pi; wqe = mlx5e_sq_fetch_wqe(sq, sizeof(*wqe), &pi); + ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); cseg = &wqe->ctrl; - eseg = &wqe->eth; - dseg = wqe->data; + dseg = &wqe->data; cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_DUMP); cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); cseg->tisn = cpu_to_be32(tisn << 8); cseg->fm_ce_se = first ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; - eseg->inline_hdr.sz = cpu_to_be16(ihs); - memcpy(eseg->inline_hdr.start, skb->data, ihs); - dseg += ds_cnt_inl; - fsz = skb_frag_size(frag); dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz, DMA_TO_DEVICE); From 08f5439f1df25a6cf6cf4c72cf6c13025599ce67 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Aug 2019 22:19:11 -0600 Subject: [PATCH 203/266] io_uring: add need_resched() check in inner poll loop The outer poll loop checks for whether we need to reschedule, and returns to userspace if we do. However, it's possible to get stuck in the inner loop as well, if the CPU we are running on needs to reschedule to finish the IO work. Add the need_resched() check in the inner loop as well. This fixes a potential hang if the kernel is configured with CONFIG_PREEMPT_VOLUNTARY=y. Reported-by: Sagi Grimberg Reviewed-by: Sagi Grimberg Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e7a43a354d91..cfb48bd088e1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -778,7 +778,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { - while (!list_empty(&ctx->poll_list)) { + while (!list_empty(&ctx->poll_list) && !need_resched()) { int ret; ret = io_do_iopoll(ctx, nr_events, min); @@ -805,6 +805,12 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) unsigned int nr_events = 0; io_iopoll_getevents(ctx, &nr_events, 1); + + /* + * Ensure we allow local-to-the-cpu processing to take place, + * in this case we need to ensure that we reap all events. + */ + cond_resched(); } mutex_unlock(&ctx->uring_lock); } From e0917f879536cbf57367429d084775d8224c986c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 22 Jul 2019 09:12:56 +0200 Subject: [PATCH 204/266] um: fix time travel mode Unfortunately, my build fix for when time travel mode isn't enabled broke time travel mode, because I forgot that we need to use the timer time after the timer has been marked disabled, and thus need to leave the time stored instead of zeroing it. Fix that by splitting the inline into two, so we can call only the _mode() one in the relevant code path. Fixes: b482e48d29f1 ("um: fix build without CONFIG_UML_TIME_TRAVEL_SUPPORT") Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/include/shared/timer-internal.h | 14 ++++++++++---- arch/um/kernel/process.c | 2 +- arch/um/kernel/time.c | 16 +++++++++------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/arch/um/include/shared/timer-internal.h b/arch/um/include/shared/timer-internal.h index 8574338bf23b..9991ec2371e4 100644 --- a/arch/um/include/shared/timer-internal.h +++ b/arch/um/include/shared/timer-internal.h @@ -34,10 +34,13 @@ static inline void time_travel_set_time(unsigned long long ns) time_travel_time = ns; } -static inline void time_travel_set_timer(enum time_travel_timer_mode mode, - unsigned long long expiry) +static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode) { time_travel_timer_mode = mode; +} + +static inline void time_travel_set_timer_expiry(unsigned long long expiry) +{ time_travel_timer_expiry = expiry; } #else @@ -50,8 +53,11 @@ static inline void time_travel_set_time(unsigned long long ns) { } -static inline void time_travel_set_timer(enum time_travel_timer_mode mode, - unsigned long long expiry) +static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode) +{ +} + +static inline void time_travel_set_timer_expiry(unsigned long long expiry) { } diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 67c0d1a860e9..6bede7888fc2 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -213,7 +213,7 @@ static void time_travel_sleep(unsigned long long duration) if (time_travel_timer_mode != TT_TMR_DISABLED || time_travel_timer_expiry < next) { if (time_travel_timer_mode == TT_TMR_ONESHOT) - time_travel_set_timer(TT_TMR_DISABLED, 0); + time_travel_set_timer_mode(TT_TMR_DISABLED); /* * time_travel_time will be adjusted in the timer * IRQ handler so it works even when the signal diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 6a051b078359..234757233355 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -50,7 +50,7 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) static int itimer_shutdown(struct clock_event_device *evt) { if (time_travel_mode != TT_MODE_OFF) - time_travel_set_timer(TT_TMR_DISABLED, 0); + time_travel_set_timer_mode(TT_TMR_DISABLED); if (time_travel_mode != TT_MODE_INFCPU) os_timer_disable(); @@ -62,9 +62,10 @@ static int itimer_set_periodic(struct clock_event_device *evt) { unsigned long long interval = NSEC_PER_SEC / HZ; - if (time_travel_mode != TT_MODE_OFF) - time_travel_set_timer(TT_TMR_PERIODIC, - time_travel_time + interval); + if (time_travel_mode != TT_MODE_OFF) { + time_travel_set_timer_mode(TT_TMR_PERIODIC); + time_travel_set_timer_expiry(time_travel_time + interval); + } if (time_travel_mode != TT_MODE_INFCPU) os_timer_set_interval(interval); @@ -77,9 +78,10 @@ static int itimer_next_event(unsigned long delta, { delta += 1; - if (time_travel_mode != TT_MODE_OFF) - time_travel_set_timer(TT_TMR_ONESHOT, - time_travel_time + delta); + if (time_travel_mode != TT_MODE_OFF) { + time_travel_set_timer_mode(TT_TMR_ONESHOT); + time_travel_set_timer_expiry(time_travel_time + delta); + } if (time_travel_mode != TT_MODE_INFCPU) return os_timer_one_shot(delta); From a71d9eff9394d24f05cbe115309152fb4543cd6c Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 21 Aug 2019 09:59:12 +0800 Subject: [PATCH 205/266] ocelot_ace: fix action of trap The trap action should be copying the frame to CPU and dropping it for forwarding, but current setting was just copying frame to CPU. Fixes: b596229448dd ("net: mscc: ocelot: Add support for tcam") Signed-off-by: Yangbo Lu Acked-by: Allan W. Nielsen Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_ace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot_ace.c b/drivers/net/ethernet/mscc/ocelot_ace.c index 39aca1ab4687..86fc6e6b46dd 100644 --- a/drivers/net/ethernet/mscc/ocelot_ace.c +++ b/drivers/net/ethernet/mscc/ocelot_ace.c @@ -317,7 +317,7 @@ static void is2_action_set(struct vcap_data *data, break; case OCELOT_ACL_ACTION_TRAP: VCAP_ACT_SET(PORT_MASK, 0x0); - VCAP_ACT_SET(MASK_MODE, 0x0); + VCAP_ACT_SET(MASK_MODE, 0x1); VCAP_ACT_SET(POLICE_ENA, 0x0); VCAP_ACT_SET(POLICE_IDX, 0x0); VCAP_ACT_SET(CPU_QU_NUM, 0x0); From de0e4fd2f07ce3bbdb69dfb8d9426b7227451b69 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Tue, 20 Aug 2019 23:46:36 -0500 Subject: [PATCH 206/266] qed: Add cleanup in qed_slowpath_start() If qed_mcp_send_drv_version() fails, no cleanup is executed, leading to memory leaks. To fix this issue, introduce the label 'err4' to perform the cleanup work before returning the error. Signed-off-by: Wenwen Wang Acked-by: Sudarsana Reddy Kalluru Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 829dd60ab937..1efff7f68ef6 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1325,7 +1325,7 @@ static int qed_slowpath_start(struct qed_dev *cdev, &drv_version); if (rc) { DP_NOTICE(cdev, "Failed sending drv version command\n"); - return rc; + goto err4; } } @@ -1333,6 +1333,8 @@ static int qed_slowpath_start(struct qed_dev *cdev, return 0; +err4: + qed_ll2_dealloc_if(cdev); err3: qed_hw_stop(cdev); err2: From b99328a60a482108f5195b4d611f90992ca016ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Aug 2019 13:00:15 +0200 Subject: [PATCH 207/266] timekeeping/vsyscall: Prevent math overflow in BOOTTIME update The VDSO update for CLOCK_BOOTTIME has a overflow issue as it shifts the nanoseconds based boot time offset left by the clocksource shift. That overflows once the boot time offset becomes large enough. As a consequence CLOCK_BOOTTIME in the VDSO becomes a random number causing applications to misbehave. Fix it by storing a timespec64 representation of the offset when boot time is adjusted and add that to the MONOTONIC base time value in the vdso data page. Using the timespec64 representation avoids a 64bit division in the update code. Fixes: 44f57d788e7d ("timekeeping: Provide a generic update_vsyscall() implementation") Reported-by: Chris Clayton Signed-off-by: Thomas Gleixner Tested-by: Chris Clayton Tested-by: Vincenzo Frascino Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1908221257580.1983@nanos.tec.linutronix.de --- include/linux/timekeeper_internal.h | 5 +++++ kernel/time/timekeeping.c | 5 +++++ kernel/time/vsyscall.c | 22 +++++++++++++--------- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 7acb953298a7..84ff2844df2a 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -57,6 +57,7 @@ struct tk_read_base { * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds + * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP * interval. @@ -84,6 +85,9 @@ struct tk_read_base { * * wall_to_monotonic is no longer the boot time, getboottime must be * used instead. + * + * @monotonic_to_boottime is a timespec64 representation of @offs_boot to + * accelerate the VDSO update for CLOCK_BOOTTIME. */ struct timekeeper { struct tk_read_base tkr_mono; @@ -99,6 +103,7 @@ struct timekeeper { u8 cs_was_changed_seq; ktime_t next_leap_ktime; u64 raw_sec; + struct timespec64 monotonic_to_boot; /* The following members are for timekeeping internal use */ u64 cycle_interval; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d911c8470149..ca69290bee2a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* + * Timespec representation for VDSO update to avoid 64bit division + * on every update. + */ + tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 8cf3596a4ce6..4bc37ac3bb05 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; - u64 nsec; + u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; @@ -45,23 +45,27 @@ static inline void update_vdso_data(struct vdso_data *vdata, } vdso_ts->nsec = nsec; - /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; - vdso_ts->sec = tk->raw_sec; - vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* Copy MONOTONIC time for BOOTTIME */ + sec = vdso_ts->sec; + /* Add the boot offset */ + sec += tk->monotonic_to_boot.tv_sec; + nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; - vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec; - nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + - ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); + vdso_ts->sec = sec; + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; From 75710f08ea7e41b2f7010da3f6deab061f7a853b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 21 Aug 2019 22:25:27 -0500 Subject: [PATCH 208/266] drm/amdgpu/powerplay: silence a warning in smu_v11_0_setup_pptable I think gcc is confused as I don't see how size could be used unitialized, but go ahead and silence the warning. Signed-off-by: Alex Deucher Reviewed-by: Evan Quan Signed-off-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20190822032527.1376-1-alexander.deucher@amd.com --- drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index 3ac061a3c3c5..53097961bf2b 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -326,7 +326,7 @@ static int smu_v11_0_setup_pptable(struct smu_context *smu) struct amdgpu_device *adev = smu->adev; const struct smc_firmware_header_v1_0 *hdr; int ret, index; - uint32_t size; + uint32_t size = 0; uint16_t atom_table_size; uint8_t frev, crev; void *table; From f6edbf2d616435cda7823942c20005ce198e97c8 Mon Sep 17 00:00:00 2001 From: "Justin.Lee1@Dell.com" Date: Wed, 21 Aug 2019 21:24:52 +0000 Subject: [PATCH 209/266] net/ncsi: Fix the payload copying for the request coming from Netlink The request coming from Netlink should use the OEM generic handler. The standard command handler expects payload in bytes/words/dwords but the actual payload is stored in data if the request is coming from Netlink. Signed-off-by: Justin Lee Reviewed-by: Vijay Khemka Signed-off-by: David S. Miller --- net/ncsi/ncsi-cmd.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index eab4346b0a39..0187e65176c0 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -309,14 +309,21 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) { - struct ncsi_request *nr; - struct ethhdr *eh; struct ncsi_cmd_handler *nch = NULL; + struct ncsi_request *nr; + unsigned char type; + struct ethhdr *eh; int i, ret; + /* Use OEM generic handler for Netlink request */ + if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) + type = NCSI_PKT_CMD_OEM; + else + type = nca->type; + /* Search for the handler */ for (i = 0; i < ARRAY_SIZE(ncsi_cmd_handlers); i++) { - if (ncsi_cmd_handlers[i].type == nca->type) { + if (ncsi_cmd_handlers[i].type == type) { if (ncsi_cmd_handlers[i].handler) nch = &ncsi_cmd_handlers[i]; else From c358ebf59634f06d8ed176da651ec150df3c8686 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 25 Jul 2019 15:40:01 -0400 Subject: [PATCH 210/266] drm/nouveau: Don't retry infinitely when receiving no data on i2c over AUX While I had thought I had fixed this issue in: commit 342406e4fbba ("drm/nouveau/i2c: Disable i2c bus access after ->fini()") It turns out that while I did fix the error messages I was seeing on my P50 when trying to access i2c busses with the GPU in runtime suspend, I accidentally had missed one important detail that was mentioned on the bug report this commit was supposed to fix: that the CPU would only lock up when trying to access i2c busses _on connected devices_ _while the GPU is not in runtime suspend_. Whoops. That definitely explains why I was not able to get my machine to hang with i2c bus interactions until now, as plugging my P50 into it's dock with an HDMI monitor connected allowed me to finally reproduce this locally. Now that I have managed to reproduce this issue properly, it looks like the problem is much simpler then it looks. It turns out that some connected devices, such as MST laptop docks, will actually ACK i2c reads even if no data was actually read: [ 275.063043] nouveau 0000:01:00.0: i2c: aux 000a: 1: 0000004c 1 [ 275.063447] nouveau 0000:01:00.0: i2c: aux 000a: 00 01101000 10040000 [ 275.063759] nouveau 0000:01:00.0: i2c: aux 000a: rd 00000001 [ 275.064024] nouveau 0000:01:00.0: i2c: aux 000a: rd 00000000 [ 275.064285] nouveau 0000:01:00.0: i2c: aux 000a: rd 00000000 [ 275.064594] nouveau 0000:01:00.0: i2c: aux 000a: rd 00000000 Because we don't handle the situation of i2c ack without any data, we end up entering an infinite loop in nvkm_i2c_aux_i2c_xfer() since the value of cnt always remains at 0. This finally properly explains how this could result in a CPU hang like the ones observed in the aforementioned commit. So, fix this by retrying transactions if no data is written or received, and give up and fail the transaction if we continue to not write or receive any data after 32 retries. Signed-off-by: Lyude Paul Cc: stable@vger.kernel.org Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.c | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.c b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.c index b4e7404fe660..a11637b0f6cc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.c @@ -40,8 +40,7 @@ nvkm_i2c_aux_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) u8 *ptr = msg->buf; while (remaining) { - u8 cnt = (remaining > 16) ? 16 : remaining; - u8 cmd; + u8 cnt, retries, cmd; if (msg->flags & I2C_M_RD) cmd = 1; @@ -51,10 +50,19 @@ nvkm_i2c_aux_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) if (mcnt || remaining > 16) cmd |= 4; /* MOT */ - ret = aux->func->xfer(aux, true, cmd, msg->addr, ptr, &cnt); - if (ret < 0) { - nvkm_i2c_aux_release(aux); - return ret; + for (retries = 0, cnt = 0; + retries < 32 && !cnt; + retries++) { + cnt = min_t(u8, remaining, 16); + ret = aux->func->xfer(aux, true, cmd, + msg->addr, ptr, &cnt); + if (ret < 0) + goto out; + } + if (!cnt) { + AUX_TRACE(aux, "no data after 32 retries"); + ret = -EIO; + goto out; } ptr += cnt; @@ -64,8 +72,10 @@ nvkm_i2c_aux_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) msg++; } + ret = num; +out: nvkm_i2c_aux_release(aux); - return num; + return ret; } static u32 From 1fb254aa983bf190cfd685d40c64a480a9bafaee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2019 20:55:54 -0700 Subject: [PATCH 211/266] xfs: fix missing ILOCK unlock when xfs_setattr_nonsize fails due to EDQUOT Benjamin Moody reported to Debian that XFS partially wedges when a chgrp fails on account of being out of disk quota. I ran his reproducer script: # adduser dummy # adduser dummy plugdev # dd if=/dev/zero bs=1M count=100 of=test.img # mkfs.xfs test.img # mount -t xfs -o gquota test.img /mnt # mkdir -p /mnt/dummy # chown -c dummy /mnt/dummy # xfs_quota -xc 'limit -g bsoft=100k bhard=100k plugdev' /mnt (and then as user dummy) $ dd if=/dev/urandom bs=1M count=50 of=/mnt/dummy/foo $ chgrp plugdev /mnt/dummy/foo and saw: ================================================ WARNING: lock held when returning to user space! 5.3.0-rc5 #rc5 Tainted: G W ------------------------------------------------ chgrp/47006 is leaving the kernel with locks still held! 1 lock held by chgrp/47006: #0: 000000006664ea2d (&xfs_nondir_ilock_class){++++}, at: xfs_ilock+0xd2/0x290 [xfs] ...which is clearly caused by xfs_setattr_nonsize failing to unlock the ILOCK after the xfs_qm_vop_chown_reserve call fails. Add the missing unlock. Reported-by: benjamin.moody@gmail.com Fixes: 253f4911f297 ("xfs: better xfs_trans_alloc interface") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Tested-by: Salvatore Bonaccorso --- fs/xfs/xfs_iops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ff3c1fae5357..fe285d123d69 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -793,6 +793,7 @@ xfs_setattr_nonsize( out_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); From 48057ed1840fde9239b1e000bea1a0a1f07c5e99 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 20 Aug 2019 10:05:27 +0200 Subject: [PATCH 212/266] gpio: Fix irqchip initialization order The new API for registering a gpio_irq_chip along with a gpio_chip has a different semantic ordering than the old API which added the irqchip explicitly after registering the gpio_chip. Move the calls to add the gpio_irq_chip *last* in the function, so that the different hooks setting up OF and ACPI and machine gpio_chips are called *before* we try to register the interrupts, preserving the elder semantic order. This cropped up in the PL061 driver which used to work fine with no special ACPI quirks, but started to misbehave using the new API. Fixes: e0d897289813 ("gpio: Implement tighter IRQ chip integration") Cc: Thierry Reding Cc: Grygorii Strashko Cc: Andy Shevchenko Reported-by: Wei Xu Tested-by: Wei Xu Reported-by: Andy Shevchenko Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20190820080527.11796-1-linus.walleij@linaro.org --- drivers/gpio/gpiolib.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 80a2a2cb673b..cca749010cd0 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1373,21 +1373,13 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, if (status) goto err_remove_from_list; - status = gpiochip_irqchip_init_valid_mask(chip); + status = gpiochip_alloc_valid_mask(chip); if (status) goto err_remove_from_list; - status = gpiochip_alloc_valid_mask(chip); - if (status) - goto err_remove_irqchip_mask; - - status = gpiochip_add_irqchip(chip, lock_key, request_key); - if (status) - goto err_free_gpiochip_mask; - status = of_gpiochip_add(chip); if (status) - goto err_remove_chip; + goto err_free_gpiochip_mask; status = gpiochip_init_valid_mask(chip); if (status) @@ -1413,6 +1405,14 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, machine_gpiochip_add(chip); + status = gpiochip_irqchip_init_valid_mask(chip); + if (status) + goto err_remove_acpi_chip; + + status = gpiochip_add_irqchip(chip, lock_key, request_key); + if (status) + goto err_remove_irqchip_mask; + /* * By first adding the chardev, and then adding the device, * we get a device node entry in sysfs under @@ -1424,21 +1424,21 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, if (gpiolib_initialized) { status = gpiochip_setup_dev(gdev); if (status) - goto err_remove_acpi_chip; + goto err_remove_irqchip; } return 0; +err_remove_irqchip: + gpiochip_irqchip_remove(chip); +err_remove_irqchip_mask: + gpiochip_irqchip_free_valid_mask(chip); err_remove_acpi_chip: acpi_gpiochip_remove(chip); err_remove_of_chip: gpiochip_free_hogs(chip); of_gpiochip_remove(chip); -err_remove_chip: - gpiochip_irqchip_remove(chip); err_free_gpiochip_mask: gpiochip_free_valid_mask(chip); -err_remove_irqchip_mask: - gpiochip_irqchip_free_valid_mask(chip); err_remove_from_list: spin_lock_irqsave(&gpio_lock, flags); list_del(&gdev->list); From 1cfd5d3399e87167b7f9157ef99daa0e959f395d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 23 Aug 2019 09:54:09 -0400 Subject: [PATCH 213/266] dm table: fix invalid memory accesses with too high sector number If the sector number is too high, dm_table_find_target() should return a pointer to a zeroed dm_target structure (the caller should test it with dm_target_is_valid). However, for some table sizes, the code in dm_table_find_target() that performs btree lookup will access out of bound memory structures. Fix this bug by testing the sector number at the beginning of dm_table_find_target(). Also, add an "inline" keyword to the function dm_table_get_size() because this is a hot path. Fixes: 512875bd9661 ("dm: table detect io beyond device") Cc: stable@vger.kernel.org Reported-by: Zhang Tao Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7b6c3ee9e755..8820931ec7d2 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1342,7 +1342,7 @@ void dm_table_event(struct dm_table *t) } EXPORT_SYMBOL(dm_table_event); -sector_t dm_table_get_size(struct dm_table *t) +inline sector_t dm_table_get_size(struct dm_table *t) { return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; } @@ -1367,6 +1367,9 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) unsigned int l, n = 0, k = 0; sector_t *node; + if (unlikely(sector >= dm_table_get_size(t))) + return &t->targets[t->num_targets]; + for (l = 0; l < t->depth; l++) { n = get_child(n, k); node = get_node(t, l, n); From b63f20a778c88b6a04458ed6ffc69da953d3a109 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Aug 2019 14:11:22 -0700 Subject: [PATCH 214/266] x86/retpoline: Don't clobber RFLAGS during CALL_NOSPEC on i386 Use 'lea' instead of 'add' when adjusting %rsp in CALL_NOSPEC so as to avoid clobbering flags. KVM's emulator makes indirect calls into a jump table of sorts, where the destination of the CALL_NOSPEC is a small blob of code that performs fast emulation by executing the target instruction with fixed operands. adcb_al_dl: 0x000339f8 <+0>: adc %dl,%al 0x000339fa <+2>: ret A major motiviation for doing fast emulation is to leverage the CPU to handle consumption and manipulation of arithmetic flags, i.e. RFLAGS is both an input and output to the target of CALL_NOSPEC. Clobbering flags results in all sorts of incorrect emulation, e.g. Jcc instructions often take the wrong path. Sans the nops... asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n" 0x0003595a <+58>: mov 0xc0(%ebx),%eax 0x00035960 <+64>: mov 0x60(%ebx),%edx 0x00035963 <+67>: mov 0x90(%ebx),%ecx 0x00035969 <+73>: push %edi 0x0003596a <+74>: popf 0x0003596b <+75>: call *%esi 0x000359a0 <+128>: pushf 0x000359a1 <+129>: pop %edi 0x000359a2 <+130>: mov %eax,0xc0(%ebx) 0x000359b1 <+145>: mov %edx,0x60(%ebx) ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); 0x000359a8 <+136>: mov -0x10(%ebp),%eax 0x000359ab <+139>: and $0x8d5,%edi 0x000359b4 <+148>: and $0xfffff72a,%eax 0x000359b9 <+153>: or %eax,%edi 0x000359bd <+157>: mov %edi,0x4(%ebx) For the most part this has gone unnoticed as emulation of guest code that can trigger fast emulation is effectively limited to MMIO when running on modern hardware, and MMIO is rarely, if ever, accessed by instructions that affect or consume flags. Breakage is almost instantaneous when running with unrestricted guest disabled, in which case KVM must emulate all instructions when the guest has invalid state, e.g. when the guest is in Big Real Mode during early BIOS. Fixes: 776b043848fd2 ("x86/retpoline: Add initial retpoline support") Fixes: 1a29b5b7f347a ("KVM: x86: Make indirect calls in emulator speculation safe") Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190822211122.27579-1-sean.j.christopherson@intel.com --- arch/x86/include/asm/nospec-branch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 109f974f9835..80bc209c0708 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -192,7 +192,7 @@ " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ - "903: addl $4, %%esp;\n" \ + "903: lea 4(%%esp), %%esp;\n" \ " pushl %[thunk_target];\n" \ " ret;\n" \ " .align 16\n" \ From c536277e0db1ad2e9fbb9dfd940c3565a14d9c52 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 22 Aug 2019 19:37:38 +0200 Subject: [PATCH 215/266] RDMA/siw: Fix 64/32bit pointer inconsistency Fixes improper casting between addresses and unsigned types. Changes siw_pbl_get_buffer() function to return appropriate dma_addr_t, and not u64. Also fixes debug prints. Now any potentially kernel private pointers are printed formatted as '%pK', to allow keeping that information secret. Fixes: d941bfe500be ("RDMA/siw: Change CQ flags from 64->32 bits") Fixes: b0fff7317bb4 ("rdma/siw: completion queue methods") Fixes: 8b6a361b8c48 ("rdma/siw: receive path") Fixes: b9be6f18cf9e ("rdma/siw: transmit path") Fixes: f29dd55b0236 ("rdma/siw: queue pair methods") Fixes: 2251334dcac9 ("rdma/siw: application buffer management") Fixes: 303ae1cdfdf7 ("rdma/siw: application interface") Fixes: 6c52fdc244b5 ("rdma/siw: connection management") Fixes: a531975279f3 ("rdma/siw: main include file") Reported-by: Geert Uytterhoeven Reported-by: Jason Gunthorpe Reported-by: Leon Romanovsky Signed-off-by: Bernard Metzler Link: https://lore.kernel.org/r/20190822173738.26817-1-bmt@zurich.ibm.com Signed-off-by: Doug Ledford --- drivers/infiniband/sw/siw/siw.h | 8 +-- drivers/infiniband/sw/siw/siw_cm.c | 77 +++++++++++++-------------- drivers/infiniband/sw/siw/siw_cq.c | 5 +- drivers/infiniband/sw/siw/siw_mem.c | 14 ++--- drivers/infiniband/sw/siw/siw_mem.h | 2 +- drivers/infiniband/sw/siw/siw_qp.c | 2 +- drivers/infiniband/sw/siw/siw_qp_rx.c | 26 ++++----- drivers/infiniband/sw/siw/siw_qp_tx.c | 43 +++++++-------- drivers/infiniband/sw/siw/siw_verbs.c | 40 +++++++------- 9 files changed, 108 insertions(+), 109 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 77b1aabf6ff3..dba4535494ab 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -138,9 +138,9 @@ struct siw_umem { }; struct siw_pble { - u64 addr; /* Address of assigned user buffer */ - u64 size; /* Size of this entry */ - u64 pbl_off; /* Total offset from start of PBL */ + dma_addr_t addr; /* Address of assigned buffer */ + unsigned int size; /* Size of this entry */ + unsigned long pbl_off; /* Total offset from start of PBL */ }; struct siw_pbl { @@ -734,7 +734,7 @@ static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__) #define siw_dbg_cep(cep, fmt, ...) \ - ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt, \ + ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%pK] %s: " fmt, \ cep, __func__, ##__VA_ARGS__) void siw_cq_flush(struct siw_cq *cq); diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index fc97571a640b..1db5ad3d9580 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -355,8 +355,8 @@ static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, getname_local(cep->sock, &event.local_addr); getname_peer(cep->sock, &event.remote_addr); } - siw_dbg_cep(cep, "[QP %u]: id 0x%p, reason=%d, status=%d\n", - cep->qp ? qp_id(cep->qp) : -1, id, reason, status); + siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n", + cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status); return id->event_handler(id, &event); } @@ -947,8 +947,6 @@ static void siw_accept_newconn(struct siw_cep *cep) siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; - siw_dbg_cep(cep, "listen socket 0x%p, new 0x%p\n", s, new_s); - if (siw_tcp_nagle == false) { int val = 1; @@ -1011,7 +1009,8 @@ static void siw_cm_work_handler(struct work_struct *w) cep = work->cep; siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", - cep->qp ? qp_id(cep->qp) : -1, work->type, cep->state); + cep->qp ? qp_id(cep->qp) : UINT_MAX, + work->type, cep->state); siw_cep_set_inuse(cep); @@ -1145,9 +1144,9 @@ static void siw_cm_work_handler(struct work_struct *w) } if (release_cep) { siw_dbg_cep(cep, - "release: timer=%s, QP[%u], id 0x%p\n", + "release: timer=%s, QP[%u]\n", cep->mpa_timer ? "y" : "n", - cep->qp ? qp_id(cep->qp) : -1, cep->cm_id); + cep->qp ? qp_id(cep->qp) : UINT_MAX); siw_cancel_mpatimer(cep); @@ -1211,8 +1210,8 @@ int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) else delay = MPAREP_TIMEOUT; } - siw_dbg_cep(cep, "[QP %u]: work type: %d, work 0x%p, timeout %lu\n", - cep->qp ? qp_id(cep->qp) : -1, type, work, delay); + siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n", + cep->qp ? qp_id(cep->qp) : -1, type, delay); queue_delayed_work(siw_cm_wq, &work->work, delay); @@ -1376,16 +1375,16 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) } if (v4) siw_dbg_qp(qp, - "id 0x%p, pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n", - id, pd_len, + "pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n", + pd_len, &((struct sockaddr_in *)(laddr))->sin_addr, ntohs(((struct sockaddr_in *)(laddr))->sin_port), &((struct sockaddr_in *)(raddr))->sin_addr, ntohs(((struct sockaddr_in *)(raddr))->sin_port)); else siw_dbg_qp(qp, - "id 0x%p, pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n", - id, pd_len, + "pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n", + pd_len, &((struct sockaddr_in6 *)(laddr))->sin6_addr, ntohs(((struct sockaddr_in6 *)(laddr))->sin6_port), &((struct sockaddr_in6 *)(raddr))->sin6_addr, @@ -1508,8 +1507,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) if (rv >= 0) { rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); if (!rv) { - siw_dbg_cep(cep, "id 0x%p, [QP %u]: exit\n", id, - qp_id(qp)); + siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp)); siw_cep_set_free(cep); return 0; } @@ -1581,7 +1579,7 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { - siw_dbg_cep(cep, "id 0x%p: out of state\n", id); + siw_dbg_cep(cep, "out of state\n"); siw_cep_set_free(cep); siw_cep_put(cep); @@ -1602,7 +1600,7 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) up_write(&qp->state_lock); goto error; } - siw_dbg_cep(cep, "id 0x%p\n", id); + siw_dbg_cep(cep, "[QP %d]\n", params->qpn); if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); @@ -1612,8 +1610,8 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) params->ird > sdev->attrs.max_ird) { siw_dbg_cep( cep, - "id 0x%p, [QP %u]: ord %d (max %d), ird %d (max %d)\n", - id, qp_id(qp), params->ord, sdev->attrs.max_ord, + "[QP %u]: ord %d (max %d), ird %d (max %d)\n", + qp_id(qp), params->ord, sdev->attrs.max_ord, params->ird, sdev->attrs.max_ird); rv = -EINVAL; up_write(&qp->state_lock); @@ -1625,8 +1623,8 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) if (params->private_data_len > max_priv_data) { siw_dbg_cep( cep, - "id 0x%p, [QP %u]: private data length: %d (max %d)\n", - id, qp_id(qp), params->private_data_len, max_priv_data); + "[QP %u]: private data length: %d (max %d)\n", + qp_id(qp), params->private_data_len, max_priv_data); rv = -EINVAL; up_write(&qp->state_lock); goto error; @@ -1680,7 +1678,7 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.state = SIW_QP_STATE_RTS; - siw_dbg_cep(cep, "id 0x%p, [QP%u]: moving to rts\n", id, qp_id(qp)); + siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp)); /* Associate QP with CEP */ siw_cep_get(cep); @@ -1701,8 +1699,8 @@ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) if (rv) goto error; - siw_dbg_cep(cep, "id 0x%p, [QP %u]: send mpa reply, %d byte pdata\n", - id, qp_id(qp), params->private_data_len); + siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n", + qp_id(qp), params->private_data_len); rv = siw_send_mpareqrep(cep, params->private_data, params->private_data_len); @@ -1760,14 +1758,14 @@ int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { - siw_dbg_cep(cep, "id 0x%p: out of state\n", id); + siw_dbg_cep(cep, "out of state\n"); siw_cep_set_free(cep); siw_cep_put(cep); /* put last reference */ return -ECONNRESET; } - siw_dbg_cep(cep, "id 0x%p, cep->state %d, pd_len %d\n", id, cep->state, + siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state, pd_len); if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { @@ -1805,14 +1803,14 @@ static int siw_listen_address(struct iw_cm_id *id, int backlog, rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, sizeof(s_val)); if (rv) { - siw_dbg(id->device, "id 0x%p: setsockopt error: %d\n", id, rv); + siw_dbg(id->device, "setsockopt error: %d\n", rv); goto error; } rv = s->ops->bind(s, laddr, addr_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); if (rv) { - siw_dbg(id->device, "id 0x%p: socket bind error: %d\n", id, rv); + siw_dbg(id->device, "socket bind error: %d\n", rv); goto error; } cep = siw_cep_alloc(sdev); @@ -1825,13 +1823,13 @@ static int siw_listen_address(struct iw_cm_id *id, int backlog, rv = siw_cm_alloc_work(cep, backlog); if (rv) { siw_dbg(id->device, - "id 0x%p: alloc_work error %d, backlog %d\n", id, + "alloc_work error %d, backlog %d\n", rv, backlog); goto error; } rv = s->ops->listen(s, backlog); if (rv) { - siw_dbg(id->device, "id 0x%p: listen error %d\n", id, rv); + siw_dbg(id->device, "listen error %d\n", rv); goto error; } cep->cm_id = id; @@ -1915,8 +1913,7 @@ static void siw_drop_listeners(struct iw_cm_id *id) list_del(p); - siw_dbg_cep(cep, "id 0x%p: drop cep, state %d\n", id, - cep->state); + siw_dbg_cep(cep, "drop cep, state %d\n", cep->state); siw_cep_set_inuse(cep); @@ -1953,7 +1950,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct net_device *dev = to_siw_dev(id->device)->netdev; int rv = 0, listeners = 0; - siw_dbg(id->device, "id 0x%p: backlog %d\n", id, backlog); + siw_dbg(id->device, "backlog %d\n", backlog); /* * For each attached address of the interface, create a @@ -1969,8 +1966,8 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) s_raddr = (struct sockaddr_in *)&id->remote_addr; siw_dbg(id->device, - "id 0x%p: laddr %pI4:%d, raddr %pI4:%d\n", - id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port), + "laddr %pI4:%d, raddr %pI4:%d\n", + &s_laddr.sin_addr, ntohs(s_laddr.sin_port), &s_raddr->sin_addr, ntohs(s_raddr->sin_port)); rtnl_lock(); @@ -1995,8 +1992,8 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) *s_raddr = &to_sockaddr_in6(id->remote_addr); siw_dbg(id->device, - "id 0x%p: laddr %pI6:%d, raddr %pI6:%d\n", - id, &s_laddr->sin6_addr, ntohs(s_laddr->sin6_port), + "laddr %pI6:%d, raddr %pI6:%d\n", + &s_laddr->sin6_addr, ntohs(s_laddr->sin6_port), &s_raddr->sin6_addr, ntohs(s_raddr->sin6_port)); read_lock_bh(&in6_dev->lock); @@ -2029,17 +2026,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) else if (!rv) rv = -EINVAL; - siw_dbg(id->device, "id 0x%p: %s\n", id, rv ? "FAIL" : "OK"); + siw_dbg(id->device, "%s\n", rv ? "FAIL" : "OK"); return rv; } int siw_destroy_listen(struct iw_cm_id *id) { - siw_dbg(id->device, "id 0x%p\n", id); - if (!id->provider_data) { - siw_dbg(id->device, "id 0x%p: no cep(s)\n", id); + siw_dbg(id->device, "no cep(s)\n"); return 0; } siw_drop_listeners(id); diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index e381ae9b7d62..d8db3bee9da7 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -71,9 +71,10 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) wc->wc_flags = IB_WC_WITH_INVALIDATE; } wc->qp = cqe->base_qp; - siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%p\n", + siw_dbg_cq(cq, + "idx %u, type %d, flags %2x, id 0x%pK\n", cq->cq_get % cq->num_cqe, cqe->opcode, - cqe->flags, (void *)cqe->id); + cqe->flags, (void *)(uintptr_t)cqe->id); } WRITE_ONCE(cqe->flags, 0); cq->cq_get++; diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index 67171c82b0c4..87a56039f0ef 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -197,12 +197,12 @@ int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, */ if (addr < mem->va || addr + len > mem->va + mem->len) { siw_dbg_pd(pd, "MEM interval len %d\n", len); - siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n", - (unsigned long long)addr, - (unsigned long long)(addr + len)); - siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n", - (unsigned long long)mem->va, - (unsigned long long)(mem->va + mem->len), + siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n", + (void *)(uintptr_t)addr, + (void *)(uintptr_t)(addr + len)); + siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n", + (void *)(uintptr_t)mem->va, + (void *)(uintptr_t)(mem->va + mem->len), mem->stag); return -E_BASE_BOUNDS; @@ -330,7 +330,7 @@ out: * Optionally, provides remaining len within current element, and * current PBL index for later resume at same element. */ -u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx) +dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx) { int i = idx ? *idx : 0; diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h index f43daf280891..db138c8423da 100644 --- a/drivers/infiniband/sw/siw/siw_mem.h +++ b/drivers/infiniband/sw/siw/siw_mem.h @@ -9,7 +9,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable); void siw_umem_release(struct siw_umem *umem, bool dirty); struct siw_pbl *siw_pbl_alloc(u32 num_buf); -u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx); +dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx); struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index); int siw_mem_add(struct siw_device *sdev, struct siw_mem *m); int siw_invalidate_stag(struct ib_pd *pd, u32 stag); diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c index 0990307c5d2c..430314c8abd9 100644 --- a/drivers/infiniband/sw/siw/siw_qp.c +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -949,7 +949,7 @@ skip_irq: rv = -EINVAL; goto out; } - wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; + wqe->sqe.sge[0].laddr = (uintptr_t)&wqe->sqe.sge[1]; wqe->sqe.sge[0].lkey = 0; wqe->sqe.num_sge = 1; } diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index f87657a11657..c0a887240325 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -38,9 +38,10 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, p = siw_get_upage(umem, dest_addr); if (unlikely(!p)) { - pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n", + pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n", __func__, qp_id(rx_qp(srx)), - (void *)dest_addr, (void *)umem->fp_addr); + (void *)(uintptr_t)dest_addr, + (void *)(uintptr_t)umem->fp_addr); /* siw internal error */ srx->skb_copied += copied; srx->skb_new -= copied; @@ -50,7 +51,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, pg_off = dest_addr & ~PAGE_MASK; bytes = min(len, (int)PAGE_SIZE - pg_off); - siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes); + siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes); dest = kmap_atomic(p); rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off, @@ -104,11 +105,11 @@ static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len) { int rv; - siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len); + siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len); rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len); if (unlikely(rv)) { - pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n", + pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n", qp_id(rx_qp(srx)), __func__, len, kva, rv); return rv; @@ -132,7 +133,7 @@ static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx, while (len) { int bytes; - u64 buf_addr = + dma_addr_t buf_addr = siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx); if (!buf_addr) break; @@ -485,8 +486,8 @@ int siw_proc_send(struct siw_qp *qp) mem_p = *mem; if (mem_p->mem_obj == NULL) rv = siw_rx_kva(srx, - (void *)(sge->laddr + frx->sge_off), - sge_bytes); + (void *)(uintptr_t)(sge->laddr + frx->sge_off), + sge_bytes); else if (!mem_p->is_pbl) rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + frx->sge_off, sge_bytes); @@ -598,8 +599,8 @@ int siw_proc_write(struct siw_qp *qp) if (mem->mem_obj == NULL) rv = siw_rx_kva(srx, - (void *)(srx->ddp_to + srx->fpdu_part_rcvd), - bytes); + (void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd), + bytes); else if (!mem->is_pbl) rv = siw_rx_umem(srx, mem->umem, srx->ddp_to + srx->fpdu_part_rcvd, bytes); @@ -841,8 +842,9 @@ int siw_proc_rresp(struct siw_qp *qp) bytes = min(srx->fpdu_part_rem, srx->skb_new); if (mem_p->mem_obj == NULL) - rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed), - bytes); + rv = siw_rx_kva(srx, + (void *)(uintptr_t)(sge->laddr + wqe->processed), + bytes); else if (!mem_p->is_pbl) rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed, bytes); diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 42c63622c7bd..438a2917a47c 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -26,7 +26,7 @@ static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) { struct siw_pbl *pbl = mem->pbl; u64 offset = addr - mem->va; - u64 paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); + dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); if (paddr) return virt_to_page(paddr); @@ -37,7 +37,7 @@ static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) /* * Copy short payload at provided destination payload address */ -static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr) +static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr) { struct siw_wqe *wqe = &c_tx->wqe_active; struct siw_sge *sge = &wqe->sqe.sge[0]; @@ -50,16 +50,16 @@ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr) return 0; if (tx_flags(wqe) & SIW_WQE_INLINE) { - memcpy((void *)paddr, &wqe->sqe.sge[1], bytes); + memcpy(paddr, &wqe->sqe.sge[1], bytes); } else { struct siw_mem *mem = wqe->mem[0]; if (!mem->mem_obj) { /* Kernel client using kva */ - memcpy((void *)paddr, (void *)sge->laddr, bytes); + memcpy(paddr, + (const void *)(uintptr_t)sge->laddr, bytes); } else if (c_tx->in_syscall) { - if (copy_from_user((void *)paddr, - (const void __user *)sge->laddr, + if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr), bytes)) return -EFAULT; } else { @@ -79,12 +79,12 @@ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr) buffer = kmap_atomic(p); if (likely(PAGE_SIZE - off >= bytes)) { - memcpy((void *)paddr, buffer + off, bytes); + memcpy(paddr, buffer + off, bytes); kunmap_atomic(buffer); } else { unsigned long part = bytes - (PAGE_SIZE - off); - memcpy((void *)paddr, buffer + off, part); + memcpy(paddr, buffer + off, part); kunmap_atomic(buffer); if (!mem->is_pbl) @@ -98,7 +98,7 @@ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr) return -EFAULT; buffer = kmap_atomic(p); - memcpy((void *)(paddr + part), buffer, + memcpy(paddr + part, buffer, bytes - part); kunmap_atomic(buffer); } @@ -166,7 +166,7 @@ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) c_tx->ctrl_len = sizeof(struct iwarp_send); crc = (char *)&c_tx->pkt.send_pkt.crc; - data = siw_try_1seg(c_tx, (u64)crc); + data = siw_try_1seg(c_tx, crc); break; case SIW_OP_SEND_REMOTE_INV: @@ -189,7 +189,7 @@ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) c_tx->ctrl_len = sizeof(struct iwarp_send_inv); crc = (char *)&c_tx->pkt.send_pkt.crc; - data = siw_try_1seg(c_tx, (u64)crc); + data = siw_try_1seg(c_tx, crc); break; case SIW_OP_WRITE: @@ -201,7 +201,7 @@ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) c_tx->ctrl_len = sizeof(struct iwarp_rdma_write); crc = (char *)&c_tx->pkt.write_pkt.crc; - data = siw_try_1seg(c_tx, (u64)crc); + data = siw_try_1seg(c_tx, crc); break; case SIW_OP_READ_RESPONSE: @@ -216,7 +216,7 @@ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp); crc = (char *)&c_tx->pkt.write_pkt.crc; - data = siw_try_1seg(c_tx, (u64)crc); + data = siw_try_1seg(c_tx, crc); break; default: @@ -471,7 +471,8 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) * tx from kernel virtual address: either inline data * or memory region with assigned kernel buffer */ - iov[seg].iov_base = (void *)(sge->laddr + sge_off); + iov[seg].iov_base = + (void *)(uintptr_t)(sge->laddr + sge_off); iov[seg].iov_len = sge_len; if (do_crc) @@ -523,13 +524,13 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) page_address(p) + fp_off, plen); } else { - u64 pa = ((sge->laddr + sge_off) & PAGE_MASK); + u64 va = sge->laddr + sge_off; - page_array[seg] = virt_to_page(pa); + page_array[seg] = virt_to_page(va & PAGE_MASK); if (do_crc) crypto_shash_update( c_tx->mpa_crc_hd, - (void *)(sge->laddr + sge_off), + (void *)(uintptr_t)va, plen); } @@ -822,7 +823,8 @@ static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe) rv = -EINVAL; goto tx_error; } - wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; + wqe->sqe.sge[0].laddr = + (u64)(uintptr_t)&wqe->sqe.sge[1]; } } wqe->wr_status = SIW_WR_INPROGRESS; @@ -917,7 +919,7 @@ tx_error: static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe) { - struct ib_mr *base_mr = (struct ib_mr *)sqe->base_mr; + struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr; struct siw_device *sdev = to_siw_dev(pd->device); struct siw_mem *mem = siw_mem_id2obj(sdev, sqe->rkey >> 8); int rv = 0; @@ -947,8 +949,7 @@ static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe) mem->stag = sqe->rkey; mem->perms = sqe->access; - siw_dbg_mem(mem, "STag now valid, MR va: 0x%016llx -> 0x%016llx\n", - mem->va, base_mr->iova); + siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey); mem->va = base_mr->iova; mem->stag_valid = 1; out: diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index e7f3a2379d9d..da52c90e06d4 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -424,8 +424,7 @@ struct ib_qp *siw_create_qp(struct ib_pd *pd, */ qp->srq = to_siw_srq(attrs->srq); qp->attrs.rq_size = 0; - siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n", - qp->qp_num, qp->srq); + siw_dbg(base_dev, "QP [%u]: SRQ attached\n", qp->qp_num); } else if (num_rqe) { if (qp->kernel_verbs) qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); @@ -610,7 +609,7 @@ int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) base_ucontext); struct siw_qp_attrs qp_attrs; - siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep); + siw_dbg_qp(qp, "state %d\n", qp->attrs.state); /* * Mark QP as in process of destruction to prevent from @@ -662,7 +661,7 @@ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, void *kbuf = &sqe->sge[1]; int num_sge = core_wr->num_sge, bytes = 0; - sqe->sge[0].laddr = (u64)kbuf; + sqe->sge[0].laddr = (uintptr_t)kbuf; sqe->sge[0].lkey = 0; while (num_sge--) { @@ -825,7 +824,7 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, break; case IB_WR_REG_MR: - sqe->base_mr = (uint64_t)reg_wr(wr)->mr; + sqe->base_mr = (uintptr_t)reg_wr(wr)->mr; sqe->rkey = reg_wr(wr)->key; sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; sqe->opcode = SIW_OP_REG_MR; @@ -842,8 +841,9 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, rv = -EINVAL; break; } - siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n", - sqe->opcode, sqe->flags, (void *)sqe->id); + siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n", + sqe->opcode, sqe->flags, + (void *)(uintptr_t)sqe->id); if (unlikely(rv < 0)) break; @@ -1205,8 +1205,8 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); int rv; - siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n", - (unsigned long long)start, (unsigned long long)rnic_va, + siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n", + (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va, (unsigned long long)len); if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { @@ -1363,7 +1363,7 @@ int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, struct siw_mem *mem = mr->mem; struct siw_pbl *pbl = mem->pbl; struct siw_pble *pble; - u64 pbl_size; + unsigned long pbl_size; int i, rv; if (!pbl) { @@ -1402,16 +1402,18 @@ int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, pbl_size += sg_dma_len(slp); } siw_dbg_mem(mem, - "sge[%d], size %llu, addr 0x%016llx, total %llu\n", - i, pble->size, pble->addr, pbl_size); + "sge[%d], size %u, addr 0x%p, total %lu\n", + i, pble->size, (void *)(uintptr_t)pble->addr, + pbl_size); } rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); if (rv > 0) { mem->len = base_mr->length; mem->va = base_mr->iova; siw_dbg_mem(mem, - "%llu bytes, start 0x%016llx, %u SLE to %u entries\n", - mem->len, mem->va, num_sle, pbl->num_buf); + "%llu bytes, start 0x%pK, %u SLE to %u entries\n", + mem->len, (void *)(uintptr_t)mem->va, num_sle, + pbl->num_buf); } return rv; } @@ -1529,7 +1531,7 @@ int siw_create_srq(struct ib_srq *base_srq, } spin_lock_init(&srq->lock); - siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq); + siw_dbg_pd(base_srq->pd, "[SRQ]: success\n"); return 0; @@ -1650,8 +1652,7 @@ int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, if (unlikely(!srq->kernel_verbs)) { siw_dbg_pd(base_srq->pd, - "[SRQ 0x%p]: no kernel post_recv for mapped srq\n", - srq); + "[SRQ]: no kernel post_recv for mapped srq\n"); rv = -EINVAL; goto out; } @@ -1673,8 +1674,7 @@ int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, } if (unlikely(wr->num_sge > srq->max_sge)) { siw_dbg_pd(base_srq->pd, - "[SRQ 0x%p]: too many sge's: %d\n", srq, - wr->num_sge); + "[SRQ]: too many sge's: %d\n", wr->num_sge); rv = -EINVAL; break; } @@ -1693,7 +1693,7 @@ int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&srq->lock, flags); out: if (unlikely(rv < 0)) { - siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv); + siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv); *bad_wr = wr; } return rv; From 2e16f3e926ed48373c98edea85c6ad0ef69425d1 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 23 Aug 2019 11:34:16 +0100 Subject: [PATCH 216/266] KVM: arm/arm64: VGIC: Properly initialise private IRQ affinity At the moment we initialise the target *mask* of a virtual IRQ to the VCPU it belongs to, even though this mask is only defined for GICv2 and quickly runs out of bits for many GICv3 guests. This behaviour triggers an UBSAN complaint for more than 32 VCPUs: ------ [ 5659.462377] UBSAN: Undefined behaviour in virt/kvm/arm/vgic/vgic-init.c:223:21 [ 5659.471689] shift exponent 32 is too large for 32-bit type 'unsigned int' ------ Also for GICv3 guests the reporting of TARGET in the "vgic-state" debugfs dump is wrong, due to this very same problem. Because there is no requirement to create the VGIC device before the VCPUs (and QEMU actually does it the other way round), we can't safely initialise mpidr or targets in kvm_vgic_vcpu_init(). But since we touch every private IRQ for each VCPU anyway later (in vgic_init()), we can just move the initialisation of those fields into there, where we definitely know the VGIC type. On the way make sure we really have either a VGICv2 or a VGICv3 device, since the existing code is just checking for "VGICv3 or not", silently ignoring the uninitialised case. Signed-off-by: Andre Przywara Reported-by: Dave Martin Tested-by: Julien Grall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-init.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index bdbc297d06fb..e621b5d45b27 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "vgic.h" @@ -164,12 +165,18 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) irq->vcpu = NULL; irq->target_vcpu = vcpu0; kref_init(&irq->refcount); - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V2: irq->targets = 0; irq->group = 0; - } else { + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: irq->mpidr = 0; irq->group = 1; + break; + default: + kfree(dist->spis); + return -EINVAL; } } return 0; @@ -209,7 +216,6 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) irq->intid = i; irq->vcpu = NULL; irq->target_vcpu = vcpu; - irq->targets = 1U << vcpu->vcpu_id; kref_init(&irq->refcount); if (vgic_irq_is_sgi(i)) { /* SGIs */ @@ -219,11 +225,6 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) /* PPIs */ irq->config = VGIC_CONFIG_LEVEL; } - - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - irq->group = 1; - else - irq->group = 0; } if (!irqchip_in_kernel(vcpu->kvm)) @@ -286,10 +287,19 @@ int vgic_init(struct kvm *kvm) for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V3: irq->group = 1; - else + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: irq->group = 0; + irq->targets = 1U << idx; + break; + default: + ret = -EINVAL; + goto out; + } } } From db0b99f59ae4d934a0af1a5670706d7c2a4b58ea Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 23 Aug 2019 15:44:36 +0200 Subject: [PATCH 217/266] ipv6: propagate ipv6_add_dev's error returns out of ipv6_find_idev Currently, ipv6_find_idev returns NULL when ipv6_add_dev fails, ignoring the specific error value. This results in addrconf_add_dev returning ENOBUFS in all cases, which is unfortunate in cases such as: # ip link add dummyX type dummy # ip link set dummyX mtu 1200 up # ip addr add 2000::/64 dev dummyX RTNETLINK answers: No buffer space available Commit a317a2f19da7 ("ipv6: fail early when creating netdev named all or default") introduced error returns in ipv6_add_dev. Before that, that function would simply return NULL for all failures. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ced995f3fec4..6a576ff92c39 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -478,7 +478,7 @@ static struct inet6_dev *ipv6_find_idev(struct net_device *dev) if (!idev) { idev = ipv6_add_dev(dev); if (IS_ERR(idev)) - return NULL; + return idev; } if (dev->flags&IFF_UP) @@ -2466,8 +2466,8 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) ASSERT_RTNL(); idev = ipv6_find_idev(dev); - if (!idev) - return ERR_PTR(-ENOBUFS); + if (IS_ERR(idev)) + return idev; if (idev->cnf.disable_ipv6) return ERR_PTR(-EACCES); @@ -3159,7 +3159,7 @@ static void init_loopback(struct net_device *dev) ASSERT_RTNL(); idev = ipv6_find_idev(dev); - if (!idev) { + if (IS_ERR(idev)) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -3374,7 +3374,7 @@ static void addrconf_sit_config(struct net_device *dev) */ idev = ipv6_find_idev(dev); - if (!idev) { + if (IS_ERR(idev)) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -3399,7 +3399,7 @@ static void addrconf_gre_config(struct net_device *dev) ASSERT_RTNL(); idev = ipv6_find_idev(dev); - if (!idev) { + if (IS_ERR(idev)) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -4773,8 +4773,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; idev = ipv6_find_idev(dev); - if (!idev) - return -ENOBUFS; + if (IS_ERR(idev)) + return PTR_ERR(idev); if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; From 345b93265b3a3d001ec23b696b66059395238d16 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 23 Aug 2019 19:57:49 +0200 Subject: [PATCH 218/266] Revert "r8169: remove not needed call to dma_sync_single_for_device" This reverts commit f072218cca5b076dd99f3dfa3aaafedfd0023a51. As reported by Aaro this patch causes network problems on MIPS Loongson platform. Therefore revert it. Fixes: f072218cca5b ("r8169: remove not needed call to dma_sync_single_for_device") Signed-off-by: Heiner Kallweit Reported-by: Aaro Koskinen Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index e1dd6ea60d67..bae0074ab9aa 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5921,6 +5921,7 @@ static struct sk_buff *rtl8169_try_rx_copy(void *data, skb = napi_alloc_skb(&tp->napi, pkt_size); if (skb) skb_copy_to_linear_data(skb, data, pkt_size); + dma_sync_single_for_device(d, addr, pkt_size, DMA_FROM_DEVICE); return skb; } From db38de39684dda2bf307f41797db2831deba64e9 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 21 Aug 2019 14:17:20 +0200 Subject: [PATCH 219/266] flow_dissector: Fix potential use-after-free on BPF_PROG_DETACH Call to bpf_prog_put(), with help of call_rcu(), queues an RCU-callback to free the program once a grace period has elapsed. The callback can run together with new RCU readers that started after the last grace period. New RCU readers can potentially see the "old" to-be-freed or already-freed pointer to the program object before the RCU update-side NULLs it. Reorder the operations so that the RCU update-side resets the protected pointer before the end of the grace period after which the program will be freed. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Reported-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Acked-by: Petar Penkov Signed-off-by: Daniel Borkmann --- net/core/flow_dissector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3e6fedb57bc1..2470b4b404e6 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -142,8 +142,8 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) mutex_unlock(&flow_dissector_mutex); return -ENOENT; } - bpf_prog_put(attached); RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + bpf_prog_put(attached); mutex_unlock(&flow_dissector_mutex); return 0; } From 6754172c208d9d3dae208c6494611ac167d56688 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 21 Aug 2019 14:07:10 -0700 Subject: [PATCH 220/266] bpf: fix precision tracking in presence of bpf2bpf calls While adding extra tests for precision tracking and extra infra to adjust verifier heuristics the existing test "calls: cross frame pruning - liveness propagation" started to fail. The root cause is the same as described in verifer.c comment: * Also if parent's curframe > frame where backtracking started, * the verifier need to mark registers in both frames, otherwise callees * may incorrectly prune callers. This is similar to * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") * For now backtracking falls back into conservative marking. Turned out though that returning -ENOTSUPP from backtrack_insn() and doing mark_all_scalars_precise() in the current parentage chain is not enough. Depending on how is_state_visited() heuristic is creating parentage chain it's possible that callee will incorrectly prune caller. Fix the issue by setting precise=true earlier and more aggressively. Before this fix the precision tracking _within_ functions that don't do bpf2bpf calls would still work. Whereas now precision tracking is completely disabled when bpf2bpf calls are present anywhere in the program. No difference in cilium tests (they don't have bpf2bpf calls). No difference in test_progs though some of them have bpf2bpf calls, but precision tracking wasn't effective there. Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c84d83f86141..b5c14c9d7b98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -985,9 +985,6 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; - - /* constant backtracking is enabled for root only for now */ - reg->precise = capable(CAP_SYS_ADMIN) ? false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1014,7 +1011,11 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); return; } - __mark_reg_unknown(regs + regno); + regs += regno; + __mark_reg_unknown(regs); + /* constant backtracking is enabled for root without bpf2bpf calls */ + regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? + true : false; } static void __mark_reg_not_init(struct bpf_reg_state *reg) From c751798aa224fadc5124b49eeb38fb468c0fa039 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Aug 2019 22:14:23 +0200 Subject: [PATCH 221/266] bpf: fix use after free in prog symbol exposure syzkaller managed to trigger the warning in bpf_jit_free() which checks via bpf_prog_kallsyms_verify_off() for potentially unlinked JITed BPF progs in kallsyms, and subsequently trips over GPF when walking kallsyms entries: [...] 8021q: adding VLAN 0 to HW filter on device batadv0 8021q: adding VLAN 0 to HW filter on device batadv0 WARNING: CPU: 0 PID: 9869 at kernel/bpf/core.c:810 bpf_jit_free+0x1e8/0x2a0 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 9869 Comm: kworker/0:7 Not tainted 5.0.0-rc8+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events bpf_prog_free_deferred Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x113/0x167 lib/dump_stack.c:113 panic+0x212/0x40b kernel/panic.c:214 __warn.cold.8+0x1b/0x38 kernel/panic.c:571 report_bug+0x1a4/0x200 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973 RIP: 0010:bpf_jit_free+0x1e8/0x2a0 Code: 02 4c 89 e2 83 e2 07 38 d0 7f 08 84 c0 0f 85 86 00 00 00 48 ba 00 02 00 00 00 00 ad de 0f b6 43 02 49 39 d6 0f 84 5f fe ff ff <0f> 0b e9 58 fe ff ff 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 RSP: 0018:ffff888092f67cd8 EFLAGS: 00010202 RAX: 0000000000000007 RBX: ffffc90001947000 RCX: ffffffff816e9d88 RDX: dead000000000200 RSI: 0000000000000008 RDI: ffff88808769f7f0 RBP: ffff888092f67d00 R08: fffffbfff1394059 R09: fffffbfff1394058 R10: fffffbfff1394058 R11: ffffffff89ca02c7 R12: ffffc90001947002 R13: ffffc90001947020 R14: ffffffff881eca80 R15: ffff88808769f7e8 BUG: unable to handle kernel paging request at fffffbfff400d000 #PF error: [normal kernel read fault] PGD 21ffee067 P4D 21ffee067 PUD 21ffed067 PMD 9f942067 PTE 0 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 9869 Comm: kworker/0:7 Not tainted 5.0.0-rc8+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events bpf_prog_free_deferred RIP: 0010:bpf_get_prog_addr_region kernel/bpf/core.c:495 [inline] RIP: 0010:bpf_tree_comp kernel/bpf/core.c:558 [inline] RIP: 0010:__lt_find include/linux/rbtree_latch.h:115 [inline] RIP: 0010:latch_tree_find include/linux/rbtree_latch.h:208 [inline] RIP: 0010:bpf_prog_kallsyms_find+0x107/0x2e0 kernel/bpf/core.c:632 Code: 00 f0 ff ff 44 38 c8 7f 08 84 c0 0f 85 fa 00 00 00 41 f6 45 02 01 75 02 0f 0b 48 39 da 0f 82 92 00 00 00 48 89 d8 48 c1 e8 03 <42> 0f b6 04 30 84 c0 74 08 3c 03 0f 8e 45 01 00 00 8b 03 48 c1 e0 [...] Upon further debugging, it turns out that whenever we trigger this issue, the kallsyms removal in bpf_prog_ksym_node_del() was /skipped/ but yet bpf_jit_free() reported that the entry is /in use/. Problem is that symbol exposure via bpf_prog_kallsyms_add() but also perf_event_bpf_event() were done /after/ bpf_prog_new_fd(). Once the fd is exposed to the public, a parallel close request came in right before we attempted to do the bpf_prog_kallsyms_add(). Given at this time the prog reference count is one, we start to rip everything underneath us via bpf_prog_release() -> bpf_prog_put(). The memory is eventually released via deferred free, so we're seeing that bpf_jit_free() has a kallsym entry because we added it from bpf_prog_load() but /after/ bpf_prog_put() from the remote CPU. Therefore, move both notifications /before/ we install the fd. The issue was never seen between bpf_prog_alloc_id() and bpf_prog_new_fd() because upon bpf_prog_get_fd_by_id() we'll take another reference to the BPF prog, so we're still holding the original reference from the bpf_prog_load(). Fixes: 6ee52e2a3fe4 ("perf, bpf: Introduce PERF_RECORD_BPF_EVENT") Fixes: 74451e66d516 ("bpf: make jited programs visible in traces") Reported-by: syzbot+bd3bba6ff3fcea7a6ec6@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Song Liu --- kernel/bpf/syscall.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5d141f16f6fa..272071e9112f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1707,20 +1707,26 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (err) goto free_used_maps; - err = bpf_prog_new_fd(prog); - if (err < 0) { - /* failed to allocate fd. - * bpf_prog_put() is needed because the above - * bpf_prog_alloc_id() has published the prog - * to the userspace and the userspace may - * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. - */ - bpf_prog_put(prog); - return err; - } - + /* Upon success of bpf_prog_alloc_id(), the BPF prog is + * effectively publicly exposed. However, retrieving via + * bpf_prog_get_fd_by_id() will take another reference, + * therefore it cannot be gone underneath us. + * + * Only for the time /after/ successful bpf_prog_new_fd() + * and before returning to userspace, we might just hold + * one reference and any parallel close on that fd could + * rip everything out. Hence, below notifications must + * happen before bpf_prog_new_fd(). + * + * Also, any failure handling from this point onwards must + * be using bpf_prog_put() given the program is exposed. + */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + + err = bpf_prog_new_fd(prog); + if (err < 0) + bpf_prog_put(prog); return err; free_used_maps: From 2c238177bd7f4b14bdf7447cc1cd9bb791f147e6 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 20 Aug 2019 17:50:25 +0200 Subject: [PATCH 222/266] bpf: allow narrow loads of some sk_reuseport_md fields with offset > 0 test_select_reuseport fails on s390 due to verifier rejecting test_select_reuseport_kern.o with the following message: ; data_check.eth_protocol = reuse_md->eth_protocol; 18: (69) r1 = *(u16 *)(r6 +22) invalid bpf_context access off=22 size=2 This is because on big-endian machines casts from __u32 to __u16 are generated by referencing the respective variable as __u16 with an offset of 2 (as opposed to 0 on little-endian machines). The verifier already has all the infrastructure in place to allow such accesses, it's just that they are not explicitly enabled for eth_protocol field. Enable them for eth_protocol field by using bpf_ctx_range instead of offsetof. Ditto for ip_protocol, bind_inany and len, since they already allow narrowing, and the same problem can arise when working with them. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann --- net/core/filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 7878f918b8c0..4c6a252d4212 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8757,13 +8757,13 @@ sk_reuseport_is_valid_access(int off, int size, return size == size_default; /* Fields that allow narrowing */ - case offsetof(struct sk_reuseport_md, eth_protocol): + case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < FIELD_SIZEOF(struct sk_buff, protocol)) return false; /* fall through */ - case offsetof(struct sk_reuseport_md, ip_protocol): - case offsetof(struct sk_reuseport_md, bind_inany): - case offsetof(struct sk_reuseport_md, len): + case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): + case bpf_ctx_range(struct sk_reuseport_md, bind_inany): + case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); From 7837951a12fdaf88d2c51ff0757980c00072790c Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Sat, 24 Aug 2019 15:07:07 +1000 Subject: [PATCH 223/266] drm/mediatek: include dma-mapping header Although it builds fine here in my arm cross compile, it seems either via some other patches in -next or some Kconfig combination, this fails to build for everyone. Include linux/dma-mapping.h should fix it. Signed-off-by: Dave Airlie --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 7f5408cb2377..945bc20f1d33 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "mtk_drm_crtc.h" #include "mtk_drm_ddp.h" From 12c6bc38f99bb168b7f16bdb5e855a51a23ee9ec Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Wed, 21 Aug 2019 17:16:10 -0700 Subject: [PATCH 224/266] openvswitch: Fix log message in ovs conntrack Fixes: 06bd2bdf19d2 ("openvswitch: Add timeout support to ct action") Signed-off-by: Yi-Hung Wei Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 848c6eb55064..a1852e035ebb 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1565,7 +1565,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, case OVS_CT_ATTR_TIMEOUT: memcpy(info->timeout, nla_data(a), nla_len(a)); if (!memchr(info->timeout, '\0', nla_len(a))) { - OVS_NLERR(log, "Invalid conntrack helper"); + OVS_NLERR(log, "Invalid conntrack timeout"); return -EINVAL; } break; From e2c693934194fd3b4e795635934883354c06ebc9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 22 Aug 2019 22:19:48 +0800 Subject: [PATCH 225/266] ipv4/icmp: fix rt dst dev null pointer dereference In __icmp_send() there is a possibility that the rt->dst.dev is NULL, e,g, with tunnel collect_md mode, which will cause kernel crash. Here is what the code path looks like, for GRE: - ip6gre_tunnel_xmit - ip6gre_xmit_ipv4 - __gre6_xmit - ip6_tnl_xmit - if skb->len - t->tun_hlen - eth_hlen > mtu; return -EMSGSIZE - icmp_send - net = dev_net(rt->dst.dev); <-- here The reason is __metadata_dst_init() init dst->dev to NULL by default. We could not fix it in __metadata_dst_init() as there is no dev supplied. On the other hand, the reason we need rt->dst.dev is to get the net. So we can just try get it from skb->dev when rt->dst.dev is NULL. v4: Julian Anastasov remind skb->dev also could be NULL. We'd better still use dst.dev and do a check to avoid crash. v3: No changes. v2: fix the issue in __icmp_send() instead of updating shared dst dev in {ip_md, ip6}_tunnel_xmit. Fixes: c8b34e680a09 ("ip_tunnel: Add tnl_update_pmtu in ip_md_tunnel_xmit") Signed-off-by: Hangbin Liu Reviewed-by: Julian Anastasov Acked-by: Jonathan Lemon Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index bf7b5d45de99..4298aae74e0e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -582,7 +582,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (!rt) goto out; - net = dev_net(rt->dst.dev); + + if (rt->dst.dev) + net = dev_net(rt->dst.dev); + else if (skb_in->dev) + net = dev_net(skb_in->dev); + else + goto out; /* * Find the original header. It is expected to be valid, of course. From c3b4c3a47e05d5fecf7354d75824a9d1b37f3e84 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 22 Aug 2019 22:19:49 +0800 Subject: [PATCH 226/266] xfrm/xfrm_policy: fix dst dev null pointer dereference in collect_md mode In decode_session{4,6} there is a possibility that the skb dst dev is NULL, e,g, with tunnel collect_md mode, which will cause kernel crash. Here is what the code path looks like, for GRE: - ip6gre_tunnel_xmit - ip6gre_xmit_ipv6 - __gre6_xmit - ip6_tnl_xmit - if skb->len - t->tun_hlen - eth_hlen > mtu; return -EMSGSIZE - icmpv6_send - icmpv6_route_lookup - xfrm_decode_session_reverse - decode_session4 - oif = skb_dst(skb)->dev->ifindex; <-- here - decode_session6 - oif = skb_dst(skb)->dev->ifindex; <-- here The reason is __metadata_dst_init() init dst->dev to NULL by default. We could not fix it in __metadata_dst_init() as there is no dev supplied. On the other hand, the skb_dst(skb)->dev is actually not needed as we called decode_session{4,6} via xfrm_decode_session_reverse(), so oif is not used by: fl4->flowi4_oif = reverse ? skb->skb_iif : oif; So make a dst dev check here should be clean and safe. v4: No changes. v3: No changes. v2: fix the issue in decode_session{4,6} instead of updating shared dst dev in {ip_md, ip6}_tunnel_xmit. Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") Signed-off-by: Hangbin Liu Tested-by: Jonathan Lemon Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8ca637a72697..ec94f5795ea4 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3269,7 +3269,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; - if (skb_dst(skb)) + if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); @@ -3387,7 +3387,7 @@ decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse) nexthdr = nh[nhoff]; - if (skb_dst(skb)) + if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); From 0c69b19f92dfcc0962bbc09741677f658bc55452 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 25 Aug 2019 00:34:54 +0200 Subject: [PATCH 227/266] MAINTAINERS: Add phylink keyword to SFF/SFP/SFP+ MODULE SUPPORT Russell king maintains phylink, as part of the SFP module support. However, much of the review work is about drivers swapping from phylib to phylink. Such changes don't make changes to the phylink core, and so the F: rules in MAINTAINERS don't match. Add a K:, keywork rule, which hopefully get_maintainers will match against for patches to MAC drivers swapping to phylink. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index a744851db1df..37a0e297cf28 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14472,6 +14472,7 @@ F: drivers/net/phy/phylink.c F: drivers/net/phy/sfp* F: include/linux/phylink.h F: include/linux/sfp.h +K: phylink SGI GRU DRIVER M: Dimitri Sivanich From b45ce32135d1c82a5bf12aa56957c3fd27956057 Mon Sep 17 00:00:00 2001 From: zhanglin Date: Fri, 23 Aug 2019 09:14:11 +0800 Subject: [PATCH 228/266] sock: fix potential memory leak in proto_register() If protocols registered exceeded PROTO_INUSE_NR, prot will be added to proto_list, but no available bit left for prot in proto_inuse_idx. Changes since v2: * Propagate the error code properly Signed-off-by: zhanglin Signed-off-by: David S. Miller --- net/core/sock.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 6d08553f885c..545fac19a711 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3287,16 +3287,17 @@ static __init int net_inuse_init(void) core_initcall(net_inuse_init); -static void assign_proto_idx(struct proto *prot) +static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); - return; + return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); + return 0; } static void release_proto_idx(struct proto *prot) @@ -3305,8 +3306,9 @@ static void release_proto_idx(struct proto *prot) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else -static inline void assign_proto_idx(struct proto *prot) +static inline int assign_proto_idx(struct proto *prot) { + return 0; } static inline void release_proto_idx(struct proto *prot) @@ -3355,6 +3357,8 @@ static int req_prot_init(const struct proto *prot) int proto_register(struct proto *prot, int alloc_slab) { + int ret = -ENOBUFS; + if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, @@ -3391,20 +3395,27 @@ int proto_register(struct proto *prot, int alloc_slab) } mutex_lock(&proto_list_mutex); + ret = assign_proto_idx(prot); + if (ret) { + mutex_unlock(&proto_list_mutex); + goto out_free_timewait_sock_slab_name; + } list_add(&prot->node, &proto_list); - assign_proto_idx(prot); mutex_unlock(&proto_list_mutex); - return 0; + return ret; out_free_timewait_sock_slab_name: - kfree(prot->twsk_prot->twsk_slab_name); + if (alloc_slab && prot->twsk_prot) + kfree(prot->twsk_prot->twsk_slab_name); out_free_request_sock_slab: - req_prot_cleanup(prot->rsk_prot); + if (alloc_slab) { + req_prot_cleanup(prot->rsk_prot); - kmem_cache_destroy(prot->slab); - prot->slab = NULL; + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } out: - return -ENOBUFS; + return ret; } EXPORT_SYMBOL(proto_register); From 292a50e3fc2cf699587ea282e6253e0d6ae3cdc1 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Fri, 23 Aug 2019 11:29:23 +0200 Subject: [PATCH 229/266] s390/qeth: reject oversized SNMP requests Commit d4c08afafa04 ("s390/qeth: streamline SNMP cmd code") removed the bounds checking for req_len, under the assumption that the check in qeth_alloc_cmd() would suffice. But that code path isn't sufficiently robust to handle a user-provided data_length, which could overflow (when adding the cmd header overhead) before being checked against QETH_BUFSIZE. We end up allocating just a tiny iob, and the subsequent copy_from_user() writes past the end of that iob. Special-case this path and add a coarse bounds check, to protect against maliciuous requests. This let's the subsequent code flow do its normal job and precise checking, without risk of overflow. Fixes: d4c08afafa04 ("s390/qeth: streamline SNMP cmd code") Reported-by: Dan Carpenter Signed-off-by: Julian Wiedmann Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 9c3310c4d61d..6502b148541e 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -4374,6 +4374,10 @@ static int qeth_snmp_command(struct qeth_card *card, char __user *udata) get_user(req_len, &ureq->hdr.req_len)) return -EFAULT; + /* Sanitize user input, to avoid overflows in iob size calculation: */ + if (req_len > QETH_BUFSIZE) + return -EINVAL; + iob = qeth_get_adapter_cmd(card, IPA_SETADP_SET_SNMP_CONTROL, req_len); if (!iob) return -ENOMEM; From e93fb3e9521abffadb8f965c591a290cdd92b56c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 23 Aug 2019 17:11:38 -0700 Subject: [PATCH 230/266] net: route dump netlink NLM_F_MULTI flag missing An excerpt from netlink(7) man page, In multipart messages (multiple nlmsghdr headers with associated payload in one byte stream) the first and all following headers have the NLM_F_MULTI flag set, except for the last header which has the type NLMSG_DONE. but, after (ee28906) there is a missing NLM_F_MULTI flag in the middle of a FIB dump. The result is user space applications following above man page excerpt may get confused and may stop parsing msg believing something went wrong. In the golang netlink lib [0] the library logic stops parsing believing the message is not a multipart message. Found this running Cilium[1] against net-next while adding a feature to auto-detect routes. I noticed with multiple route tables we no longer could detect the default routes on net tree kernels because the library logic was not returning them. Fix this by handling the fib_dump_info_fnhe() case the same way the fib_dump_info() handles it by passing the flags argument through the call chain and adding a flags argument to rt_fill_info(). Tested with Cilium stack and auto-detection of routes works again. Also annotated libs to dump netlink msgs and inspected NLM_F_MULTI and NLMSG_DONE flags look correct after this. Note: In inet_rtm_getroute() pass rt_fill_info() '0' for flags the same as is done for fib_dump_info() so this looks correct to me. [0] https://github.com/vishvananda/netlink/ [1] https://github.com/cilium/ Fixes: ee28906fd7a14 ("ipv4: Dump route exceptions if requested") Signed-off-by: John Fastabend Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- include/net/route.h | 2 +- net/ipv4/fib_trie.c | 2 +- net/ipv4/route.c | 17 ++++++++++------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 630a0493f1f3..dfce19c9fa96 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -233,7 +233,7 @@ void rt_del_uncached_list(struct rtable *rt); int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, - int *fa_index, int fa_start); + int *fa_index, int fa_start, unsigned int flags); static inline void ip_rt_put(struct rtable *rt) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2b2b3d291ab0..1ab2fb6bb37d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2145,7 +2145,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, if (filter->dump_exceptions) { err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, - &i_fa, s_fa); + &i_fa, s_fa, flags); if (err < 0) goto stop; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 517300d587a7..b6a6f18c3dd1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2728,7 +2728,8 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, struct flowi4 *fl4, - struct sk_buff *skb, u32 portid, u32 seq) + struct sk_buff *skb, u32 portid, u32 seq, + unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; @@ -2736,7 +2737,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 error; u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); + nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; @@ -2860,7 +2861,7 @@ nla_put_failure: static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fnhe_hash_bucket *bucket, int genid, - int *fa_index, int fa_start) + int *fa_index, int fa_start, unsigned int flags) { int i; @@ -2891,7 +2892,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, table_id, NULL, skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, flags); if (err) return err; next: @@ -2904,7 +2905,7 @@ next: int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, - int *fa_index, int fa_start) + int *fa_index, int fa_start, unsigned int flags) { struct net *net = sock_net(cb->skb->sk); int nhsel, genid = fnhe_genid(net); @@ -2922,7 +2923,8 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, err = 0; if (bucket) err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, - genid, fa_index, fa_start); + genid, fa_index, fa_start, + flags); rcu_read_unlock(); if (err) return err; @@ -3183,7 +3185,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_tos, res.fi, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; From e0e6d062822529dbe9be21939359b0d1e065bb0f Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 23 Aug 2019 21:04:16 -0400 Subject: [PATCH 231/266] net: rds: add service level support in rds-info >From IB specific 7.6.5 SERVICE LEVEL, Service Level (SL) is used to identify different flows within an IBA subnet. It is carried in the local route header of the packet. Before this commit, run "rds-info -I". The outputs are as below: " RDS IB Connections: LocalAddr RemoteAddr Tos SL LocalDev RemoteDev 192.2.95.3 192.2.95.1 2 0 fe80::21:28:1a:39 fe80::21:28:10:b9 192.2.95.3 192.2.95.1 1 0 fe80::21:28:1a:39 fe80::21:28:10:b9 192.2.95.3 192.2.95.1 0 0 fe80::21:28:1a:39 fe80::21:28:10:b9 " After this commit, the output is as below: " RDS IB Connections: LocalAddr RemoteAddr Tos SL LocalDev RemoteDev 192.2.95.3 192.2.95.1 2 2 fe80::21:28:1a:39 fe80::21:28:10:b9 192.2.95.3 192.2.95.1 1 1 fe80::21:28:1a:39 fe80::21:28:10:b9 192.2.95.3 192.2.95.1 0 0 fe80::21:28:1a:39 fe80::21:28:10:b9 " The commit fe3475af3bdf ("net: rds: add per rds connection cache statistics") adds cache_allocs in struct rds_info_rdma_connection as below: struct rds_info_rdma_connection { ... __u32 rdma_mr_max; __u32 rdma_mr_size; __u8 tos; __u32 cache_allocs; }; The peer struct in rds-tools of struct rds_info_rdma_connection is as below: struct rds_info_rdma_connection { ... uint32_t rdma_mr_max; uint32_t rdma_mr_size; uint8_t tos; uint8_t sl; uint32_t cache_allocs; }; The difference between userspace and kernel is the member variable sl. In the kernel struct, the member variable sl is missing. This will introduce risks. So it is necessary to use this commit to avoid this risk. Fixes: fe3475af3bdf ("net: rds: add per rds connection cache statistics") CC: Joe Jin CC: JUNXIAO_BI Suggested-by: Gerd Rausch Signed-off-by: Zhu Yanjun Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 2 ++ net/rds/ib.c | 16 ++++++++++------ net/rds/ib.h | 1 + net/rds/ib_cm.c | 3 +++ net/rds/rdma_transport.c | 10 ++++++++-- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index fd6b5f66e2c5..cba368e55863 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -250,6 +250,7 @@ struct rds_info_rdma_connection { __u32 rdma_mr_max; __u32 rdma_mr_size; __u8 tos; + __u8 sl; __u32 cache_allocs; }; @@ -265,6 +266,7 @@ struct rds6_info_rdma_connection { __u32 rdma_mr_max; __u32 rdma_mr_size; __u8 tos; + __u8 sl; __u32 cache_allocs; }; diff --git a/net/rds/ib.c b/net/rds/ib.c index ec05d91aa9a2..45acab2de0cf 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -291,7 +291,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { struct rds_info_rdma_connection *iinfo = buffer; - struct rds_ib_connection *ic; + struct rds_ib_connection *ic = conn->c_transport_data; /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) @@ -301,15 +301,16 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; - iinfo->tos = conn->c_tos; + if (ic) { + iinfo->tos = conn->c_tos; + iinfo->sl = ic->i_sl; + } memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; - ic = conn->c_transport_data; - rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid, (union ib_gid *)&iinfo->dst_gid); @@ -329,7 +330,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { struct rds6_info_rdma_connection *iinfo6 = buffer; - struct rds_ib_connection *ic; + struct rds_ib_connection *ic = conn->c_transport_data; /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) @@ -337,6 +338,10 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, iinfo6->src_addr = conn->c_laddr; iinfo6->dst_addr = conn->c_faddr; + if (ic) { + iinfo6->tos = conn->c_tos; + iinfo6->sl = ic->i_sl; + } memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid)); memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid)); @@ -344,7 +349,6 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; - ic = conn->c_transport_data; rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid, (union ib_gid *)&iinfo6->dst_gid); rds_ibdev = ic->rds_ibdev; diff --git a/net/rds/ib.h b/net/rds/ib.h index 303c6ee8bdb7..f2b558e8b5ea 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -220,6 +220,7 @@ struct rds_ib_connection { /* Send/Recv vectors */ int i_scq_vector; int i_rcq_vector; + u8 i_sl; }; /* This assumes that atomic_t is at least 32 bits */ diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index fddaa09f7b0d..233f1368162b 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -152,6 +152,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even RDS_PROTOCOL_MINOR(conn->c_version), ic->i_flowctl ? ", flow control" : ""); + /* receive sl from the peer */ + ic->i_sl = ic->i_cm_id->route.path_rec->sl; + atomic_set(&ic->i_cq_quiesce, 0); /* Init rings and fill recv. this needs to wait until protocol diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 9986d6065c4d..5f741e51b4ba 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -43,6 +43,9 @@ static struct rdma_cm_id *rds_rdma_listen_id; static struct rdma_cm_id *rds6_rdma_listen_id; #endif +/* Per IB specification 7.7.3, service level is a 4-bit field. */ +#define TOS_TO_SL(tos) ((tos) & 0xF) + static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, bool isv6) @@ -97,10 +100,13 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rds_ib_connection *ibic; ibic = conn->c_transport_data; - if (ibic && ibic->i_cm_id == cm_id) + if (ibic && ibic->i_cm_id == cm_id) { + cm_id->route.path_rec[0].sl = + TOS_TO_SL(conn->c_tos); ret = trans->cm_initiate_connect(cm_id, isv6); - else + } else { rds_conn_drop(conn); + } } break; From d776aaa9895eb6eb770908e899cb7f5bd5025b3c Mon Sep 17 00:00:00 2001 From: Henry Burns Date: Sat, 24 Aug 2019 17:54:37 -0700 Subject: [PATCH 232/266] mm/z3fold.c: fix race between migration and destruction In z3fold_destroy_pool() we call destroy_workqueue(&pool->compact_wq). However, we have no guarantee that migration isn't happening in the background at that time. Migration directly calls queue_work_on(pool->compact_wq), if destruction wins that race we are using a destroyed workqueue. Link: http://lkml.kernel.org/r/20190809213828.202833-1-henryburns@google.com Signed-off-by: Henry Burns Cc: Vitaly Wool Cc: Shakeel Butt Cc: Jonathan Adams Cc: Henry Burns Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/mm/z3fold.c b/mm/z3fold.c index ed19d98c9dcd..e31cd9bd4ed5 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -145,6 +146,8 @@ struct z3fold_header { * @release_wq: workqueue for safe page release * @work: work_struct for safe page release * @inode: inode for z3fold pseudo filesystem + * @destroying: bool to stop migration once we start destruction + * @isolated: int to count the number of pages currently in isolation * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -163,8 +166,11 @@ struct z3fold_pool { const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; + struct wait_queue_head isolate_wait; struct work_struct work; struct inode *inode; + bool destroying; + int isolated; }; /* @@ -769,6 +775,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); + init_waitqueue_head(&pool->isolate_wait); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); if (!pool->unbuddied) goto out_pool; @@ -808,6 +815,15 @@ out: return NULL; } +static bool pool_isolated_are_drained(struct z3fold_pool *pool) +{ + bool ret; + + spin_lock(&pool->lock); + ret = pool->isolated == 0; + spin_unlock(&pool->lock); + return ret; +} /** * z3fold_destroy_pool() - destroys an existing z3fold pool * @pool: the z3fold pool to be destroyed @@ -817,6 +833,22 @@ out: static void z3fold_destroy_pool(struct z3fold_pool *pool) { kmem_cache_destroy(pool->c_handle); + /* + * We set pool-> destroying under lock to ensure that + * z3fold_page_isolate() sees any changes to destroying. This way we + * avoid the need for any memory barriers. + */ + + spin_lock(&pool->lock); + pool->destroying = true; + spin_unlock(&pool->lock); + + /* + * We need to ensure that no pages are being migrated while we destroy + * these workqueues, as migration can queue work on either of the + * workqueues. + */ + wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool)); /* * We need to destroy pool->compact_wq before pool->release_wq, @@ -1307,6 +1339,28 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) return atomic64_read(&pool->pages_nr); } +/* + * z3fold_dec_isolated() expects to be called while pool->lock is held. + */ +static void z3fold_dec_isolated(struct z3fold_pool *pool) +{ + assert_spin_locked(&pool->lock); + VM_BUG_ON(pool->isolated <= 0); + pool->isolated--; + + /* + * If we have no more isolated pages, we have to see if + * z3fold_destroy_pool() is waiting for a signal. + */ + if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait)) + wake_up_all(&pool->isolate_wait); +} + +static void z3fold_inc_isolated(struct z3fold_pool *pool) +{ + pool->isolated++; +} + static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) { struct z3fold_header *zhdr; @@ -1333,6 +1387,33 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) spin_lock(&pool->lock); if (!list_empty(&page->lru)) list_del(&page->lru); + /* + * We need to check for destruction while holding pool->lock, as + * otherwise destruction could see 0 isolated pages, and + * proceed. + */ + if (unlikely(pool->destroying)) { + spin_unlock(&pool->lock); + /* + * If this page isn't stale, somebody else holds a + * reference to it. Let't drop our refcount so that they + * can call the release logic. + */ + if (unlikely(kref_put(&zhdr->refcount, + release_z3fold_page_locked))) { + /* + * If we get here we have kref problems, so we + * should freak out. + */ + WARN(1, "Z3fold is experiencing kref problems\n"); + return false; + } + z3fold_page_unlock(zhdr); + return false; + } + + + z3fold_inc_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); return true; @@ -1401,6 +1482,10 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); + spin_lock(&pool->lock); + z3fold_dec_isolated(pool); + spin_unlock(&pool->lock); + page_mapcount_reset(page); put_page(page); return 0; @@ -1420,10 +1505,14 @@ static void z3fold_page_putback(struct page *page) INIT_LIST_HEAD(&page->lru); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); + spin_lock(&pool->lock); + z3fold_dec_isolated(pool); + spin_unlock(&pool->lock); return; } spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); + z3fold_dec_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); } From cd961038381f392b364a7c4a040f4576ca415b1a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sat, 24 Aug 2019 17:54:40 -0700 Subject: [PATCH 233/266] mm, page_alloc: move_freepages should not examine struct page of reserved memory After commit 907ec5fca3dc ("mm: zero remaining unavailable struct pages"), struct page of reserved memory is zeroed. This causes page->flags to be 0 and fixes issues related to reading /proc/kpageflags, for example, of reserved memory. The VM_BUG_ON() in move_freepages_block(), however, assumes that page_zone() is meaningful even for reserved memory. That assumption is no longer true after the aforementioned commit. There's no reason why move_freepages_block() should be testing the legitimacy of page_zone() for reserved memory; its scope is limited only to pages on the zone's freelist. Note that pfn_valid() can be true for reserved memory: there is a backing struct page. The check for page_to_nid(page) is also buggy but reserved memory normally only appears on node 0 so the zeroing doesn't affect this. Move the debug checks to after verifying PageBuddy is true. This isolates the scope of the checks to only be for buddy pages which are on the zone's freelist which move_freepages_block() is operating on. In this case, an incorrect node or zone is a bug worthy of being warned about (and the examination of struct page is acceptable bcause this memory is not reserved). Why does move_freepages_block() gets called on reserved memory? It's simply math after finding a valid free page from the per-zone free area to use as fallback. We find the beginning and end of the pageblock of the valid page and that can bring us into memory that was reserved per the e820. pfn_valid() is still true (it's backed by a struct page), but since it's zero'd we shouldn't make any inferences here about comparing its node or zone. The current node check just happens to succeed most of the time by luck because reserved memory typically appears on node 0. The fix here is to validate that we actually have buddy pages before testing if there's any type of zone or node strangeness going on. We noticed it almost immediately after bringing 907ec5fca3dc in on CONFIG_DEBUG_VM builds. It depends on finding specific free pages in the per-zone free area where the math in move_freepages() will bring the start or end pfn into reserved memory and wanting to claim that entire pageblock as a new migratetype. So the path will be rare, require CONFIG_DEBUG_VM, and require fallback to a different migratetype. Some struct pages were already zeroed from reserve pages before 907ec5fca3c so it theoretically could trigger before this commit. I think it's rare enough under a config option that most people don't run that others may not have noticed. I wouldn't argue against a stable tag and the backport should be easy enough, but probably wouldn't single out a commit that this is fixing. Mel said: : The overhead of the debugging check is higher with this patch although : it'll only affect debug builds and the path is not particularly hot. : If this was a concern, I think it would be reasonable to simply remove : the debugging check as the zone boundaries are checked in : move_freepages_block and we never expect a zone/node to be smaller than : a pageblock and stuck in the middle of another zone. Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1908122036560.10779@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Naoya Horiguchi Cc: Masayoshi Mizuma Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 272c6de1bf4e..9c9194959271 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2238,27 +2238,12 @@ static int move_freepages(struct zone *zone, unsigned int order; int pages_moved = 0; -#ifndef CONFIG_HOLES_IN_ZONE - /* - * page_zone is not safe to call in this context when - * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant - * anyway as we check zone boundaries in move_freepages_block(). - * Remove at a later date when no bug reports exist related to - * grouping pages by mobility - */ - VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && - pfn_valid(page_to_pfn(end_page)) && - page_zone(start_page) != page_zone(end_page)); -#endif for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); - if (!PageBuddy(page)) { /* * We assume that pages that could be isolated for @@ -2273,6 +2258,10 @@ static int move_freepages(struct zone *zone, continue; } + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); + order = page_order(page); move_to_free_area(page, &zone->free_area[order], migratetype); page += 1 << order; From bbcb03a97ffe49169f02d34eff2ced56ddaafb4f Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 24 Aug 2019 17:54:43 -0700 Subject: [PATCH 234/266] parisc: fix compilation errrors Commit 0cfaee2af3a0 ("include/asm-generic/5level-fixup.h: fix variable 'p4d' set but not used") converted a few functions from macros to static inline, which causes parisc to complain, In file included from include/asm-generic/4level-fixup.h:38:0, from arch/parisc/include/asm/pgtable.h:5, from arch/parisc/include/asm/io.h:6, from include/linux/io.h:13, from sound/core/memory.c:9: include/asm-generic/5level-fixup.h:14:18: error: unknown type name 'pgd_t'; did you mean 'pid_t'? #define p4d_t pgd_t ^ include/asm-generic/5level-fixup.h:24:28: note: in expansion of macro 'p4d_t' static inline int p4d_none(p4d_t p4d) ^~~~~ It is because "4level-fixup.h" is included before "asm/page.h" where "pgd_t" is defined. Link: http://lkml.kernel.org/r/20190815205305.1382-1-cai@lca.pw Fixes: 0cfaee2af3a0 ("include/asm-generic/5level-fixup.h: fix variable 'p4d' set but not used") Signed-off-by: Qian Cai Reported-by: Guenter Roeck Tested-by: Guenter Roeck Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/include/asm/pgtable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index a39b079e73f2..6d58c1739b42 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -2,6 +2,7 @@ #ifndef _PARISC_PGTABLE_H #define _PARISC_PGTABLE_H +#include #include #include @@ -98,8 +99,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) #endif /* !__ASSEMBLY__ */ -#include - #define pte_ERROR(e) \ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ From c350a99ea2b1b666c28948d74ab46c16913c28a7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 24 Aug 2019 17:54:47 -0700 Subject: [PATCH 235/266] mm: memcontrol: flush percpu vmstats before releasing memcg Percpu caching of local vmstats with the conditional propagation by the cgroup tree leads to an accumulation of errors on non-leaf levels. Let's imagine two nested memory cgroups A and A/B. Say, a process belonging to A/B allocates 100 pagecache pages on the CPU 0. The percpu cache will spill 3 times, so that 32*3=96 pages will be accounted to A/B and A atomic vmstat counters, 4 pages will remain in the percpu cache. Imagine A/B is nearby memory.max, so that every following allocation triggers a direct reclaim on the local CPU. Say, each such attempt will free 16 pages on a new cpu. That means every percpu cache will have -16 pages, except the first one, which will have 4 - 16 = -12. A/B and A atomic counters will not be touched at all. Now a user removes A/B. All percpu caches are freed and corresponding vmstat numbers are forgotten. A has 96 pages more than expected. As memory cgroups are created and destroyed, errors do accumulate. Even 1-2 pages differences can accumulate into large numbers. To fix this issue let's accumulate and propagate percpu vmstat values before releasing the memory cgroup. At this point these numbers are stable and cannot be changed. Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate only over online cpus. Link: http://lkml.kernel.org/r/20190819202338.363363-2-guro@fb.com Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty") Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f5c0c517c49..1a32e32e7ac3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3260,6 +3260,41 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) +{ + unsigned long stat[MEMCG_NR_STAT]; + struct mem_cgroup *mi; + int node, cpu, i; + + for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) + for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + for (i = 0; i < MEMCG_NR_STAT; i++) + atomic_long_add(stat[i], &mi->vmstats[i]); + + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + struct mem_cgroup_per_node *pi; + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + stat[i] += raw_cpu_read( + pn->lruvec_stat_cpu->count[i]); + + for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + atomic_long_add(stat[i], &pi->lruvec_stat[i]); + } +} + #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -4682,6 +4717,11 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + /* + * Flush percpu vmstats to guarantee the value correctness + * on parent's and all ancestor levels. + */ + memcg_flush_percpu_vmstats(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); From bb65f89b7d3d305c14951f49860711fbcae70692 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 24 Aug 2019 17:54:50 -0700 Subject: [PATCH 236/266] mm: memcontrol: flush percpu vmevents before releasing memcg Similar to vmstats, percpu caching of local vmevents leads to an accumulation of errors on non-leaf levels. This happens because some leftovers may remain in percpu caches, so that they are never propagated up by the cgroup tree and just disappear into nonexistence with on releasing of the memory cgroup. To fix this issue let's accumulate and propagate percpu vmevents values before releasing the memory cgroup similar to what we're doing with vmstats. Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate only over online cpus. Link: http://lkml.kernel.org/r/20190819202338.363363-4-guro@fb.com Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty") Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1a32e32e7ac3..26e2999af608 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3295,6 +3295,25 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) } } +static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) +{ + unsigned long events[NR_VM_EVENT_ITEMS]; + struct mem_cgroup *mi; + int cpu, i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + events[i] = 0; + + for_each_online_cpu(cpu) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + events[i] += raw_cpu_read( + memcg->vmstats_percpu->events[i]); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + atomic_long_add(events[i], &mi->vmevents[i]); +} + #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -4718,10 +4737,11 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) int node; /* - * Flush percpu vmstats to guarantee the value correctness + * Flush percpu vmstats and vmevents to guarantee the value correctness * on parent's and all ancestor levels. */ memcg_flush_percpu_vmstats(memcg); + memcg_flush_percpu_vmevents(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); From 7b2b55da1db10a5525460633ae4b6fb0be060c41 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 24 Aug 2019 17:54:53 -0700 Subject: [PATCH 237/266] psi: get poll_work to run when calling poll syscall next time Only when calling the poll syscall the first time can user receive POLLPRI correctly. After that, user always fails to acquire the event signal. Reproduce case: 1. Get the monitor code in Documentation/accounting/psi.txt 2. Run it, and wait for the event triggered. 3. Kill and restart the process. The question is why we can end up with poll_scheduled = 1 but the work not running (which would reset it to 0). And the answer is because the scheduling side sees group->poll_kworker under RCU protection and then schedules it, but here we cancel the work and destroy the worker. The cancel needs to pair with resetting the poll_scheduled flag. Link: http://lkml.kernel.org/r/1566357985-97781-1-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Jason Xing Signed-off-by: Joseph Qi Reviewed-by: Caspar Zhang Reviewed-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 23fbbcc414d5..6e52b67b420e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref) * deadlock while waiting for psi_poll_work to acquire trigger_lock */ if (kworker_to_destroy) { + /* + * After the RCU grace period has expired, the worker + * can no longer be found through group->poll_kworker. + * But it might have been already scheduled before + * that - deschedule it cleanly before destroying it. + */ kthread_cancel_delayed_work_sync(&group->poll_work); + atomic_set(&group->poll_scheduled, 0); + kthread_destroy_worker(kworker_to_destroy); } kfree(t); From 46d0b24c5ee10a15dfb25e20642f5a5ed59c5003 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Aug 2019 17:54:56 -0700 Subject: [PATCH 238/266] userfaultfd_release: always remove uffd flags and clear vm_userfaultfd_ctx userfaultfd_release() should clear vm_flags/vm_userfaultfd_ctx even if mm->core_state != NULL. Otherwise a page fault can see userfaultfd_missing() == T and use an already freed userfaultfd_ctx. Link: http://lkml.kernel.org/r/20190820160237.GB4983@redhat.com Fixes: 04f5866e41fb ("coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping") Signed-off-by: Oleg Nesterov Reported-by: Kefeng Wang Reviewed-by: Andrea Arcangeli Tested-by: Kefeng Wang Cc: Peter Xu Cc: Mike Rapoport Cc: Jann Horn Cc: Jason Gunthorpe Cc: Michal Hocko Cc: Tetsuo Handa Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ccbdbd62f0d8..fe6d804a38dc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -880,6 +880,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + bool still_valid; WRITE_ONCE(ctx->released, true); @@ -895,8 +896,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); - if (!mmget_still_valid(mm)) - goto skip_mm; + still_valid = mmget_still_valid(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -907,19 +907,20 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX); - if (prev) - vma = prev; - else - prev = vma; + if (still_valid) { + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; + } vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } -skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: From f7da677bc6e72033f0981b9d58b5c5d409fa641e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sat, 24 Aug 2019 17:54:59 -0700 Subject: [PATCH 239/266] mm, page_owner: handle THP splits correctly THP splitting path is missing the split_page_owner() call that split_page() has. As a result, split THP pages are wrongly reported in the page_owner file as order-9 pages. Furthermore when the former head page is freed, the remaining former tail pages are not listed in the page_owner file at all. This patch fixes that by adding the split_page_owner() call into __split_huge_page(). Link: http://lkml.kernel.org/r/20190820131828.22684-2-vbabka@suse.cz Fixes: a9627bc5e34e ("mm/page_owner: introduce split_page_owner and replace manual handling") Reported-by: Kirill A. Shutemov Signed-off-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 738065f765ab..de1f15969e27 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -2516,6 +2517,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); + + split_page_owner(head, HPAGE_PMD_ORDER); + /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ From 1a87aa03597efa9641e92875b883c94c7f872ccb Mon Sep 17 00:00:00 2001 From: Henry Burns Date: Sat, 24 Aug 2019 17:55:03 -0700 Subject: [PATCH 240/266] mm/zsmalloc.c: migration can leave pages in ZS_EMPTY indefinitely In zs_page_migrate() we call putback_zspage() after we have finished migrating all pages in this zspage. However, the return value is ignored. If a zs_free() races in between zs_page_isolate() and zs_page_migrate(), freeing the last object in the zspage, putback_zspage() will leave the page in ZS_EMPTY for potentially an unbounded amount of time. To fix this, we need to do the same thing as zs_page_putback() does: schedule free_work to occur. To avoid duplicated code, move the sequence to a new putback_zspage_deferred() function which both zs_page_migrate() and zs_page_putback() call. Link: http://lkml.kernel.org/r/20190809181751.219326-1-henryburns@google.com Fixes: 48b4800a1c6a ("zsmalloc: page migration support") Signed-off-by: Henry Burns Reviewed-by: Sergey Senozhatsky Cc: Henry Burns Cc: Minchan Kim Cc: Shakeel Butt Cc: Jonathan Adams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 57fbb7ced69f..5105b9b66653 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1862,6 +1862,18 @@ static void dec_zspage_isolation(struct zspage *zspage) zspage->isolated--; } +static void putback_zspage_deferred(struct zs_pool *pool, + struct size_class *class, + struct zspage *zspage) +{ + enum fullness_group fg; + + fg = putback_zspage(class, zspage); + if (fg == ZS_EMPTY) + schedule_work(&pool->free_work); + +} + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -2031,7 +2043,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, * the list if @page is final isolated subpage in the zspage. */ if (!is_zspage_isolated(zspage)) - putback_zspage(class, zspage); + putback_zspage_deferred(pool, class, zspage); reset_page(page); put_page(page); @@ -2077,14 +2089,13 @@ static void zs_page_putback(struct page *page) spin_lock(&class->lock); dec_zspage_isolation(zspage); if (!is_zspage_isolated(zspage)) { - fg = putback_zspage(class, zspage); /* * Due to page_lock, we cannot free zspage immediately * so let's defer. */ - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); + putback_zspage_deferred(pool, class, zspage); } + spin_unlock(&class->lock); } From 701d678599d0c1623aaf4139c03eea260a75b027 Mon Sep 17 00:00:00 2001 From: Henry Burns Date: Sat, 24 Aug 2019 17:55:06 -0700 Subject: [PATCH 241/266] mm/zsmalloc.c: fix race condition in zs_destroy_pool In zs_destroy_pool() we call flush_work(&pool->free_work). However, we have no guarantee that migration isn't happening in the background at that time. Since migration can't directly free pages, it relies on free_work being scheduled to free the pages. But there's nothing preventing an in-progress migrate from queuing the work *after* zs_unregister_migration() has called flush_work(). Which would mean pages still pointing at the inode when we free it. Since we know at destroy time all objects should be free, no new migrations can come in (since zs_page_isolate() fails for fully-free zspages). This means it is sufficient to track a "# isolated zspages" count by class, and have the destroy logic ensure all such pages have drained before proceeding. Keeping that state under the class spinlock keeps the logic straightforward. In this case a memory leak could lead to an eventual crash if compaction hits the leaked page. This crash would only occur if people are changing their zswap backend at runtime (which eventually starts destruction). Link: http://lkml.kernel.org/r/20190809181751.219326-2-henryburns@google.com Fixes: 48b4800a1c6a ("zsmalloc: page migration support") Signed-off-by: Henry Burns Reviewed-by: Sergey Senozhatsky Cc: Henry Burns Cc: Minchan Kim Cc: Shakeel Butt Cc: Jonathan Adams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5105b9b66653..08def3a0d200 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -268,6 +269,10 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; + /* A wait queue for when migration races with async_free_zspage() */ + struct wait_queue_head migration_wait; + atomic_long_t isolated_pages; + bool destroying; #endif }; @@ -1874,6 +1879,19 @@ static void putback_zspage_deferred(struct zs_pool *pool, } +static inline void zs_pool_dec_isolated(struct zs_pool *pool) +{ + VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); + atomic_long_dec(&pool->isolated_pages); + /* + * There's no possibility of racing, since wait_for_isolated_drain() + * checks the isolated count under &class->lock after enqueuing + * on migration_wait. + */ + if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) + wake_up_all(&pool->migration_wait); +} + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1943,6 +1961,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) */ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { get_zspage_mapping(zspage, &class_idx, &fullness); + atomic_long_inc(&pool->isolated_pages); remove_zspage(class, zspage, fullness); } @@ -2042,8 +2061,16 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, * Page migration is done so let's putback isolated zspage to * the list if @page is final isolated subpage in the zspage. */ - if (!is_zspage_isolated(zspage)) + if (!is_zspage_isolated(zspage)) { + /* + * We cannot race with zs_destroy_pool() here because we wait + * for isolation to hit zero before we start destroying. + * Also, we ensure that everyone can see pool->destroying before + * we start waiting. + */ putback_zspage_deferred(pool, class, zspage); + zs_pool_dec_isolated(pool); + } reset_page(page); put_page(page); @@ -2094,8 +2121,8 @@ static void zs_page_putback(struct page *page) * so let's defer. */ putback_zspage_deferred(pool, class, zspage); + zs_pool_dec_isolated(pool); } - spin_unlock(&class->lock); } @@ -2118,8 +2145,36 @@ static int zs_register_migration(struct zs_pool *pool) return 0; } +static bool pool_isolated_are_drained(struct zs_pool *pool) +{ + return atomic_long_read(&pool->isolated_pages) == 0; +} + +/* Function for resolving migration */ +static void wait_for_isolated_drain(struct zs_pool *pool) +{ + + /* + * We're in the process of destroying the pool, so there are no + * active allocations. zs_page_isolate() fails for completely free + * zspages, so we need only wait for the zs_pool's isolated + * count to hit zero. + */ + wait_event(pool->migration_wait, + pool_isolated_are_drained(pool)); +} + static void zs_unregister_migration(struct zs_pool *pool) { + pool->destroying = true; + /* + * We need a memory barrier here to ensure global visibility of + * pool->destroying. Thus pool->isolated pages will either be 0 in which + * case we don't care, or it will be > 0 and pool->destroying will + * ensure that we wake up once isolation hits 0. + */ + smp_mb(); + wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @@ -2357,6 +2412,8 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; + init_waitqueue_head(&pool->migration_wait); + if (create_cache(pool)) goto err; From 00fb24a42a68b1ee0f6495993fe1be7124433dfb Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Sat, 24 Aug 2019 17:55:09 -0700 Subject: [PATCH 242/266] mm/kasan: fix false positive invalid-free reports with CONFIG_KASAN_SW_TAGS=y The code like this: ptr = kmalloc(size, GFP_KERNEL); page = virt_to_page(ptr); offset = offset_in_page(ptr); kfree(page_address(page) + offset); may produce false-positive invalid-free reports on the kernel with CONFIG_KASAN_SW_TAGS=y. In the example above we lose the original tag assigned to 'ptr', so kfree() gets the pointer with 0xFF tag. In kfree() we check that 0xFF tag is different from the tag in shadow hence print false report. Instead of just comparing tags, do the following: 1) Check that shadow doesn't contain KASAN_TAG_INVALID. Otherwise it's double-free and it doesn't matter what tag the pointer have. 2) If pointer tag is different from 0xFF, make sure that tag in the shadow is the same as in the pointer. Link: http://lkml.kernel.org/r/20190819172540.19581-1-aryabinin@virtuozzo.com Fixes: 7f94ffbc4c6a ("kasan: add hooks implementation for tag-based mode") Signed-off-by: Andrey Ryabinin Reported-by: Walter Wu Reported-by: Mark Rutland Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Will Deacon Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2277b82902d8..95d16a42db6b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -407,8 +407,14 @@ static inline bool shadow_invalid(u8 tag, s8 shadow_byte) if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE; - else - return tag != (u8)shadow_byte; + + /* else CONFIG_KASAN_SW_TAGS: */ + if ((u8)shadow_byte == KASAN_TAG_INVALID) + return true; + if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte)) + return true; + + return false; } static bool __kasan_slab_free(struct kmem_cache *cache, void *object, From 75545304eba6a3d282f923b96a466dc25a81e359 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 25 Aug 2019 09:21:44 +0200 Subject: [PATCH 243/266] ALSA: seq: Fix potential concurrent access to the deleted pool The input pool of a client might be deleted via the resize ioctl, the the access to it should be covered by the proper locks. Currently the only missing place is the call in snd_seq_ioctl_get_client_pool(), and this patch papers over it. Reported-by: syzbot+4a75454b9ca2777f35c7@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 3 +-- sound/core/seq/seq_fifo.c | 17 +++++++++++++++++ sound/core/seq/seq_fifo.h | 2 ++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 7737b2670064..6d9592f0ae1d 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1835,8 +1835,7 @@ static int snd_seq_ioctl_get_client_pool(struct snd_seq_client *client, if (cptr->type == USER_CLIENT) { info->input_pool = cptr->data.user.fifo_pool_size; info->input_free = info->input_pool; - if (cptr->data.user.fifo) - info->input_free = snd_seq_unused_cells(cptr->data.user.fifo->pool); + info->input_free = snd_seq_fifo_unused_cells(cptr->data.user.fifo); } else { info->input_pool = 0; info->input_free = 0; diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c index ea69261f269a..eaaa8b5830bb 100644 --- a/sound/core/seq/seq_fifo.c +++ b/sound/core/seq/seq_fifo.c @@ -263,3 +263,20 @@ int snd_seq_fifo_resize(struct snd_seq_fifo *f, int poolsize) return 0; } + +/* get the number of unused cells safely */ +int snd_seq_fifo_unused_cells(struct snd_seq_fifo *f) +{ + unsigned long flags; + int cells; + + if (!f) + return 0; + + snd_use_lock_use(&f->use_lock); + spin_lock_irqsave(&f->lock, flags); + cells = snd_seq_unused_cells(f->pool); + spin_unlock_irqrestore(&f->lock, flags); + snd_use_lock_free(&f->use_lock); + return cells; +} diff --git a/sound/core/seq/seq_fifo.h b/sound/core/seq/seq_fifo.h index edc68743943d..b56a7b897c9c 100644 --- a/sound/core/seq/seq_fifo.h +++ b/sound/core/seq/seq_fifo.h @@ -53,5 +53,7 @@ int snd_seq_fifo_poll_wait(struct snd_seq_fifo *f, struct file *file, poll_table /* resize pool in fifo */ int snd_seq_fifo_resize(struct snd_seq_fifo *f, int poolsize); +/* get the number of unused cells safely */ +int snd_seq_fifo_unused_cells(struct snd_seq_fifo *f); #endif From a55aa89aab90fae7c815b0551b07be37db359d76 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Aug 2019 12:01:23 -0700 Subject: [PATCH 244/266] Linux 5.3-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9fa18613566f..f125625efd60 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 3 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Bobtail Squid # *DOCUMENTATION* From 9b5f684182403f2b338f797c44eca0061c797dc8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 25 Aug 2019 07:47:30 -0700 Subject: [PATCH 245/266] nexthop: Fix nexthop_num_path for blackhole nexthops Donald reported this sequence: ip next add id 1 blackhole ip next add id 2 blackhole ip ro add 1.1.1.1/32 nhid 1 ip ro add 1.1.1.2/32 nhid 2 would cause a crash. Backtrace is: [ 151.302790] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 151.304043] CPU: 1 PID: 277 Comm: ip Not tainted 5.3.0-rc5+ #37 [ 151.305078] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014 [ 151.306526] RIP: 0010:fib_add_nexthop+0x8b/0x2aa [ 151.307343] Code: 35 f7 81 48 8d 14 01 c7 02 f1 f1 f1 f1 c7 42 04 01 f4 f4 f4 48 89 f2 48 c1 ea 03 65 48 8b 0c 25 28 00 00 00 48 89 4d d0 31 c9 <80> 3c 02 00 74 08 48 89 f7 e8 1a e8 53 ff be 08 00 00 00 4c 89 e7 [ 151.310549] RSP: 0018:ffff888116c27340 EFLAGS: 00010246 [ 151.311469] RAX: dffffc0000000000 RBX: ffff8881154ece00 RCX: 0000000000000000 [ 151.312713] RDX: 0000000000000004 RSI: 0000000000000020 RDI: ffff888115649b40 [ 151.313968] RBP: ffff888116c273d8 R08: ffffed10221e3757 R09: ffff888110f1bab8 [ 151.315212] R10: 0000000000000001 R11: ffff888110f1bab3 R12: ffff888115649b40 [ 151.316456] R13: 0000000000000020 R14: ffff888116c273b0 R15: ffff888115649b40 [ 151.317707] FS: 00007f60b4d8d800(0000) GS:ffff88811ac00000(0000) knlGS:0000000000000000 [ 151.319113] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 151.320119] CR2: 0000555671ffdc00 CR3: 00000001136ba005 CR4: 0000000000020ee0 [ 151.321367] Call Trace: [ 151.321820] ? fib_nexthop_info+0x635/0x635 [ 151.322572] fib_dump_info+0xaa4/0xde0 [ 151.323247] ? fib_create_info+0x2431/0x2431 [ 151.324008] ? napi_alloc_frag+0x2a/0x2a [ 151.324711] rtmsg_fib+0x2c4/0x3be [ 151.325339] fib_table_insert+0xe2f/0xeee ... fib_dump_info incorrectly has nhs = 0 for blackhole nexthops, so it believes the nexthop object is a multipath group (nhs != 1) and ends up down the nexthop_mpath_fill_node() path which is wrong for a blackhole. The blackhole check in nexthop_num_path is leftover from early days of the blackhole implementation which did not initialize the device. In the end the design was simpler (fewer special case checks) to set the device to loopback in nh_info, so the check in nexthop_num_path should have been removed. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Reported-by: Donald Sharp Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 25f1f9a8419b..95f766c31c90 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -141,12 +141,6 @@ static inline unsigned int nexthop_num_path(const struct nexthop *nh) nh_grp = rcu_dereference_rtnl(nh->nh_grp); rc = nh_grp->num_nh; - } else { - const struct nh_info *nhi; - - nhi = rcu_dereference_rtnl(nh->nh_info); - if (nhi->reject_nh) - rc = 0; } return rc; From 803f3e22ae10003a83c781498c0ac34cfe3463ff Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 23 Aug 2019 20:51:43 +0300 Subject: [PATCH 246/266] ipv4: mpls: fix mpls_xmit for iptunnel When using mpls over gre/gre6 setup, rt->rt_gw4 address is not set, the same for rt->rt_gw_family. Therefore, when rt->rt_gw_family is checked in mpls_xmit(), neigh_xmit() call is skipped. As a result, such setup doesn't work anymore. This issue was found with LTP mpls03 tests. Fixes: 1550c171935d ("ipv4: Prepare rtable for IPv6 gateway") Signed-off-by: Alexey Kodanev Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/mpls/mpls_iptunnel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index d25e91d7bdc1..44b675016393 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -133,12 +133,12 @@ static int mpls_xmit(struct sk_buff *skb) mpls_stats_inc_outucastpkts(out_dev, skb); if (rt) { - if (rt->rt_gw_family == AF_INET) - err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, - skb); - else if (rt->rt_gw_family == AF_INET6) + if (rt->rt_gw_family == AF_INET6) err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6, skb); + else + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, + skb); } else if (rt6) { if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { /* 6PE (RFC 4798) */ From 7177895154e6a35179d332f4a584d396c50d0612 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Thu, 22 Aug 2019 13:17:50 -0700 Subject: [PATCH 247/266] openvswitch: Fix conntrack cache with timeout This patch addresses a conntrack cache issue with timeout policy. Currently, we do not check if the timeout extension is set properly in the cached conntrack entry. Thus, after packet recirculate from conntrack action, the timeout policy is not applied properly. This patch fixes the aforementioned issue. Fixes: 06bd2bdf19d2 ("openvswitch: Add timeout support to ct action") Reported-by: kbuild test robot Signed-off-by: Yi-Hung Wei Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a1852e035ebb..d8da6477d6be 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -67,6 +67,7 @@ struct ovs_conntrack_info { struct md_mark mark; struct md_labels labels; char timeout[CTNL_TIMEOUT_NAME_MAX]; + struct nf_ct_timeout *nf_ct_timeout; #if IS_ENABLED(CONFIG_NF_NAT) struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ #endif @@ -697,6 +698,14 @@ static bool skb_nfct_cached(struct net *net, if (help && rcu_access_pointer(help->helper) != info->helper) return false; } + if (info->nf_ct_timeout) { + struct nf_conn_timeout *timeout_ext; + + timeout_ext = nf_ct_timeout_find(ct); + if (!timeout_ext || info->nf_ct_timeout != + rcu_dereference(timeout_ext->timeout)) + return false; + } /* Force conntrack entry direction to the current packet? */ if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { /* Delete the conntrack entry if confirmed, else just release @@ -1657,6 +1666,10 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, ct_info.timeout)) pr_info_ratelimited("Failed to associated timeout " "policy `%s'\n", ct_info.timeout); + else + ct_info.nf_ct_timeout = rcu_dereference( + nf_ct_timeout_find(ct_info.ct)->timeout); + } if (helper) { From f53a7ad189594a112167efaf17ea8d0242b5ac00 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Sat, 24 Aug 2019 01:36:19 -0700 Subject: [PATCH 248/266] r8152: Set memory to all 0xFFs on failed reg reads get_registers() blindly copies the memory written to by the usb_control_msg() call even if the underlying urb failed. This could lead to junk register values being read by the driver, since some indirect callers of get_registers() ignore the return values. One example is: ocp_read_dword() ignores the return value of generic_ocp_read(), which calls get_registers(). So, emulate PCI "Master Abort" behavior by setting the buffer to all 0xFFs when usb_control_msg() fails. This patch is copied from the r8152 driver (v2.12.0) published by Realtek (www.realtek.com). Signed-off-by: Prashant Malani Acked-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 0cc03a9ff545..eee0f5007ee3 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -799,8 +799,11 @@ int get_registers(struct r8152 *tp, u16 value, u16 index, u16 size, void *data) ret = usb_control_msg(tp->udev, usb_rcvctrlpipe(tp->udev, 0), RTL8152_REQ_GET_REGS, RTL8152_REQT_READ, value, index, tmp, size, 500); + if (ret < 0) + memset(data, 0xff, size); + else + memcpy(data, tmp, size); - memcpy(data, tmp, size); kfree(tmp); return ret; From 2fd2329393658514db074abd4f7dea8da1c20f81 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 26 Aug 2019 22:55:15 +0900 Subject: [PATCH 249/266] ALSA: oxfw: fix to handle correct stream for PCM playback When userspace application calls ioctl(2) to configure hardware for PCM playback substream, ALSA OXFW driver handles incoming AMDTP stream. In this case, outgoing AMDTP stream should be handled. This commit fixes the bug for v5.3-rc kernel. Fixes: 4f380d007052 ("ALSA: oxfw: configure packet format in pcm.hw_params callback") Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai --- sound/firewire/oxfw/oxfw-pcm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/firewire/oxfw/oxfw-pcm.c b/sound/firewire/oxfw/oxfw-pcm.c index 9ea39348cdf5..7c6d1c277d4d 100644 --- a/sound/firewire/oxfw/oxfw-pcm.c +++ b/sound/firewire/oxfw/oxfw-pcm.c @@ -248,7 +248,7 @@ static int pcm_playback_hw_params(struct snd_pcm_substream *substream, unsigned int channels = params_channels(hw_params); mutex_lock(&oxfw->mutex); - err = snd_oxfw_stream_reserve_duplex(oxfw, &oxfw->tx_stream, + err = snd_oxfw_stream_reserve_duplex(oxfw, &oxfw->rx_stream, rate, channels); if (err >= 0) ++oxfw->substreams_count; From 174ae4e96e0f54958cbe3fd3090a3cefeb63af4d Mon Sep 17 00:00:00 2001 From: Mischa Jonker Date: Wed, 24 Jul 2019 14:04:34 +0200 Subject: [PATCH 250/266] ARCv2: IDU-intc: Add support for edge-triggered interrupts This adds support for an optional extra interrupt cell to specify edge vs level triggered. It is backward compatible with dts files with only one cell, and will default to level-triggered in such a case. Note that I had to make a change to idu_irq_set_affinity as well, as this function was setting the interrupt type to "level" unconditionally, since this was the only type supported previously. Signed-off-by: Mischa Jonker Reviewed-by: Vineet Gupta Signed-off-by: Vineet Gupta --- arch/arc/kernel/mcip.c | 60 +++++++++++++++++++++++++++++++++++++----- include/soc/arc/mcip.h | 11 ++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/arch/arc/kernel/mcip.c b/arch/arc/kernel/mcip.c index 18b493dfb3a8..abf9398cc333 100644 --- a/arch/arc/kernel/mcip.c +++ b/arch/arc/kernel/mcip.c @@ -202,8 +202,8 @@ static void idu_set_dest(unsigned int cmn_irq, unsigned int cpu_mask) __mcip_cmd_data(CMD_IDU_SET_DEST, cmn_irq, cpu_mask); } -static void idu_set_mode(unsigned int cmn_irq, unsigned int lvl, - unsigned int distr) +static void idu_set_mode(unsigned int cmn_irq, bool set_lvl, unsigned int lvl, + bool set_distr, unsigned int distr) { union { unsigned int word; @@ -212,8 +212,11 @@ static void idu_set_mode(unsigned int cmn_irq, unsigned int lvl, }; } data; - data.distr = distr; - data.lvl = lvl; + data.word = __mcip_cmd_read(CMD_IDU_READ_MODE, cmn_irq); + if (set_distr) + data.distr = distr; + if (set_lvl) + data.lvl = lvl; __mcip_cmd_data(CMD_IDU_SET_MODE, cmn_irq, data.word); } @@ -240,6 +243,25 @@ static void idu_irq_unmask(struct irq_data *data) raw_spin_unlock_irqrestore(&mcip_lock, flags); } +static void idu_irq_ack(struct irq_data *data) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&mcip_lock, flags); + __mcip_cmd(CMD_IDU_ACK_CIRQ, data->hwirq); + raw_spin_unlock_irqrestore(&mcip_lock, flags); +} + +static void idu_irq_mask_ack(struct irq_data *data) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&mcip_lock, flags); + __mcip_cmd_data(CMD_IDU_SET_MASK, data->hwirq, 1); + __mcip_cmd(CMD_IDU_ACK_CIRQ, data->hwirq); + raw_spin_unlock_irqrestore(&mcip_lock, flags); +} + static int idu_irq_set_affinity(struct irq_data *data, const struct cpumask *cpumask, bool force) @@ -263,13 +285,36 @@ idu_irq_set_affinity(struct irq_data *data, const struct cpumask *cpumask, else distribution_mode = IDU_M_DISTRI_RR; - idu_set_mode(data->hwirq, IDU_M_TRIG_LEVEL, distribution_mode); + idu_set_mode(data->hwirq, false, 0, true, distribution_mode); raw_spin_unlock_irqrestore(&mcip_lock, flags); return IRQ_SET_MASK_OK; } +static int idu_irq_set_type(struct irq_data *data, u32 type) +{ + unsigned long flags; + + /* + * ARCv2 IDU HW does not support inverse polarity, so these are the + * only interrupt types supported. + */ + if (type & ~(IRQ_TYPE_EDGE_RISING | IRQ_TYPE_LEVEL_HIGH)) + return -EINVAL; + + raw_spin_lock_irqsave(&mcip_lock, flags); + + idu_set_mode(data->hwirq, true, + type & IRQ_TYPE_EDGE_RISING ? IDU_M_TRIG_EDGE : + IDU_M_TRIG_LEVEL, + false, 0); + + raw_spin_unlock_irqrestore(&mcip_lock, flags); + + return 0; +} + static void idu_irq_enable(struct irq_data *data) { /* @@ -289,7 +334,10 @@ static struct irq_chip idu_irq_chip = { .name = "MCIP IDU Intc", .irq_mask = idu_irq_mask, .irq_unmask = idu_irq_unmask, + .irq_ack = idu_irq_ack, + .irq_mask_ack = idu_irq_mask_ack, .irq_enable = idu_irq_enable, + .irq_set_type = idu_irq_set_type, #ifdef CONFIG_SMP .irq_set_affinity = idu_irq_set_affinity, #endif @@ -317,7 +365,7 @@ static int idu_irq_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t } static const struct irq_domain_ops idu_irq_ops = { - .xlate = irq_domain_xlate_onecell, + .xlate = irq_domain_xlate_onetwocell, .map = idu_irq_map, }; diff --git a/include/soc/arc/mcip.h b/include/soc/arc/mcip.h index 50f49e043668..d1a93c73f006 100644 --- a/include/soc/arc/mcip.h +++ b/include/soc/arc/mcip.h @@ -46,7 +46,9 @@ struct mcip_cmd { #define CMD_IDU_ENABLE 0x71 #define CMD_IDU_DISABLE 0x72 #define CMD_IDU_SET_MODE 0x74 +#define CMD_IDU_READ_MODE 0x75 #define CMD_IDU_SET_DEST 0x76 +#define CMD_IDU_ACK_CIRQ 0x79 #define CMD_IDU_SET_MASK 0x7C #define IDU_M_TRIG_LEVEL 0x0 @@ -119,4 +121,13 @@ static inline void __mcip_cmd_data(unsigned int cmd, unsigned int param, __mcip_cmd(cmd, param); } +/* + * Read MCIP register + */ +static inline unsigned int __mcip_cmd_read(unsigned int cmd, unsigned int param) +{ + __mcip_cmd(cmd, param); + return read_aux_reg(ARC_REG_MCIP_READBACK); +} + #endif From 01449985e644329e1fd5c269fff07b9a539eeebf Mon Sep 17 00:00:00 2001 From: Mischa Jonker Date: Wed, 24 Jul 2019 14:04:35 +0200 Subject: [PATCH 251/266] dt-bindings: IDU-intc: Clean up documentation * Some lines exceeded 80 characters. * Clarified statement about AUX register interface Signed-off-by: Mischa Jonker Reviewed-by: Rob Herring Signed-off-by: Vineet Gupta --- .../interrupt-controller/snps,archs-idu-intc.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/snps,archs-idu-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/snps,archs-idu-intc.txt index 09fc02b99845..c5a1c7b4fc3f 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/snps,archs-idu-intc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/snps,archs-idu-intc.txt @@ -1,7 +1,8 @@ * ARC-HS Interrupt Distribution Unit - This optional 2nd level interrupt controller can be used in SMP configurations for - dynamic IRQ routing, load balancing of common/external IRQs towards core intc. + This optional 2nd level interrupt controller can be used in SMP configurations + for dynamic IRQ routing, load balancing of common/external IRQs towards core + intc. Properties: @@ -13,8 +14,8 @@ Properties: of the particular interrupt line of IDU corresponds to the line N+24 of the core interrupt controller. - intc accessed via the special ARC AUX register interface, hence "reg" property - is not specified. + The interrupt controller is accessed via the special ARC AUX register + interface, hence "reg" property is not specified. Example: core_intc: core-interrupt-controller { From d85f6b93a76e74f1cbd0c14fb685cc1bc8df9044 Mon Sep 17 00:00:00 2001 From: Mischa Jonker Date: Wed, 24 Jul 2019 14:04:36 +0200 Subject: [PATCH 252/266] dt-bindings: IDU-intc: Add support for edge-triggered interrupts This updates the documentation for supporting an optional extra interrupt cell to specify edge vs level triggered. Signed-off-by: Mischa Jonker Reviewed-by: Rob Herring Signed-off-by: Vineet Gupta --- .../snps,archs-idu-intc.txt | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/snps,archs-idu-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/snps,archs-idu-intc.txt index c5a1c7b4fc3f..a5c1db95b3ec 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/snps,archs-idu-intc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/snps,archs-idu-intc.txt @@ -8,11 +8,20 @@ Properties: - compatible: "snps,archs-idu-intc" - interrupt-controller: This is an interrupt controller. -- #interrupt-cells: Must be <1>. +- #interrupt-cells: Must be <1> or <2>. - Value of the cell specifies the "common" IRQ from peripheral to IDU. Number N - of the particular interrupt line of IDU corresponds to the line N+24 of the - core interrupt controller. + Value of the first cell specifies the "common" IRQ from peripheral to IDU. + Number N of the particular interrupt line of IDU corresponds to the line N+24 + of the core interrupt controller. + + The (optional) second cell specifies any of the following flags: + - bits[3:0] trigger type and level flags + 1 = low-to-high edge triggered + 2 = NOT SUPPORTED (high-to-low edge triggered) + 4 = active high level-sensitive <<< DEFAULT + 8 = NOT SUPPORTED (active low level-sensitive) + When no second cell is specified, the interrupt is assumed to be level + sensitive. The interrupt controller is accessed via the special ARC AUX register interface, hence "reg" property is not specified. From 2f029413cbfbfe519d294c6ac83a0c00e2a48a97 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 12 Aug 2019 14:50:35 -0700 Subject: [PATCH 253/266] arc: prefer __section from compiler_attributes.h Reported-by: Sedat Dilek Suggested-by: Josh Poimboeuf Signed-off-by: Nick Desaulniers Signed-off-by: Vineet Gupta --- arch/arc/include/asm/linkage.h | 8 ++++---- arch/arc/include/asm/mach_desc.h | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arc/include/asm/linkage.h b/arch/arc/include/asm/linkage.h index a0eeb9f8f0a9..d9ee43c6b7db 100644 --- a/arch/arc/include/asm/linkage.h +++ b/arch/arc/include/asm/linkage.h @@ -62,15 +62,15 @@ #else /* !__ASSEMBLY__ */ #ifdef CONFIG_ARC_HAS_ICCM -#define __arcfp_code __attribute__((__section__(".text.arcfp"))) +#define __arcfp_code __section(.text.arcfp) #else -#define __arcfp_code __attribute__((__section__(".text"))) +#define __arcfp_code __section(.text) #endif #ifdef CONFIG_ARC_HAS_DCCM -#define __arcfp_data __attribute__((__section__(".data.arcfp"))) +#define __arcfp_data __section(.data.arcfp) #else -#define __arcfp_data __attribute__((__section__(".data"))) +#define __arcfp_data __section(.data) #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/arc/include/asm/mach_desc.h b/arch/arc/include/asm/mach_desc.h index 8ac0e2ac3e70..73746ed5b834 100644 --- a/arch/arc/include/asm/mach_desc.h +++ b/arch/arc/include/asm/mach_desc.h @@ -53,8 +53,7 @@ extern const struct machine_desc __arch_info_begin[], __arch_info_end[]; */ #define MACHINE_START(_type, _name) \ static const struct machine_desc __mach_desc_##_type \ -__used \ -__attribute__((__section__(".arch.info.init"))) = { \ +__used __section(.arch.info.init) = { \ .name = _name, #define MACHINE_END \ From bd736ed3e2d1088d9b4050f727342e1e619c3841 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Aug 2019 17:26:17 -0400 Subject: [PATCH 254/266] SUNRPC: Don't handle errors if the bind/connect succeeded Don't handle errors in call_bind_status()/call_connect_status() if it turns out that a previous call caused it to succeed. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v5.1+ --- net/sunrpc/clnt.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d8679b6027e9..3b731411d8e8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1970,6 +1970,7 @@ call_bind(struct rpc_task *task) static void call_bind_status(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; int status = -EIO; if (rpc_task_transmitted(task)) { @@ -1977,14 +1978,15 @@ call_bind_status(struct rpc_task *task) return; } - if (task->tk_status >= 0) { - dprint_status(task); + dprint_status(task); + trace_rpc_bind_status(task); + if (task->tk_status >= 0) + goto out_next; + if (xprt_bound(xprt)) { task->tk_status = 0; - task->tk_action = call_connect; - return; + goto out_next; } - trace_rpc_bind_status(task); switch (task->tk_status) { case -ENOMEM: dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid); @@ -2043,7 +2045,9 @@ call_bind_status(struct rpc_task *task) rpc_call_rpcerror(task, status); return; - +out_next: + task->tk_action = call_connect; + return; retry_timeout: task->tk_status = 0; task->tk_action = call_bind; @@ -2090,6 +2094,7 @@ call_connect(struct rpc_task *task) static void call_connect_status(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; @@ -2099,8 +2104,17 @@ call_connect_status(struct rpc_task *task) } dprint_status(task); - trace_rpc_connect_status(task); + + if (task->tk_status == 0) { + clnt->cl_stats->netreconn++; + goto out_next; + } + if (xprt_connected(xprt)) { + task->tk_status = 0; + goto out_next; + } + task->tk_status = 0; switch (status) { case -ECONNREFUSED: @@ -2131,13 +2145,12 @@ call_connect_status(struct rpc_task *task) case -EAGAIN: case -ETIMEDOUT: goto out_retry; - case 0: - clnt->cl_stats->netreconn++; - task->tk_action = call_transmit; - return; } rpc_call_rpcerror(task, status); return; +out_next: + task->tk_action = call_transmit; + return; out_retry: /* Check for timeouts before looping back to call_bind */ task->tk_action = call_bind; From bf2bf9b80e0cd3568ddc85a241abe0dd8b46ebdc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Aug 2019 20:18:48 -0400 Subject: [PATCH 255/266] pNFS/flexfiles: Turn off soft RPC calls The pNFS/flexfiles I/O requests are sent with the SOFTCONN flag set, so they automatically time out if the connection breaks. It should therefore not be necessary to have the soft flag set in addition. Fixes: 5f01d9539496 ("nfs41: create NFSv3 DS connection if specified") Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index c0046c348910..82af4809b869 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -627,11 +627,16 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_test_and_add_xprt, NULL); - } else - clp = get_v3_ds_connect(mds_srv, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans); + continue; + } + clp = get_v3_ds_connect(mds_srv, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans); + if (IS_ERR(clp)) + continue; + clp->cl_rpcclient->cl_softerr = 0; + clp->cl_rpcclient->cl_softrtry = 0; } if (IS_ERR(clp)) { From 80f455da6cd0998a5be30a8af24ea2a22815c212 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Aug 2019 22:55:19 -0400 Subject: [PATCH 256/266] SUNRPC: Handle EADDRINUSE and ENOBUFS correctly If a connect or bind attempt returns EADDRINUSE, that means we want to retry with a different port. It is not a fatal connection error. Similarly, ENOBUFS is not fatal, but just indicates a memory allocation issue. Retry after a short delay. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3b731411d8e8..8a25440b771c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2005,6 +2005,9 @@ call_bind_status(struct rpc_task *task) task->tk_rebind_retry--; rpc_delay(task, 3*HZ); goto retry_timeout; + case -ENOBUFS: + rpc_delay(task, HZ >> 2); + goto retry_timeout; case -EAGAIN: goto retry_timeout; case -ETIMEDOUT: @@ -2028,7 +2031,6 @@ call_bind_status(struct rpc_task *task) case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: - case -ENOBUFS: case -EPIPE: dprintk("RPC: %5u remote rpcbind unreachable: %d\n", task->tk_pid, task->tk_status); @@ -2131,8 +2133,6 @@ call_connect_status(struct rpc_task *task) case -ENETDOWN: case -ENETUNREACH: case -EHOSTUNREACH: - case -EADDRINUSE: - case -ENOBUFS: case -EPIPE: xprt_conditional_disconnect(task->tk_rqstp->rq_xprt, task->tk_rqstp->rq_connect_cookie); @@ -2141,10 +2141,14 @@ call_connect_status(struct rpc_task *task) /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); /* fall through */ + case -EADDRINUSE: case -ENOTCONN: case -EAGAIN: case -ETIMEDOUT: goto out_retry; + case -ENOBUFS: + rpc_delay(task, HZ >> 2); + goto out_retry; } rpc_call_rpcerror(task, status); return; From d5711920ec6e578f51db95caa6f185f5090b865e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 16 Aug 2019 08:37:26 -0400 Subject: [PATCH 257/266] Revert "NFSv4/flexfiles: Abort I/O early if the layout segment was invalidated" This reverts commit a79f194aa4879e9baad118c3f8bb2ca24dbef765. The mechanism for aborting I/O is racy, since we are not guaranteed that the request is asleep while we're changing both task->tk_status and task->tk_action. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v5.1 --- fs/nfs/flexfilelayout/flexfilelayout.c | 17 ----------------- include/linux/sunrpc/sched.h | 1 - net/sunrpc/xprt.c | 7 ------- 3 files changed, 25 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b04e20d28162..2c7e1eca1ed7 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1148,8 +1148,6 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, break; case -NFS4ERR_RETRY_UNCACHED_REP: break; - case -EAGAIN: - return -NFS4ERR_RESET_TO_PNFS; /* Invalidate Layout errors */ case -NFS4ERR_PNFS_NO_LAYOUT: case -ESTALE: /* mapped NFS4ERR_STALE */ @@ -1210,7 +1208,6 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EBADHANDLE: case -ELOOP: case -ENOSPC: - case -EAGAIN: break; case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); @@ -1445,16 +1442,6 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) ff_layout_read_prepare_common(task, hdr); } -static void -ff_layout_io_prepare_transmit(struct rpc_task *task, - void *data) -{ - struct nfs_pgio_header *hdr = data; - - if (!pnfs_is_valid_lseg(hdr->lseg)) - rpc_exit(task, -EAGAIN); -} - static void ff_layout_read_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; @@ -1740,7 +1727,6 @@ static void ff_layout_commit_release(void *data) static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1748,7 +1734,6 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1756,7 +1741,6 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1764,7 +1748,6 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, - .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index baa3ecdb882f..27536b961552 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -98,7 +98,6 @@ typedef void (*rpc_action)(struct rpc_task *); struct rpc_call_ops { void (*rpc_call_prepare)(struct rpc_task *, void *); - void (*rpc_call_prepare_transmit)(struct rpc_task *, void *); void (*rpc_call_done)(struct rpc_task *, void *); void (*rpc_count_stats)(struct rpc_task *, void *); void (*rpc_release)(void *); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 783748dc5e6f..2e71f5455c6c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1408,13 +1408,6 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) status = -EBADMSG; goto out_dequeue; } - if (task->tk_ops->rpc_call_prepare_transmit) { - task->tk_ops->rpc_call_prepare_transmit(task, - task->tk_calldata); - status = task->tk_status; - if (status < 0) - goto out_dequeue; - } if (RPC_SIGNALLED(task)) { status = -ERESTARTSYS; goto out_dequeue; From c82e5472c9980e0e483f4b689044150eefaca408 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 16 Aug 2019 08:58:48 -0400 Subject: [PATCH 258/266] SUNRPC: Handle connection breakages correctly in call_status() If the connection breaks while we're waiting for a reply from the server, then we want to immediately try to reconnect. Fixes: ec6017d90359 ("SUNRPC fix regression in umount of a secure mount") Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8a25440b771c..a07b516e503a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2382,7 +2382,7 @@ call_status(struct rpc_task *task) case -ECONNABORTED: case -ENOTCONN: rpc_force_rebind(clnt); - /* fall through */ + break; case -EADDRINUSE: rpc_delay(task, 3*HZ); /* fall through */ From 7af46292dadcf8870946916f79fdddf79bd7267f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Aug 2019 08:37:50 -0400 Subject: [PATCH 259/266] pNFS/flexfiles: Don't time out requests on hard mounts If the mount is hard, we should ignore the 'io_maxretrans' module parameter so that we always keep retrying. Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 2c7e1eca1ed7..5657b7f2611f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -928,7 +929,9 @@ retry: pgm = &pgio->pg_mirrors[0]; pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; - pgio->pg_maxretrans = io_maxretrans; + if (NFS_SERVER(pgio->pg_inode)->flags & + (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) + pgio->pg_maxretrans = io_maxretrans; return; out_nolseg: if (pgio->pg_error < 0) @@ -940,6 +943,7 @@ out_mds: pgio->pg_lseg); pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; + pgio->pg_maxretrans = 0; nfs_pageio_reset_read_mds(pgio); } @@ -1000,7 +1004,9 @@ retry: pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; } - pgio->pg_maxretrans = io_maxretrans; + if (NFS_SERVER(pgio->pg_inode)->flags & + (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) + pgio->pg_maxretrans = io_maxretrans; return; out_mds: @@ -1010,6 +1016,7 @@ out_mds: pgio->pg_lseg); pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; + pgio->pg_maxretrans = 0; nfs_pageio_reset_write_mds(pgio); } From 8f54c7a4babf58bbaf849e126f7ae9664bdc9e04 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Aug 2019 12:26:05 -0400 Subject: [PATCH 260/266] NFS: Fix spurious EIO read errors If the client attempts to read a page, but the read fails due to some spurious error (e.g. an ACCESS error or a timeout, ...) then we need to allow other processes to retry. Also try to report errors correctly when doing a synchronous readpage. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 10 ++++++++++ fs/nfs/read.c | 35 ++++++++++++++++++++++++++--------- fs/nfs/write.c | 12 ------------ 3 files changed, 36 insertions(+), 21 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a2346a2f8361..e64f810223be 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -775,3 +775,13 @@ static inline bool nfs_error_is_fatal(int err) } } +static inline bool nfs_error_is_fatal_on_server(int err) +{ + switch (err) { + case 0: + case -ERESTARTSYS: + case -EINTR: + return false; + } + return nfs_error_is_fatal(err); +} diff --git a/fs/nfs/read.c b/fs/nfs/read.c index c19841c82b6a..cfe0b586eadd 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -91,19 +91,25 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); -static void nfs_readpage_release(struct nfs_page *req) +static void nfs_readpage_release(struct nfs_page *req, int error) { struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); + struct page *page = req->wb_page; dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), req->wb_bytes, (long long)req_offset(req)); + if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) + SetPageError(page); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(inode, req->wb_page, 0); + struct address_space *mapping = page_file_mapping(page); - unlock_page(req->wb_page); + if (PageUptodate(page)) + nfs_readpage_to_fscache(inode, page, 0); + else if (!PageError(page) && !PagePrivate(page)) + generic_error_remove_page(mapping, page); + unlock_page(page); } nfs_release_request(req); } @@ -131,7 +137,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, &nfs_async_read_completion_ops); if (!nfs_pageio_add_request(&pgio, new)) { nfs_list_remove_request(new); - nfs_readpage_release(new); + nfs_readpage_release(new, pgio.pg_error); } nfs_pageio_complete(&pgio); @@ -153,6 +159,7 @@ static void nfs_page_group_set_uptodate(struct nfs_page *req) static void nfs_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; + int error; if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) goto out; @@ -179,14 +186,19 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) zero_user_segment(page, start, end); } } + error = 0; bytes += req->wb_bytes; if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (bytes <= hdr->good_bytes) nfs_page_group_set_uptodate(req); + else { + error = hdr->error; + xchg(&nfs_req_openctx(req)->error, error); + } } else nfs_page_group_set_uptodate(req); nfs_list_remove_request(req); - nfs_readpage_release(req); + nfs_readpage_release(req, error); } out: hdr->release(hdr); @@ -213,7 +225,7 @@ nfs_async_read_error(struct list_head *head, int error) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - nfs_readpage_release(req); + nfs_readpage_release(req, error); } } @@ -337,8 +349,13 @@ int nfs_readpage(struct file *file, struct page *page) goto out; } + xchg(&ctx->error, 0); error = nfs_readpage_async(ctx, inode, page); - + if (!error) { + error = wait_on_page_locked_killable(page); + if (!PageUptodate(page) && !error) + error = xchg(&ctx->error, 0); + } out: put_nfs_open_context(ctx); return error; @@ -372,8 +389,8 @@ readpage_async_filler(void *data, struct page *page) zero_user_segment(page, len, PAGE_SIZE); if (!nfs_pageio_add_request(desc->pgio, new)) { nfs_list_remove_request(new); - nfs_readpage_release(new); error = desc->pgio->pg_error; + nfs_readpage_release(new, error); goto out; } return 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3399149435ce..cee9905e419c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -599,18 +599,6 @@ static void nfs_write_error(struct nfs_page *req, int error) nfs_release_request(req); } -static bool -nfs_error_is_fatal_on_server(int err) -{ - switch (err) { - case 0: - case -ERESTARTSYS: - case -EINTR: - return false; - } - return nfs_error_is_fatal(err); -} - /* * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). From 96c4145599b30c0eb6cbeaa24207802452dd1872 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 24 Aug 2019 10:39:00 -0400 Subject: [PATCH 261/266] NFS: Fix writepage(s) error handling to not report errors twice If writepage()/writepages() saw an error, but handled it without reporting it, we should not be re-reporting that error on exit. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cee9905e419c..d193042fa228 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -621,12 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); /* If there is a fatal error that covers this write, just exit */ - ret = 0; mapping = page_file_mapping(page); - if (test_bit(AS_ENOSPC, &mapping->flags) || - test_bit(AS_EIO, &mapping->flags)) + ret = pgio->pg_error; + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -638,6 +638,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, } else ret = -EAGAIN; nfs_redirty_request(req); + pgio->pg_error = 0; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); @@ -657,7 +658,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, ret = nfs_page_async_flush(pgio, page); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); - ret = 0; + ret = AOP_WRITEPAGE_ACTIVATE; } return ret; } @@ -676,10 +677,11 @@ static int nfs_writepage_locked(struct page *page, nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); + pgio.pg_error = 0; nfs_pageio_complete(&pgio); if (err < 0) return err; - if (pgio.pg_error < 0) + if (nfs_error_is_fatal(pgio.pg_error)) return pgio.pg_error; return 0; } @@ -689,7 +691,8 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) int ret; ret = nfs_writepage_locked(page, wbc); - unlock_page(page); + if (ret != AOP_WRITEPAGE_ACTIVATE) + unlock_page(page); return ret; } @@ -698,7 +701,8 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int ret; ret = nfs_do_writepage(page, wbc, data); - unlock_page(page); + if (ret != AOP_WRITEPAGE_ACTIVATE) + unlock_page(page); return ret; } @@ -724,13 +728,14 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); + pgio.pg_error = 0; nfs_pageio_complete(&pgio); nfs_io_completion_put(ioc); if (err < 0) goto out_err; err = pgio.pg_error; - if (err < 0) + if (nfs_error_is_fatal(err)) goto out_err; return 0; out_err: From 5752bc4373b21c3fb1dd6db4dcdd569fae391a1d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Jul 2019 14:53:02 +0200 Subject: [PATCH 262/266] mfd: rk808: Mark pm functions __maybe_unused The newly added suspend/resume functions are only used if CONFIG_PM is enabled: drivers/mfd/rk808.c:752:12: error: 'rk8xx_resume' defined but not used [-Werror=unused-function] drivers/mfd/rk808.c:732:12: error: 'rk8xx_suspend' defined but not used [-Werror=unused-function] Mark them as __maybe_unused so the compiler can silently drop them when they are not needed. Fixes: 586c1b4125b3 ("mfd: rk808: Add RK817 and RK809 support") Signed-off-by: Arnd Bergmann Signed-off-by: Lee Jones --- drivers/mfd/rk808.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c index 601cefb5c9d8..9a9e6315ba46 100644 --- a/drivers/mfd/rk808.c +++ b/drivers/mfd/rk808.c @@ -729,7 +729,7 @@ static int rk808_remove(struct i2c_client *client) return 0; } -static int rk8xx_suspend(struct device *dev) +static int __maybe_unused rk8xx_suspend(struct device *dev) { struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client); int ret = 0; @@ -749,7 +749,7 @@ static int rk8xx_suspend(struct device *dev) return ret; } -static int rk8xx_resume(struct device *dev) +static int __maybe_unused rk8xx_resume(struct device *dev) { struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client); int ret = 0; From 4d82fa67dd6b0e2635ae9dad44fbf3d747eca9ed Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 27 Aug 2019 07:39:50 +0100 Subject: [PATCH 263/266] mfd: rk808: Make PM function declaration static Avoids: ../drivers/mfd/rk808.c:771:1: warning: symbol 'rk8xx_pm_ops' \ was not declared. Should it be static? Fixes: 5752bc4373b2 ("mfd: rk808: Mark pm functions __maybe_unused") Reviewed-by: Arnd Bergmann Signed-off-by: Lee Jones --- drivers/mfd/rk808.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c index 9a9e6315ba46..050478cabc95 100644 --- a/drivers/mfd/rk808.c +++ b/drivers/mfd/rk808.c @@ -768,7 +768,7 @@ static int __maybe_unused rk8xx_resume(struct device *dev) return ret; } -SIMPLE_DEV_PM_OPS(rk8xx_pm_ops, rk8xx_suspend, rk8xx_resume); +static SIMPLE_DEV_PM_OPS(rk8xx_pm_ops, rk8xx_suspend, rk8xx_resume); static struct i2c_driver rk808_i2c_driver = { .driver = { From 71affe9be45a5c60b9772e1b2701710712637274 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Aug 2019 20:41:16 -0400 Subject: [PATCH 264/266] NFSv2: Fix eof handling If we received a reply from the server with a zero length read and no error, then that implies we are at eof. Signed-off-by: Trond Myklebust --- fs/nfs/proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 5552fa8b6e12..ec79d2214a78 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -594,7 +594,8 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ - if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) + if ((hdr->res.count == 0 && hdr->args.count > 0) || + hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) hdr->res.eof = 1; } return 0; From d33d4beb522987d1c305c12500796f9be3687dee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 27 Aug 2019 07:03:28 -0400 Subject: [PATCH 265/266] NFSv2: Fix write regression Ensure we update the write result count on success, since the RPC call itself does not do so. Reported-by: Jan Stancek Reported-by: Naresh Kamboju Signed-off-by: Trond Myklebust Tested-by: Jan Stancek --- fs/nfs/proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ec79d2214a78..0f7288b94633 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -616,8 +616,10 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - if (task->tk_status >= 0) + if (task->tk_status >= 0) { + hdr->res.count = hdr->args.count; nfs_writeback_update_inode(hdr); + } return 0; } From 99300a85260c2b7febd57082a617d1062532067e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 27 Aug 2019 15:16:36 +0800 Subject: [PATCH 266/266] NFS: remove set but not used variable 'mapping' Fixes gcc '-Wunused-but-set-variable' warning: fs/nfs/write.c: In function nfs_page_async_flush: fs/nfs/write.c:609:24: warning: variable mapping set but not used [-Wunused-but-set-variable] It is not use since commit aefb623c422e ("NFS: Fix writepage(s) error handling to not report errors twice") Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d193042fa228..85ca49549b39 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -606,7 +606,6 @@ static void nfs_write_error(struct nfs_page *req, int error) static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct page *page) { - struct address_space *mapping; struct nfs_page *req; int ret = 0; @@ -621,7 +620,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); /* If there is a fatal error that covers this write, just exit */ - mapping = page_file_mapping(page); ret = pgio->pg_error; if (nfs_error_is_fatal_on_server(ret)) goto out_launder;