From 3b9ea7206d7e1fdd7419cbd10badd3b2c80d04b4 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 14 Feb 2021 19:49:11 +0100 Subject: [PATCH 001/164] ath9k: fix transmitting to stations in dynamic SMPS mode When transmitting to a receiver in dynamic SMPS mode, all transmissions that use multiple spatial streams need to be sent using CTS-to-self or RTS/CTS to give the receiver's extra chains some time to wake up. This fixes the tx rate getting stuck at <= MCS7 for some clients, especially Intel ones, which make aggressive use of SMPS. Cc: stable@vger.kernel.org Reported-by: Martin Kennedy Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210214184911.96702-1-nbd@nbd.name --- drivers/net/wireless/ath/ath9k/ath9k.h | 3 ++- drivers/net/wireless/ath/ath9k/xmit.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h index 13b4f5f50f8a..ef6f5ea06c1f 100644 --- a/drivers/net/wireless/ath/ath9k/ath9k.h +++ b/drivers/net/wireless/ath/ath9k/ath9k.h @@ -177,7 +177,8 @@ struct ath_frame_info { s8 txq; u8 keyix; u8 rtscts_rate; - u8 retries : 7; + u8 retries : 6; + u8 dyn_smps : 1; u8 baw_tracked : 1; u8 tx_power; enum ath9k_key_type keytype:2; diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c index e60d4737fc6e..5691bd6eb82c 100644 --- a/drivers/net/wireless/ath/ath9k/xmit.c +++ b/drivers/net/wireless/ath/ath9k/xmit.c @@ -1271,6 +1271,11 @@ static void ath_buf_set_rate(struct ath_softc *sc, struct ath_buf *bf, is_40, is_sgi, is_sp); if (rix < 8 && (tx_info->flags & IEEE80211_TX_CTL_STBC)) info->rates[i].RateFlags |= ATH9K_RATESERIES_STBC; + if (rix >= 8 && fi->dyn_smps) { + info->rates[i].RateFlags |= + ATH9K_RATESERIES_RTS_CTS; + info->flags |= ATH9K_TXDESC_CTSENA; + } info->txpower[i] = ath_get_rate_txpower(sc, bf, rix, is_40, false); @@ -2114,6 +2119,7 @@ static void setup_frame_info(struct 
ieee80211_hw *hw, fi->keyix = an->ps_key; else fi->keyix = ATH9K_TXKEYIX_INVALID; + fi->dyn_smps = sta && sta->smps_mode == IEEE80211_SMPS_DYNAMIC; fi->keytype = keytype; fi->framelen = framelen; fi->tx_power = txpower; From ae064fc0e32a4d28389086d9f4b260a0c157cfee Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 16 Feb 2021 14:51:18 +0100 Subject: [PATCH 002/164] mt76: fix tx skb error handling in mt76_dma_tx_queue_skb When running out of room in the tx queue after calling drv->tx_prepare_skb, the buffer list will already have been modified on MT7615 and newer drivers. This can leak a DMA mapping and will show up as swiotlb allocation failures on x86. Fix this by moving the queue length check further up. This is less accurate, since it can overestimate the needed room in the queue on MT7615 and newer, but the difference is small enough to not matter in practice. Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210216135119.23809-1-nbd@nbd.name --- drivers/net/wireless/mediatek/mt76/dma.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 19098b852d0a..abdc8d364361 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -345,7 +345,6 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q, }; struct ieee80211_hw *hw; int len, n = 0, ret = -ENOMEM; - struct mt76_queue_entry e; struct mt76_txwi_cache *t; struct sk_buff *iter; dma_addr_t addr; @@ -387,6 +386,11 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q, } tx_info.nbuf = n; + if (q->queued + (tx_info.nbuf + 1) / 2 >= q->ndesc - 1) { + ret = -ENOMEM; + goto unmap; + } + dma_sync_single_for_cpu(dev->dev, t->dma_addr, dev->drv->txwi_size, DMA_TO_DEVICE); ret = dev->drv->tx_prepare_skb(dev, txwi, q->qid, wcid, sta, &tx_info); @@ -395,11 +399,6 @@ 
mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q, if (ret < 0) goto unmap; - if (q->queued + (tx_info.nbuf + 1) / 2 >= q->ndesc - 1) { - ret = -ENOMEM; - goto unmap; - } - return mt76_dma_add_buf(dev, q, tx_info.buf, tx_info.nbuf, tx_info.info, tx_info.skb, t); @@ -419,9 +418,7 @@ free: } #endif - e.skb = tx_info.skb; - e.txwi = t; - dev->drv->tx_complete_skb(dev, &e); + dev_kfree_skb(tx_info.skb); mt76_put_txwi(dev, t); return ret; } From 94f0e6256c2ab6803c935634aa1f653174c94879 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 16 Feb 2021 14:51:19 +0100 Subject: [PATCH 003/164] mt76: mt7915: only modify tx buffer list after allocating tx token id Modifying the tx buffer list too early can leak DMA mappings Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210216135119.23809-2-nbd@nbd.name --- drivers/net/wireless/mediatek/mt76/mt7915/mac.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index eb889f8d6fea..e5a258958ac9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -967,11 +967,6 @@ int mt7915_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, } txp->nbuf = nbuf; - /* pass partial skb header to fw */ - tx_info->buf[1].len = MT_CT_PARSE_LEN; - tx_info->buf[1].skip_unmap = true; - tx_info->nbuf = MT_CT_DMA_BUF_NUM; - txp->flags = cpu_to_le16(MT_CT_INFO_APPLY_TXD | MT_CT_INFO_FROM_HOST); if (!key) @@ -1009,6 +1004,11 @@ int mt7915_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, txp->rept_wds_wcid = cpu_to_le16(0x3ff); tx_info->skb = DMA_DUMMY_DATA; + /* pass partial skb header to fw */ + tx_info->buf[1].len = MT_CT_PARSE_LEN; + tx_info->buf[1].skip_unmap = true; + tx_info->nbuf = MT_CT_DMA_BUF_NUM; + return 0; } From 4538c5ed0f7e892f1b643472e48146757d1e60c5 Mon Sep 17 00:00:00 2001 From: Johannes Berg 
Date: Fri, 19 Feb 2021 13:35:07 +0100 Subject: [PATCH 004/164] iwlwifi: avoid crash on unsupported debug collection If the opmode doesn't support debug collection (DVM) then don't crash, but just skip the callback. Fixes: d01293154c0a ("iwlwifi: dbg: add op_mode callback for collecting debug data.") Reported-by: Andy Lavr Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210219133506.ecabe285bc7d.I73d230d555c595fa2d9bf284f80078729fe18aa4@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h b/drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h index 868da7e79a45..e6d2e0994317 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h @@ -205,6 +205,8 @@ static inline void iwl_op_mode_time_point(struct iwl_op_mode *op_mode, enum iwl_fw_ini_time_point tp_id, union iwl_dbg_tlv_tp_data *tp_data) { + if (!op_mode || !op_mode->ops || !op_mode->ops->time_point) + return; op_mode->ops->time_point(op_mode, tp_id, tp_data); } From b29dd96b905f3dd543f4ca729447286adf934dd6 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 16 Feb 2021 12:53:07 +0000 Subject: [PATCH 005/164] bpf, x86: Fix BPF_FETCH atomic and/or/xor with r0 as src This code generates a CMPXCHG loop in order to implement atomic_fetch bitwise operations. Because CMPXCHG is hard-coded to use rax (which holds the BPF r0 value), it saves the _real_ r0 value into the internal "ax" temporary register and restores it once the loop is complete. In the middle of the loop, the actual bitwise operation is performed using src_reg. The bug occurs when src_reg is r0: as described above, r0 has been clobbered and the real r0 value is in the ax register. Therefore, perform this operation on the ax register instead, when src_reg is r0. 
Fixes: 981f94c3e921 ("bpf: Add bitwise atomic instructions") Signed-off-by: Brendan Jackman Signed-off-by: Daniel Borkmann Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210216125307.1406237-1-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 10 +++++--- .../selftests/bpf/verifier/atomic_and.c | 23 +++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 79e7a0ec1da5..6926d0ca6c71 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1349,6 +1349,7 @@ st: if (is_imm8(insn->off)) insn->imm == (BPF_XOR | BPF_FETCH)) { u8 *branch_target; bool is64 = BPF_SIZE(insn->code) == BPF_DW; + u32 real_src_reg = src_reg; /* * Can't be implemented with a single x86 insn. @@ -1357,6 +1358,9 @@ st: if (is_imm8(insn->off)) /* Will need RAX as a CMPXCHG operand so save R0 */ emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); + if (src_reg == BPF_REG_0) + real_src_reg = BPF_REG_AX; + branch_target = prog; /* Load old value */ emit_ldx(&prog, BPF_SIZE(insn->code), @@ -1366,9 +1370,9 @@ st: if (is_imm8(insn->off)) * put the result in the AUX_REG. 
*/ emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0); - maybe_emit_mod(&prog, AUX_REG, src_reg, is64); + maybe_emit_mod(&prog, AUX_REG, real_src_reg, is64); EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], - add_2reg(0xC0, AUX_REG, src_reg)); + add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ err = emit_atomic(&prog, BPF_CMPXCHG, dst_reg, AUX_REG, insn->off, @@ -1381,7 +1385,7 @@ st: if (is_imm8(insn->off)) */ EMIT2(X86_JNE, -(prog - branch_target) - 2); /* Return the pre-modification value */ - emit_mov_reg(&prog, is64, src_reg, BPF_REG_0); + emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0); /* Restore R0 after clobbering RAX */ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break; diff --git a/tools/testing/selftests/bpf/verifier/atomic_and.c b/tools/testing/selftests/bpf/verifier/atomic_and.c index 1bdc8e6684f7..fe4bb70eb9c5 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_and.c +++ b/tools/testing/selftests/bpf/verifier/atomic_and.c @@ -75,3 +75,26 @@ }, .result = ACCEPT, }, +{ + "BPF_ATOMIC_AND with fetch - r0 as source reg", + .insns = { + /* val = 0x110; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0x110), + /* old = atomic_fetch_and(&val, 0x011); */ + BPF_MOV64_IMM(BPF_REG_0, 0x011), + BPF_ATOMIC_OP(BPF_DW, BPF_AND | BPF_FETCH, BPF_REG_10, BPF_REG_0, -8), + /* if (old != 0x110) exit(3); */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0x110, 2), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* if (val != 0x010) exit(2); */ + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x010, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, From 33ccec5fd740d0d5b78b77846f76eb5b4feb4327 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 17 Feb 2021 10:45:25 +0300 Subject: [PATCH 006/164] bpf: Fix a warning message in mark_ptr_not_null_reg() The WARN_ON() argument is a condition, not an error message. 
So this code will print a stack trace but will not print the warning message. Fix that and also change it to only WARN_ONCE(). Fixes: 4ddb74165ae5 ("bpf: Extract nullable reg type conversion into a helper function") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/YCzJlV3hnF%2Ft1Pk4@mwanda --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dda9d81f12c..3d34ba492d46 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1120,7 +1120,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) reg->type = PTR_TO_RDWR_BUF; break; default: - WARN_ON("unknown nullable register type"); + WARN_ONCE(1, "unknown nullable register type"); } } From 53f523f3052ac16bbc7718032aa6b848f971d28c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Feb 2021 16:16:47 -0800 Subject: [PATCH 007/164] bpf: Clear percpu pointers in bpf_prog_clone_free() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to bpf_prog_realloc(), bpf_prog_clone_create() also copies the percpu pointers, but the clone still shares them with the original prog, so we have to clear these two percpu pointers in bpf_prog_clone_free(). Otherwise we would get a double free: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 13 PID: 8140 Comm: kworker/13:247 Kdump: loaded Tainted: G                W    OE   5.11.0-rc4.bm.1-amd64+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 test_bpf: #1 TXA Workqueue: events bpf_prog_free_deferred RIP: 0010:percpu_ref_get_many.constprop.97+0x42/0xf0 Code: [...] 
RSP: 0018:ffffa6bce1f9bda0 EFLAGS: 00010002 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 00000000021dfc7b RDX: ffffffffae2eeb90 RSI: 867f92637e338da5 RDI: 0000000000000046 RBP: ffffa6bce1f9bda8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000046 R11: 0000000000000000 R12: 0000000000000280 R13: 0000000000000000 R14: 0000000000000000 R15: ffff9b5f3ffdedc0 FS:    0000000000000000(0000) GS:ffff9b5f2fb40000(0000) knlGS:0000000000000000 CS:    0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000027c36c002 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace:     refill_obj_stock+0x5e/0xd0     free_percpu+0xee/0x550     __bpf_prog_free+0x4d/0x60     process_one_work+0x26a/0x590     worker_thread+0x3c/0x390     ? process_one_work+0x590/0x590     kthread+0x130/0x150     ? kthread_park+0x80/0x80     ret_from_fork+0x1f/0x30 This bug is 100% reproducible with test_kmod.sh. Fixes: 700d4796ef59 ("bpf: Optimize program stats") Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Reported-by: Jiang Wang Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210218001647.71631-1-xiyou.wangcong@gmail.com --- kernel/bpf/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0ae015ad1e05..aa1e64196d8d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1118,6 +1118,8 @@ static void bpf_prog_clone_free(struct bpf_prog *fp) * clone is guaranteed to not be locked. */ fp->aux = NULL; + fp->stats = NULL; + fp->active = NULL; __bpf_prog_free(fp); } From 9aa940047ae81fa1806506556cde1efd0c39aef9 Mon Sep 17 00:00:00 2001 From: Sharvari Harisangam Date: Mon, 22 Feb 2021 18:19:44 +0530 Subject: [PATCH 008/164] MAINTAINERS: update for mwifiex driver maintainers Add Sharvari Harisangam to Maintainer list. 
Replace Ganapathi Bhat's email id in Maintainer list. Signed-off-by: Rakesh Parmar Signed-off-by: Sharvari Harisangam Acked-by: Ganapathi Bhat Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1613998184-20047-1-git-send-email-sharvari.harisangam@nxp.com --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 99335fd22c0a..98fd98ebe6b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10685,7 +10685,8 @@ F: drivers/net/ethernet/marvell/mvpp2/ MARVELL MWIFIEX WIRELESS DRIVER M: Amitkumar Karwar -M: Ganapathi Bhat +M: Ganapathi Bhat +M: Sharvari Harisangam M: Xinming Hu L: linux-wireless@vger.kernel.org S: Maintained From ebb9d34e073dc965e9e1f0632a95dcb83736f166 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 21 Feb 2021 19:27:54 +0100 Subject: [PATCH 009/164] ath11k: qmi: use %pad to format dma_addr_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_ARCH_DMA_ADDR_T_64BIT=n: drivers/net/wireless/ath/ath11k/qmi.c: In function ‘ath11k_qmi_respond_fw_mem_request’: drivers/net/wireless/ath/ath11k/qmi.c:1690:8: warning: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 5 has type ‘dma_addr_t’ {aka ‘unsigned int’} [-Wformat=] 1690 | "qmi req mem_seg[%d] 0x%llx %u %u\n", i, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1691 | ab->qmi.target_mem[i].paddr, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | dma_addr_t {aka unsigned int} drivers/net/wireless/ath/ath11k/debug.h:64:30: note: in definition of macro ‘ath11k_dbg’ 64 | __ath11k_dbg(ar, dbg_mask, fmt, ##__VA_ARGS__); \ | ^~~ drivers/net/wireless/ath/ath11k/qmi.c:1690:34: note: format string is defined here 1690 | "qmi req mem_seg[%d] 0x%llx %u %u\n", i, | ~~~^ | | | long long unsigned int | %x Fixes: d5395a5486596308 ("ath11k: qmi: add debug message for allocated memory segment addresses and sizes") Signed-off-by: Geert Uytterhoeven Signed-off-by: Kalle Valo Link: 
https://lore.kernel.org/r/20210221182754.2071863-1-geert@linux-m68k.org --- drivers/net/wireless/ath/ath11k/qmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/qmi.c b/drivers/net/wireless/ath/ath11k/qmi.c index 1aca841cd147..7968fe4eda22 100644 --- a/drivers/net/wireless/ath/ath11k/qmi.c +++ b/drivers/net/wireless/ath/ath11k/qmi.c @@ -1687,8 +1687,8 @@ static int ath11k_qmi_respond_fw_mem_request(struct ath11k_base *ab) req->mem_seg[i].size = ab->qmi.target_mem[i].size; req->mem_seg[i].type = ab->qmi.target_mem[i].type; ath11k_dbg(ab, ATH11K_DBG_QMI, - "qmi req mem_seg[%d] 0x%llx %u %u\n", i, - ab->qmi.target_mem[i].paddr, + "qmi req mem_seg[%d] %pad %u %u\n", i, + &ab->qmi.target_mem[i].paddr, ab->qmi.target_mem[i].size, ab->qmi.target_mem[i].type); } From 77d7e87128d4dfb400df4208b2812160e999c165 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Mon, 22 Feb 2021 17:14:09 +0200 Subject: [PATCH 010/164] ath11k: fix AP mode for QCA6390 Commit c134d1f8c436 ("ath11k: Handle errors if peer creation fails") completely broke AP mode on QCA6390: kernel: [ 151.230734] ath11k_pci 0000:06:00.0: failed to create peer after vdev start delay: -22 wpa_supplicant[2307]: Failed to set beacon parameters wpa_supplicant[2307]: Interface initialization failed wpa_supplicant[2307]: wlan0: interface state UNINITIALIZED->DISABLED wpa_supplicant[2307]: wlan0: AP-DISABLED wpa_supplicant[2307]: wlan0: Unable to setup interface. wpa_supplicant[2307]: Failed to initialize AP interface This was because commit c134d1f8c436 ("ath11k: Handle errors if peer creation fails") added error handling for ath11k_peer_create(), which had been failing all along but was unnoticed due to the missing error handling. The actual bug was introduced already in commit aa44b2f3ecd4 ("ath11k: start vdev if a bss peer is already created"). 
ath11k_peer_create() was failing because for AP mode the peer is already created earlier in op_add_interface() and we should skip creation here, but the check for modes was wrong. Fixing that makes AP mode work again. This shouldn't affect IPQ8074 nor QCN9074 as they have hw_params.vdev_start_delay disabled. Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01740-QCAHSTSWPLZ_V2_TO_X86-1 Fixes: c134d1f8c436 ("ath11k: Handle errors if peer creation fails") Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1614006849-25764-1-git-send-email-kvalo@codeaurora.org --- drivers/net/wireless/ath/ath11k/mac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index b391169576e2..faa2e678e63e 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -5450,8 +5450,8 @@ ath11k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw, } if (ab->hw_params.vdev_start_delay && - (arvif->vdev_type == WMI_VDEV_TYPE_AP || - arvif->vdev_type == WMI_VDEV_TYPE_MONITOR)) { + arvif->vdev_type != WMI_VDEV_TYPE_AP && + arvif->vdev_type != WMI_VDEV_TYPE_MONITOR) { param.vdev_id = arvif->vdev_id; param.peer_type = WMI_PEER_TYPE_DEFAULT; param.peer_addr = ar->mac_addr; From f4eda8b6e4a5c7897c6bb992ed63a27061b371ef Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Tue, 23 Feb 2021 13:04:16 +0400 Subject: [PATCH 011/164] bpf: Drop imprecise log message Now it is possible for a global function to have a pointer argument that points to something other than a struct. Drop the irrelevant log message and keep the logic the same. 
Fixes: e5069b9c23b3 ("bpf: Support pointers in global func args") Signed-off-by: Dmitrii Banshchikov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210223090416.333943-1-me@ubique.spb.ru --- kernel/bpf/btf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2efeb5f4b343..b1a76fe046cb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4321,8 +4321,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ - if (log->level & BPF_LOG_LEVEL) - bpf_log(log, "arg#%d type is not a struct\n", arg); return NULL; } tname = btf_name_by_offset(btf, t->name_off); From c41d81bfbb4579c3e583457e383dd63d026bf947 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Tue, 23 Feb 2021 12:22:11 +0400 Subject: [PATCH 012/164] selftests/bpf: Fix a compiler warning in global func test Add an explicit 'const void *' cast to pass program ctx pointer type into a global function that expects pointer to structure. 
warning: incompatible pointer types passing 'struct __sk_buff *' to parameter of type 'const struct S *' [-Wincompatible-pointer-types] return foo(skb); ^~~ progs/test_global_func11.c:10:36: note: passing argument to parameter 's' here __noinline int foo(const struct S *s) ^ Fixes: 8b08807d039a ("selftests/bpf: Add unit tests for pointers in global functions") Signed-off-by: Dmitrii Banshchikov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210223082211.302596-1-me@ubique.spb.ru --- tools/testing/selftests/bpf/progs/test_global_func11.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_global_func11.c b/tools/testing/selftests/bpf/progs/test_global_func11.c index 28488047c849..ef5277d982d9 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func11.c +++ b/tools/testing/selftests/bpf/progs/test_global_func11.c @@ -15,5 +15,5 @@ __noinline int foo(const struct S *s) SEC("cgroup_skb/ingress") int test_cls(struct __sk_buff *skb) { - return foo(skb); + return foo((const void *)skb); } From 41462c6e730ca0e63f5fed5a517052385d980c54 Mon Sep 17 00:00:00 2001 From: Kun-Chuan Hsieh Date: Wed, 24 Feb 2021 05:27:52 +0000 Subject: [PATCH 013/164] tools/resolve_btfids: Fix build error with older host toolchains Older libelf.h and glibc elf.h might not yet define the ELF compression types. Checking and defining SHF_COMPRESSED fix the build error when compiling with older toolchains. Also, the tool resolve_btfids is compiled with host toolchain. The host toolchain is more likely to be older than the cross compile toolchain. 
Fixes: 51f6463aacfb ("tools/resolve_btfids: Fix sections with wrong alignment") Signed-off-by: Kun-Chuan Hsieh Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20210224052752.5284-1-jetswayss@gmail.com --- tools/bpf/resolve_btfids/main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 7409d7860aa6..80d966cfcaa1 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -260,6 +260,11 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return btf_id__add(root, id, false); } +/* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */ +#ifndef SHF_COMPRESSED +#define SHF_COMPRESSED (1 << 11) /* Section with compressed data. */ +#endif + /* * The data of compressed section should be aligned to 4 * (for 32bit) or 8 (for 64 bit) bytes. The binutils ld From a7c9c25a99bbdaff51da26b874d2faaa8fdd72b5 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 23 Feb 2021 21:14:57 +0800 Subject: [PATCH 014/164] bpf: Remove blank line in bpf helper description comment Commit 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") added an extra blank line in bpf helper description. This will make bpf_helpers_doc.py stop building bpf_helper_defs.h immediately after bpf_check_mtu(), which will affect future added functions. 
Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210223131457.1378978-1-liuhangbin@gmail.com --- include/uapi/linux/bpf.h | 1 - tools/include/uapi/linux/bpf.h | 1 - 2 files changed, 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. From 557c223b643a35effec9654958d8edc62fd2603a Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 24 Feb 2021 16:14:03 +0800 Subject: [PATCH 015/164] selftests/bpf: No need to drop the packet when there is no geneve opt In bpf geneve tunnel test we set geneve option on tx side. On rx side we only call bpf_skb_get_tunnel_opt(). Since commit 9c2e14b48119 ("ip_tunnels: Set tunnel option flag when tunnel metadata is present") geneve_rx() will not add TUNNEL_GENEVE_OPT flag if there is no geneve option, which cause bpf_skb_get_tunnel_opt() return ENOENT and _geneve_get_tunnel() in test_tunnel_kern.c drop the packet. 
As it should be valid that bpf_skb_get_tunnel_opt() return error when there is not tunnel option, there is no need to drop the packet and break all geneve rx traffic. Just set opt_class to 0 in this test and keep returning TC_ACT_OK. Fixes: 933a741e3b82 ("selftests/bpf: bpf tunnel test.") Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: William Tu Link: https://lore.kernel.org/bpf/20210224081403.1425474-1-liuhangbin@gmail.com --- tools/testing/selftests/bpf/progs/test_tunnel_kern.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index a621b58ab079..9afe947cfae9 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -446,10 +446,8 @@ int _geneve_get_tunnel(struct __sk_buff *skb) } ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } + if (ret < 0) + gopt.opt_class = 0; bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4, gopt.opt_class); From 62541e266703549550e77fd46138422dbdc881f1 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Thu, 25 Feb 2021 09:04:21 +0200 Subject: [PATCH 016/164] iwlwifi: pcie: fix iwl_so_trans_cfg link error when CONFIG_IWLMVM is disabled Randy reported an error on his randconfig builds: ERROR: modpost: "iwl_so_trans_cfg" [drivers/net/wireless/intel/iwlwifi/iwlwifi.ko] undefined! The problem was that when CONFIG_IWLMVM was disabled we were still accessing iwl_so_trans_cfg. Fix it by moving IS_ENABLED() check before the access. 
Reported-by: Randy Dunlap Fixes: 930be4e76f26 ("iwlwifi: add support for SnJ with Jf devices") Signed-off-by: Kalle Valo Acked-by: Luca Coelho Acked-by: Randy Dunlap # build-tested Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1614236661-20274-1-git-send-email-kvalo@codeaurora.org --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 314fec4a89ad..ffaf973dae94 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1106,6 +1106,8 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } } +#if IS_ENABLED(CONFIG_IWLMVM) + /* * Workaround for problematic SnJ device: sometimes when * certain RF modules are connected to SnJ, the device ID @@ -1116,7 +1118,6 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (CSR_HW_REV_TYPE(iwl_trans->hw_rev) == IWL_CFG_MAC_TYPE_SNJ) iwl_trans->trans_cfg = &iwl_so_trans_cfg; -#if IS_ENABLED(CONFIG_IWLMVM) /* * special-case 7265D, it has the same PCI IDs. * From fb5fabb192b22293b70bc3351696473c50746d90 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2021 15:59:15 +0100 Subject: [PATCH 017/164] mt76: mt7921: remove incorrect error handling Clang points out a mistake in the error handling in mt7921_mcu_tx_rate_report(), which tries to dereference a pointer that cannot be initialized because of the error that is being handled: drivers/net/wireless/mediatek/mt76/mt7921/mcu.c:409:3: warning: variable 'stats' is uninitialized when used here [-Wuninitialized] stats->tx_rate = rate; ^~~~~ drivers/net/wireless/mediatek/mt76/mt7921/mcu.c:401:32: note: initialize the variable 'stats' to silence this warning struct mt7921_sta_stats *stats; ^ Just remove the obviously incorrect line. 
Fixes: 1c099ab44727 ("mt76: mt7921: add MCU support") Signed-off-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210225145953.404859-2-arnd@kernel.org --- drivers/net/wireless/mediatek/mt76/mt7921/mcu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c index db125cd22b91..b5cc72e7e81c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c @@ -405,10 +405,8 @@ mt7921_mcu_tx_rate_report(struct mt7921_dev *dev, struct sk_buff *skb, if (wlan_idx >= MT76_N_WCIDS) return; wcid = rcu_dereference(dev->mt76.wcid[wlan_idx]); - if (!wcid) { - stats->tx_rate = rate; + if (!wcid) return; - } msta = container_of(wcid, struct mt7921_sta, wcid); stats = &msta->stats; From d0bd52c591a1070c54dc428e926660eb4f981099 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 7 Feb 2021 12:48:31 +0100 Subject: [PATCH 018/164] mt76: dma: do not report truncated frames to mac80211 Commit b102f0c522cf6 ("mt76: fix array overflow on receiving too many fragments for a packet") fixes a possible OOB access but it introduces a memory leak since the pending frame is not released to page_frag_cache if the frag array of skb_shared_info is full. Commit 93a1d4791c10 ("mt76: dma: fix a possible memory leak in mt76_add_fragment()") fixes the issue but does not free the truncated skb that is forwarded to mac80211 layer. Fix the leftover issue discarding even truncated skbs. 
Fixes: 93a1d4791c10 ("mt76: dma: fix a possible memory leak in mt76_add_fragment()") Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/a03166fcc8214644333c68674a781836e0f57576.1612697217.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/dma.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index abdc8d364361..2f27c43ad76d 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -512,13 +512,13 @@ mt76_add_fragment(struct mt76_dev *dev, struct mt76_queue *q, void *data, { struct sk_buff *skb = q->rx_head; struct skb_shared_info *shinfo = skb_shinfo(skb); + int nr_frags = shinfo->nr_frags; - if (shinfo->nr_frags < ARRAY_SIZE(shinfo->frags)) { + if (nr_frags < ARRAY_SIZE(shinfo->frags)) { struct page *page = virt_to_head_page(data); int offset = data - page_address(page) + q->buf_offset; - skb_add_rx_frag(skb, shinfo->nr_frags, page, offset, len, - q->buf_size); + skb_add_rx_frag(skb, nr_frags, page, offset, len, q->buf_size); } else { skb_free_frag(data); } @@ -527,7 +527,10 @@ mt76_add_fragment(struct mt76_dev *dev, struct mt76_queue *q, void *data, return; q->rx_head = NULL; - dev->drv->rx_skb(dev, q - dev->q_rx, skb); + if (nr_frags < ARRAY_SIZE(shinfo->frags)) + dev->drv->rx_skb(dev, q - dev->q_rx, skb); + else + dev_kfree_skb(skb); } static int From c490492f15f656340b35cb9e36b9bfdea3539e19 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 26 Feb 2021 15:21:27 +0100 Subject: [PATCH 019/164] mt76: mt7915: fix unused 'mode' variable clang points out a possible corner case in the mt7915_tm_set_tx_cont() function if called with invalid arguments: drivers/net/wireless/mediatek/mt76/mt7915/testmode.c:593:2: warning: variable 'mode' is used uninitialized whenever switch default is taken [-Wsometimes-uninitialized] default: ^~~~~~~ 
drivers/net/wireless/mediatek/mt76/mt7915/testmode.c:597:13: note: uninitialized use occurs here rateval = mode << 6 | rate_idx; ^~~~ drivers/net/wireless/mediatek/mt76/mt7915/testmode.c:506:37: note: initialize the variable 'mode' to silence this warning u8 rate_idx = td->tx_rate_idx, mode; ^ Change it to return an error instead of continuing with invalid data here. Fixes: 3f0caa3cbf94 ("mt76: mt7915: add support for continuous tx in testmode") Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt7915/testmode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/testmode.c b/drivers/net/wireless/mediatek/mt76/mt7915/testmode.c index 7fb2170a9561..bd798df748ba 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/testmode.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/testmode.c @@ -543,7 +543,7 @@ mt7915_tm_set_tx_cont(struct mt7915_phy *phy, bool en) tx_cont->bw = CMD_CBW_20MHZ; break; default: - break; + return -EINVAL; } if (!en) { @@ -591,7 +591,7 @@ mt7915_tm_set_tx_cont(struct mt7915_phy *phy, bool en) mode = MT_PHY_TYPE_HE_MU; break; default: - break; + return -EINVAL; } rateval = mode << 6 | rate_idx; From a3e860a83397bf761ec1128a3f0ba186445992c6 Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 25 Feb 2021 17:01:10 +0800 Subject: [PATCH 020/164] net: stmmac: stop each tx channel independently If clear GMAC_CONFIG_TE bit, it would stop all tx channels, but users may only want to stop specific tx channel. 
Fixes: 48863ce5940f ("stmmac: add DMA support for GMAC 4.xx") Signed-off-by: Joakim Zhang Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c index 0b4ee2dbb691..71e50751ef2d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c @@ -53,10 +53,6 @@ void dwmac4_dma_stop_tx(void __iomem *ioaddr, u32 chan) value &= ~DMA_CONTROL_ST; writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan)); - - value = readl(ioaddr + GMAC_CONFIG); - value &= ~GMAC_CONFIG_TE; - writel(value, ioaddr + GMAC_CONFIG); } void dwmac4_dma_start_rx(void __iomem *ioaddr, u32 chan) From c511819d138de38e1637eedb645c207e09680d0f Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 25 Feb 2021 17:01:11 +0800 Subject: [PATCH 021/164] net: stmmac: fix watchdog timeout during suspend/resume stress test stmmac_xmit() calls stmmac_tx_timer_arm() at the end to modify the tx timer so that it does the transmission cleanup work. Imagine the following situation: stmmac enters suspend immediately after the tx timer is modified, so its expiry callback stmmac_tx_clean() is never invoked. This affects BQL, since netdev_tx_sent_queue() has been called but netdev_tx_completed_queue() has not been invoked; as a result, dql_avail(&dev_queue->dql) ends up always returning a negative value. __dev_queue_xmit->__dev_xmit_skb->qdisc_run->__qdisc_run->qdisc_restart->dequeue_skb: if ((q->flags & TCQ_F_ONETXQUEUE) && netif_xmit_frozen_or_stopped(txq)) // __QUEUE_STATE_STACK_XOFF is set The net core will then stop transmitting. Finally, the net watchdog times out. To fix this issue, we should call netdev_tx_reset_queue() in stmmac_resume(). 
Fixes: 54139cf3bb33 ("net: stmmac: adding multiple buffers for rx") Signed-off-by: Joakim Zhang Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 26b971cd4da5..12ed337a239b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5257,6 +5257,8 @@ static void stmmac_reset_queues_param(struct stmmac_priv *priv) tx_q->cur_tx = 0; tx_q->dirty_tx = 0; tx_q->mss = 0; + + netdev_tx_reset_queue(netdev_get_tx_queue(priv->dev, queue)); } } From bfaf91ca848e758ed7be99b61fd936d03819fa56 Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 25 Feb 2021 17:01:12 +0800 Subject: [PATCH 022/164] net: stmmac: fix dma physical address of descriptor when display ring The driver uses dma_alloc_coherent() to allocate DMA memory for descriptors; dma_alloc_coherent() returns both the virtual address and the physical address. AFAIK, virt_to_phys() cannot convert a virtual address to a physical address for memory that was allocated by dma_alloc_coherent(). The dwmac4_display_ring() function is also broken for the other descriptor types: it only supports the normal descriptor (struct dma_desc) now, so this patch extends it to support all descriptor types. 
Signed-off-by: Joakim Zhang Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/dwmac4_descs.c | 50 +++++++++++++--- .../net/ethernet/stmicro/stmmac/enh_desc.c | 9 ++- drivers/net/ethernet/stmicro/stmmac/hwif.h | 3 +- .../net/ethernet/stmicro/stmmac/norm_desc.c | 9 ++- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 57 ++++++++++++------- 5 files changed, 94 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c index c6540b003b43..ee87811b0ca5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c @@ -402,19 +402,53 @@ static void dwmac4_rd_set_tx_ic(struct dma_desc *p) p->des2 |= cpu_to_le32(TDES2_INTERRUPT_ON_COMPLETION); } -static void dwmac4_display_ring(void *head, unsigned int size, bool rx) +static void dwmac4_display_ring(void *head, unsigned int size, bool rx, + dma_addr_t dma_rx_phy, unsigned int desc_size) { - struct dma_desc *p = (struct dma_desc *)head; + dma_addr_t dma_addr; int i; pr_info("%s descriptor ring:\n", rx ? 
"RX" : "TX"); - for (i = 0; i < size; i++) { - pr_info("%03d [0x%x]: 0x%x 0x%x 0x%x 0x%x\n", - i, (unsigned int)virt_to_phys(p), - le32_to_cpu(p->des0), le32_to_cpu(p->des1), - le32_to_cpu(p->des2), le32_to_cpu(p->des3)); - p++; + if (desc_size == sizeof(struct dma_desc)) { + struct dma_desc *p = (struct dma_desc *)head; + + for (i = 0; i < size; i++) { + dma_addr = dma_rx_phy + i * sizeof(*p); + pr_info("%03d [%pad]: 0x%x 0x%x 0x%x 0x%x\n", + i, &dma_addr, + le32_to_cpu(p->des0), le32_to_cpu(p->des1), + le32_to_cpu(p->des2), le32_to_cpu(p->des3)); + p++; + } + } else if (desc_size == sizeof(struct dma_extended_desc)) { + struct dma_extended_desc *extp = (struct dma_extended_desc *)head; + + for (i = 0; i < size; i++) { + dma_addr = dma_rx_phy + i * sizeof(*extp); + pr_info("%03d [%pad]: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", + i, &dma_addr, + le32_to_cpu(extp->basic.des0), le32_to_cpu(extp->basic.des1), + le32_to_cpu(extp->basic.des2), le32_to_cpu(extp->basic.des3), + le32_to_cpu(extp->des4), le32_to_cpu(extp->des5), + le32_to_cpu(extp->des6), le32_to_cpu(extp->des7)); + extp++; + } + } else if (desc_size == sizeof(struct dma_edesc)) { + struct dma_edesc *ep = (struct dma_edesc *)head; + + for (i = 0; i < size; i++) { + dma_addr = dma_rx_phy + i * sizeof(*ep); + pr_info("%03d [%pad]: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", + i, &dma_addr, + le32_to_cpu(ep->des4), le32_to_cpu(ep->des5), + le32_to_cpu(ep->des6), le32_to_cpu(ep->des7), + le32_to_cpu(ep->basic.des0), le32_to_cpu(ep->basic.des1), + le32_to_cpu(ep->basic.des2), le32_to_cpu(ep->basic.des3)); + ep++; + } + } else { + pr_err("unsupported descriptor!"); } } diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c index d02cec296f51..6650edfab5bc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c +++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c @@ -417,19 +417,22 @@ static int enh_desc_get_rx_timestamp_status(void *desc, void 
*next_desc, } } -static void enh_desc_display_ring(void *head, unsigned int size, bool rx) +static void enh_desc_display_ring(void *head, unsigned int size, bool rx, + dma_addr_t dma_rx_phy, unsigned int desc_size) { struct dma_extended_desc *ep = (struct dma_extended_desc *)head; + dma_addr_t dma_addr; int i; pr_info("Extended %s descriptor ring:\n", rx ? "RX" : "TX"); for (i = 0; i < size; i++) { u64 x; + dma_addr = dma_rx_phy + i * sizeof(*ep); x = *(u64 *)ep; - pr_info("%03d [0x%x]: 0x%x 0x%x 0x%x 0x%x\n", - i, (unsigned int)virt_to_phys(ep), + pr_info("%03d [%pad]: 0x%x 0x%x 0x%x 0x%x\n", + i, &dma_addr, (unsigned int)x, (unsigned int)(x >> 32), ep->basic.des2, ep->basic.des3); ep++; diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index b40b2e0667bb..7417db31402f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -78,7 +78,8 @@ struct stmmac_desc_ops { /* get rx timestamp status */ int (*get_rx_timestamp_status)(void *desc, void *next_desc, u32 ats); /* Display ring */ - void (*display_ring)(void *head, unsigned int size, bool rx); + void (*display_ring)(void *head, unsigned int size, bool rx, + dma_addr_t dma_rx_phy, unsigned int desc_size); /* set MSS via context descriptor */ void (*set_mss)(struct dma_desc *p, unsigned int mss); /* get descriptor skbuff address */ diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c index f083360e4ba6..98ef43f35802 100644 --- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c +++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c @@ -269,19 +269,22 @@ static int ndesc_get_rx_timestamp_status(void *desc, void *next_desc, u32 ats) return 1; } -static void ndesc_display_ring(void *head, unsigned int size, bool rx) +static void ndesc_display_ring(void *head, unsigned int size, bool rx, + dma_addr_t dma_rx_phy, unsigned int desc_size) { struct dma_desc *p = (struct 
dma_desc *)head; + dma_addr_t dma_addr; int i; pr_info("%s descriptor ring:\n", rx ? "RX" : "TX"); for (i = 0; i < size; i++) { u64 x; + dma_addr = dma_rx_phy + i * sizeof(*p); x = *(u64 *)p; - pr_info("%03d [0x%x]: 0x%x 0x%x 0x%x 0x%x", - i, (unsigned int)virt_to_phys(p), + pr_info("%03d [%pad]: 0x%x 0x%x 0x%x 0x%x", + i, &dma_addr, (unsigned int)x, (unsigned int)(x >> 32), p->des2, p->des3); p++; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 12ed337a239b..730f2d71578c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1133,6 +1133,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) static void stmmac_display_rx_rings(struct stmmac_priv *priv) { u32 rx_cnt = priv->plat->rx_queues_to_use; + unsigned int desc_size; void *head_rx; u32 queue; @@ -1142,19 +1143,24 @@ static void stmmac_display_rx_rings(struct stmmac_priv *priv) pr_info("\tRX Queue %u rings\n", queue); - if (priv->extend_desc) + if (priv->extend_desc) { head_rx = (void *)rx_q->dma_erx; - else + desc_size = sizeof(struct dma_extended_desc); + } else { head_rx = (void *)rx_q->dma_rx; + desc_size = sizeof(struct dma_desc); + } /* Display RX ring */ - stmmac_display_ring(priv, head_rx, priv->dma_rx_size, true); + stmmac_display_ring(priv, head_rx, priv->dma_rx_size, true, + rx_q->dma_rx_phy, desc_size); } } static void stmmac_display_tx_rings(struct stmmac_priv *priv) { u32 tx_cnt = priv->plat->tx_queues_to_use; + unsigned int desc_size; void *head_tx; u32 queue; @@ -1164,14 +1170,19 @@ static void stmmac_display_tx_rings(struct stmmac_priv *priv) pr_info("\tTX Queue %d rings\n", queue); - if (priv->extend_desc) + if (priv->extend_desc) { head_tx = (void *)tx_q->dma_etx; - else if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc_size = sizeof(struct dma_extended_desc); + } else if (tx_q->tbs & STMMAC_TBS_AVAIL) { head_tx = (void *)tx_q->dma_entx; - else + desc_size = 
sizeof(struct dma_edesc); + } else { head_tx = (void *)tx_q->dma_tx; + desc_size = sizeof(struct dma_desc); + } - stmmac_display_ring(priv, head_tx, priv->dma_tx_size, false); + stmmac_display_ring(priv, head_tx, priv->dma_tx_size, false, + tx_q->dma_tx_phy, desc_size); } } @@ -3736,18 +3747,23 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) unsigned int count = 0, error = 0, len = 0; int status = 0, coe = priv->hw->rx_csum; unsigned int next_entry = rx_q->cur_rx; + unsigned int desc_size; struct sk_buff *skb = NULL; if (netif_msg_rx_status(priv)) { void *rx_head; netdev_dbg(priv->dev, "%s: descriptor ring:\n", __func__); - if (priv->extend_desc) + if (priv->extend_desc) { rx_head = (void *)rx_q->dma_erx; - else + desc_size = sizeof(struct dma_extended_desc); + } else { rx_head = (void *)rx_q->dma_rx; + desc_size = sizeof(struct dma_desc); + } - stmmac_display_ring(priv, rx_head, priv->dma_rx_size, true); + stmmac_display_ring(priv, rx_head, priv->dma_rx_size, true, + rx_q->dma_rx_phy, desc_size); } while (count < limit) { unsigned int buf1_len = 0, buf2_len = 0; @@ -4315,24 +4331,27 @@ static int stmmac_set_mac_address(struct net_device *ndev, void *addr) static struct dentry *stmmac_fs_dir; static void sysfs_display_ring(void *head, int size, int extend_desc, - struct seq_file *seq) + struct seq_file *seq, dma_addr_t dma_phy_addr) { int i; struct dma_extended_desc *ep = (struct dma_extended_desc *)head; struct dma_desc *p = (struct dma_desc *)head; + dma_addr_t dma_addr; for (i = 0; i < size; i++) { if (extend_desc) { - seq_printf(seq, "%d [0x%x]: 0x%x 0x%x 0x%x 0x%x\n", - i, (unsigned int)virt_to_phys(ep), + dma_addr = dma_phy_addr + i * sizeof(*ep); + seq_printf(seq, "%d [%pad]: 0x%x 0x%x 0x%x 0x%x\n", + i, &dma_addr, le32_to_cpu(ep->basic.des0), le32_to_cpu(ep->basic.des1), le32_to_cpu(ep->basic.des2), le32_to_cpu(ep->basic.des3)); ep++; } else { - seq_printf(seq, "%d [0x%x]: 0x%x 0x%x 0x%x 0x%x\n", - i, (unsigned int)virt_to_phys(p), + 
dma_addr = dma_phy_addr + i * sizeof(*p); + seq_printf(seq, "%d [%pad]: 0x%x 0x%x 0x%x 0x%x\n", + i, &dma_addr, le32_to_cpu(p->des0), le32_to_cpu(p->des1), le32_to_cpu(p->des2), le32_to_cpu(p->des3)); p++; @@ -4360,11 +4379,11 @@ static int stmmac_rings_status_show(struct seq_file *seq, void *v) if (priv->extend_desc) { seq_printf(seq, "Extended descriptor ring:\n"); sysfs_display_ring((void *)rx_q->dma_erx, - priv->dma_rx_size, 1, seq); + priv->dma_rx_size, 1, seq, rx_q->dma_rx_phy); } else { seq_printf(seq, "Descriptor ring:\n"); sysfs_display_ring((void *)rx_q->dma_rx, - priv->dma_rx_size, 0, seq); + priv->dma_rx_size, 0, seq, rx_q->dma_rx_phy); } } @@ -4376,11 +4395,11 @@ static int stmmac_rings_status_show(struct seq_file *seq, void *v) if (priv->extend_desc) { seq_printf(seq, "Extended descriptor ring:\n"); sysfs_display_ring((void *)tx_q->dma_etx, - priv->dma_tx_size, 1, seq); + priv->dma_tx_size, 1, seq, tx_q->dma_tx_phy); } else if (!(tx_q->tbs & STMMAC_TBS_AVAIL)) { seq_printf(seq, "Descriptor ring:\n"); sysfs_display_ring((void *)tx_q->dma_tx, - priv->dma_tx_size, 0, seq); + priv->dma_tx_size, 0, seq, tx_q->dma_tx_phy); } } From 396e13e11577b614db77db0bbb6fca935b94eb1b Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 25 Feb 2021 17:01:13 +0800 Subject: [PATCH 023/164] net: stmmac: fix wrongly set buffer2 valid when sph unsupport In current driver, buffer2 available only when hardware supports split header. Wrongly set buffer2 valid in stmmac_rx_refill when refill buffer address. You can see that desc3 is 0x81000000 after initialization, but turn out to be 0x83000000 after refill. 
Fixes: 67afd6d1cfdf ("net: stmmac: Add Split Header support and enable it in XGMAC cores") Signed-off-by: Joakim Zhang Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c | 9 +++++++-- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c | 2 +- drivers/net/ethernet/stmicro/stmmac/hwif.h | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 8 ++++++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c index ee87811b0ca5..cbf4429fb1d2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c @@ -533,10 +533,15 @@ static void dwmac4_get_rx_header_len(struct dma_desc *p, unsigned int *len) *len = le32_to_cpu(p->des2) & RDES2_HL; } -static void dwmac4_set_sec_addr(struct dma_desc *p, dma_addr_t addr) +static void dwmac4_set_sec_addr(struct dma_desc *p, dma_addr_t addr, bool buf2_valid) { p->des2 = cpu_to_le32(lower_32_bits(addr)); - p->des3 = cpu_to_le32(upper_32_bits(addr) | RDES3_BUFFER2_VALID_ADDR); + p->des3 = cpu_to_le32(upper_32_bits(addr)); + + if (buf2_valid) + p->des3 |= cpu_to_le32(RDES3_BUFFER2_VALID_ADDR); + else + p->des3 &= cpu_to_le32(~RDES3_BUFFER2_VALID_ADDR); } static void dwmac4_set_tbs(struct dma_edesc *p, u32 sec, u32 nsec) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c index 0aaf19ab5672..ccfb0102dde4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c @@ -292,7 +292,7 @@ static void dwxgmac2_get_rx_header_len(struct dma_desc *p, unsigned int *len) *len = le32_to_cpu(p->des2) & XGMAC_RDES2_HL; } -static void dwxgmac2_set_sec_addr(struct dma_desc *p, dma_addr_t addr) +static void dwxgmac2_set_sec_addr(struct dma_desc *p, dma_addr_t addr, bool is_valid) { p->des2 = 
cpu_to_le32(lower_32_bits(addr)); p->des3 = cpu_to_le32(upper_32_bits(addr)); diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 7417db31402f..979ac9fca23c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -92,7 +92,7 @@ struct stmmac_desc_ops { int (*get_rx_hash)(struct dma_desc *p, u32 *hash, enum pkt_hash_types *type); void (*get_rx_header_len)(struct dma_desc *p, unsigned int *len); - void (*set_sec_addr)(struct dma_desc *p, dma_addr_t addr); + void (*set_sec_addr)(struct dma_desc *p, dma_addr_t addr, bool buf2_valid); void (*set_sarc)(struct dma_desc *p, u32 sarc_type); void (*set_vlan_tag)(struct dma_desc *p, u16 tag, u16 inner_tag, u32 inner_type); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 730f2d71578c..cd7709da0969 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1314,9 +1314,10 @@ static int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p, return -ENOMEM; buf->sec_addr = page_pool_get_dma_addr(buf->sec_page); - stmmac_set_desc_sec_addr(priv, p, buf->sec_addr); + stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, true); } else { buf->sec_page = NULL; + stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, false); } buf->addr = page_pool_get_dma_addr(buf->page); @@ -3659,7 +3660,10 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue) DMA_FROM_DEVICE); stmmac_set_desc_addr(priv, p, buf->addr); - stmmac_set_desc_sec_addr(priv, p, buf->sec_addr); + if (priv->sph) + stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, true); + else + stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, false); stmmac_refill_desc3(priv, rx_q, p); rx_q->rx_count_frames++; From 9c63faaa931e443e7abbbee9de0169f1d4710546 Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 25 Feb 2021 17:01:14 
+0800 Subject: [PATCH 024/164] net: stmmac: re-init rx buffers when mac resume back During suspend/resume stress testing, we found that a descriptor written back by the DMA could exhibit unusual behavior, e.g.: 003 [0xc4310030]: 0x0 0x40 0x0 0xb5010040 We can see that the desc3 write-back value is 0xb5010040, i.e. the descriptor is still owned by the DMA, so the application will not recycle this buffer. This triggers a fatal bus error when the DMA tries to use this descriptor again. To fix this issue, we should re-init all rx buffers when the MAC resumes. Signed-off-by: Joakim Zhang Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 84 ++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index cd7709da0969..0eba44e9c1f8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1379,6 +1379,88 @@ static void stmmac_free_tx_buffer(struct stmmac_priv *priv, u32 queue, int i) } } +/** + * stmmac_reinit_rx_buffers - reinit the RX descriptor buffer. + * @priv: driver private structure + * Description: this function is called to re-allocate a receive buffer, perform + * the DMA mapping and init the descriptor. 
+ */ +static void stmmac_reinit_rx_buffers(struct stmmac_priv *priv) +{ + u32 rx_count = priv->plat->rx_queues_to_use; + u32 queue; + int i; + + for (queue = 0; queue < rx_count; queue++) { + struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + + for (i = 0; i < priv->dma_rx_size; i++) { + struct stmmac_rx_buffer *buf = &rx_q->buf_pool[i]; + + if (buf->page) { + page_pool_recycle_direct(rx_q->page_pool, buf->page); + buf->page = NULL; + } + + if (priv->sph && buf->sec_page) { + page_pool_recycle_direct(rx_q->page_pool, buf->sec_page); + buf->sec_page = NULL; + } + } + } + + for (queue = 0; queue < rx_count; queue++) { + struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + + for (i = 0; i < priv->dma_rx_size; i++) { + struct stmmac_rx_buffer *buf = &rx_q->buf_pool[i]; + struct dma_desc *p; + + if (priv->extend_desc) + p = &((rx_q->dma_erx + i)->basic); + else + p = rx_q->dma_rx + i; + + if (!buf->page) { + buf->page = page_pool_dev_alloc_pages(rx_q->page_pool); + if (!buf->page) + goto err_reinit_rx_buffers; + + buf->addr = page_pool_get_dma_addr(buf->page); + } + + if (priv->sph && !buf->sec_page) { + buf->sec_page = page_pool_dev_alloc_pages(rx_q->page_pool); + if (!buf->sec_page) + goto err_reinit_rx_buffers; + + buf->sec_addr = page_pool_get_dma_addr(buf->sec_page); + } + + stmmac_set_desc_addr(priv, p, buf->addr); + if (priv->sph) + stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, true); + else + stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, false); + if (priv->dma_buf_sz == BUF_SIZE_16KiB) + stmmac_init_desc3(priv, p); + } + } + + return; + +err_reinit_rx_buffers: + do { + while (--i >= 0) + stmmac_free_rx_buffer(priv, queue, i); + + if (queue == 0) + break; + + i = priv->dma_rx_size; + } while (queue-- > 0); +} + /** * init_dma_rx_desc_rings - init the RX descriptor rings * @dev: net device structure @@ -5343,7 +5425,7 @@ int stmmac_resume(struct device *dev) mutex_lock(&priv->lock); stmmac_reset_queues_param(priv); - + 
stmmac_reinit_rx_buffers(priv); stmmac_free_tx_skbufs(priv); stmmac_clear_descriptors(priv); From 907310ceb27ee4259bedb6c1257f5d05ee44f3ce Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2021 15:38:31 +0100 Subject: [PATCH 025/164] net: mscc: ocelot: select NET_DEVLINK Without this option, the driver fails to link: ld.lld: error: undefined symbol: devlink_sb_register >>> referenced by ocelot_devlink.c >>> net/ethernet/mscc/ocelot_devlink.o:(ocelot_devlink_sb_register) in archive drivers/built-in.a >>> referenced by ocelot_devlink.c >>> net/ethernet/mscc/ocelot_devlink.o:(ocelot_devlink_sb_register) in archive drivers/built-in.a Fixes: f59fd9cab730 ("net: mscc: ocelot: configure watermarks using devlink-sb") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210225143910.3964364-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mscc/Kconfig b/drivers/net/ethernet/mscc/Kconfig index c0ede0ca7115..05cb040c2677 100644 --- a/drivers/net/ethernet/mscc/Kconfig +++ b/drivers/net/ethernet/mscc/Kconfig @@ -13,6 +13,7 @@ if NET_VENDOR_MICROSEMI # Users should depend on NET_SWITCHDEV, HAS_IOMEM config MSCC_OCELOT_SWITCH_LIB + select NET_DEVLINK select REGMAP_MMIO select PACKING select PHYLIB From 01c2c1ad8f45e0c191bfd961dc41cd77df0d7a2f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2021 15:38:32 +0100 Subject: [PATCH 026/164] net: dsa: tag_ocelot_8021q: fix driver dependency When the ocelot driver code is in a library, the dsa tag code cannot be built-in: ld.lld: error: undefined symbol: ocelot_can_inject >>> referenced by tag_ocelot_8021q.c >>> dsa/tag_ocelot_8021q.o:(ocelot_xmit) in archive net/built-in.a ld.lld: error: undefined symbol: ocelot_port_inject_frame >>> referenced by tag_ocelot_8021q.c >>> dsa/tag_ocelot_8021q.o:(ocelot_xmit) in archive net/built-in.a Building the tag support only really makes sense for 
compile-testing when the driver is available, so add a Kconfig dependency that prevents the broken configuration while allowing COMPILE_TEST alternative when MSCC_OCELOT_SWITCH_LIB is disabled entirely. This case is handled through the #ifdef check in include/soc/mscc/ocelot.h. Fixes: 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping") Signed-off-by: Arnd Bergmann Acked-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210225143910.3964364-2-arnd@kernel.org Signed-off-by: Jakub Kicinski --- net/dsa/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 3589224c8da9..58b8fc82cd3c 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -118,6 +118,8 @@ config NET_DSA_TAG_OCELOT config NET_DSA_TAG_OCELOT_8021Q tristate "Tag driver for Ocelot family of switches, using VLAN" + depends on MSCC_OCELOT_SWITCH_LIB || \ + (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) select NET_DSA_TAG_8021Q help Say Y or M if you want to enable support for tagging frames with a From 63c75c053b4160f7b90a418dcc4e5bcfac2fb6fc Mon Sep 17 00:00:00 2001 From: DENG Qingfang Date: Fri, 26 Feb 2021 14:32:26 +0800 Subject: [PATCH 027/164] net: dsa: mt7530: don't build GPIO support if !GPIOLIB The new GPIO support may be optional at runtime, but it requires building against gpiolib: ERROR: modpost: "gpiochip_get_data" [drivers/net/dsa/mt7530.ko] undefined! ERROR: modpost: "devm_gpiochip_add_data_with_key" [drivers/net/dsa/mt7530.ko] undefined! Add #ifdef to exclude GPIO support if GPIOLIB is not enabled. 
Fixes: 429a0edeefd8 ("net: dsa: mt7530: MT7530 optional GPIO support") Reported-by: Arnd Bergmann Signed-off-by: DENG Qingfang Link: https://lore.kernel.org/r/20210226063226.8474-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index c17de2bcf2fe..f06f5fa2f898 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1624,6 +1624,7 @@ mtk_get_tag_protocol(struct dsa_switch *ds, int port, } } +#ifdef CONFIG_GPIOLIB static inline u32 mt7530_gpio_to_bit(unsigned int offset) { @@ -1726,6 +1727,7 @@ mt7530_setup_gpio(struct mt7530_priv *priv) return devm_gpiochip_add_data(dev, gc, priv); } +#endif /* CONFIG_GPIOLIB */ static int mt7530_setup(struct dsa_switch *ds) @@ -1868,11 +1870,13 @@ mt7530_setup(struct dsa_switch *ds) } } +#ifdef CONFIG_GPIOLIB if (of_property_read_bool(priv->dev->of_node, "gpio-controller")) { ret = mt7530_setup_gpio(priv); if (ret) return ret; } +#endif /* CONFIG_GPIOLIB */ mt7530_setup_port5(ds, interface); From 7f654157f0aefba04cd7f6297351c87b76b47b89 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2021 15:57:27 +0100 Subject: [PATCH 028/164] net: phy: make mdio_bus_phy_suspend/resume as __maybe_unused When CONFIG_PM_SLEEP is disabled, the compiler warns about unused functions: drivers/net/phy/phy_device.c:273:12: error: unused function 'mdio_bus_phy_suspend' [-Werror,-Wunused-function] static int mdio_bus_phy_suspend(struct device *dev) drivers/net/phy/phy_device.c:293:12: error: unused function 'mdio_bus_phy_resume' [-Werror,-Wunused-function] static int mdio_bus_phy_resume(struct device *dev) The logic is intentional, so just mark these two as __maybe_unused and remove the incorrect #ifdef. 
Fixes: 4c0d2e96ba05 ("net: phy: consider that suspend2ram may cut off PHY power") Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210225145748.404410-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index ce495473cd5d..cc38e326405a 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -230,7 +230,6 @@ static struct phy_driver genphy_driver; static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); -#ifdef CONFIG_PM static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { struct device_driver *drv = phydev->mdio.dev.driver; @@ -270,7 +269,7 @@ out: return !phydev->suspended; } -static int mdio_bus_phy_suspend(struct device *dev) +static __maybe_unused int mdio_bus_phy_suspend(struct device *dev) { struct phy_device *phydev = to_phy_device(dev); @@ -290,7 +289,7 @@ static int mdio_bus_phy_suspend(struct device *dev) return phy_suspend(phydev); } -static int mdio_bus_phy_resume(struct device *dev) +static __maybe_unused int mdio_bus_phy_resume(struct device *dev) { struct phy_device *phydev = to_phy_device(dev); int ret; @@ -316,7 +315,6 @@ no_resume: static SIMPLE_DEV_PM_OPS(mdio_bus_phy_pm_ops, mdio_bus_phy_suspend, mdio_bus_phy_resume); -#endif /* CONFIG_PM */ /** * phy_register_fixup - creates a new phy_fixup and adds it to the list From 6a4d7234ae9a3bb31181f348ade9bbdb55aeb5c5 Mon Sep 17 00:00:00 2001 From: Heiko Thiery Date: Thu, 25 Feb 2021 22:15:16 +0100 Subject: [PATCH 029/164] net: fec: ptp: avoid register access when ipg clock is disabled When accessing the timecounter register on an i.MX8MQ the kernel hangs. This is only the case when the interface is down. This can be reproduced by reading with 'phc_ctrl eth0 get'. 
As described in commit 91c0d987a9788dcc5fe26baafd73bf9242b68900, the ipg clock is disabled when the interface is down, and accessing the register then leads to a system hang. So we check the ptp clock status before reading the timecounter register. Signed-off-by: Heiko Thiery Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20210225211514.9115-1-heiko.thiery@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_ptp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index 2e344aada4c6..1753807cbf97 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -377,9 +377,16 @@ static int fec_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) u64 ns; unsigned long flags; + mutex_lock(&adapter->ptp_clk_mutex); + /* Check the ptp clock */ + if (!adapter->ptp_clk_on) { + mutex_unlock(&adapter->ptp_clk_mutex); + return -EINVAL; + } spin_lock_irqsave(&adapter->tmreg_lock, flags); ns = timecounter_read(&adapter->tc); spin_unlock_irqrestore(&adapter->tmreg_lock, flags); + mutex_unlock(&adapter->ptp_clk_mutex); *ts = ns_to_timespec64(ns); From 2107d45f17bedd7dbf4178462da0ac223835a2a7 Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Thu, 25 Feb 2021 15:26:28 -0800 Subject: [PATCH 030/164] tcp: Fix sign comparison bug in getsockopt(TCP_ZEROCOPY_RECEIVE) getsockopt(TCP_ZEROCOPY_RECEIVE) has a bug where we read a user-provided "len" field of type signed int, and then compare the value to the result of an "offsetofend" operation, which is unsigned. Negative values provided by the user will be promoted to large positive numbers; thus checking that len < offsetofend() will return false when the intention was that it return true. 
Note that while len is originally checked for negative values earlier on in do_tcp_getsockopt(), subsequent calls to get_user() re-read the value from userspace which may have changed in the meantime. Therefore, re-add the check for negative values after the call to get_user in the handler code for TCP_ZEROCOPY_RECEIVE. Fixes: c8856c051454 ("tcp-zerocopy: Return inq along with tcp receive zerocopy.") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Arjun Roy Link: https://lore.kernel.org/r/20210225232628.4033281-1-arjunroy.kdev@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a3422e42784e..dfb6f286c1de 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4143,7 +4143,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (get_user(len, optlen)) return -EFAULT; - if (len < offsetofend(struct tcp_zerocopy_receive, length)) + if (len < 0 || + len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { err = check_zeroed_user(optval + sizeof(zc), From edcbf5137f093b5502f5f6b97cce3cbadbde27aa Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 25 Feb 2021 18:57:19 +0200 Subject: [PATCH 031/164] selftests: forwarding: Fix race condition in mirror installation When mirroring to a gretap in hardware the device expects to be programmed with the egress port and all the encapsulating headers. This requires the driver to resolve the path the packet will take in the software data path and program the device accordingly. If the path cannot be resolved (in this case because of an unresolved neighbor), then mirror installation fails until the path is resolved. This results in a race that causes the test to sometimes fail. Fix this by setting the neighbor's state to permanent, so that it is always valid. 
Fixes: b5b029399fa6d ("selftests: forwarding: mirror_gre_bridge_1d_vlan: Add STP test") Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- .../net/forwarding/mirror_gre_bridge_1d_vlan.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh index 197e769c2ed1..f8cda822c1ce 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh @@ -86,11 +86,20 @@ test_ip6gretap() test_gretap_stp() { + # Sometimes after mirror installation, the neighbor's state is not valid. + # The reason is that there is no SW datapath activity related to the + # neighbor for the remote GRE address. Therefore whether the corresponding + # neighbor will be valid is a matter of luck, and the test is thus racy. + # Set the neighbor's state to permanent, so it would be always valid. + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_stp gt4 $swp3.555 "mirror to gretap" } test_ip6gretap_stp() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_stp gt6 $swp3.555 "mirror to ip6gretap" } From ae9b24ddb69b4e31cda1b5e267a5a08a1db11717 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 25 Feb 2021 18:57:20 +0200 Subject: [PATCH 032/164] mlxsw: spectrum_ethtool: Add an external speed to PTYS register Currently, only external bits are added to the PTYS register, whereas there is one external bit that is wrongly marked as internal, and so was recently removed from the register. Add that bit to the PTYS register again, as this bit is no longer internal. Its removal resulted in '100000baseLR4_ER4/Full' link mode no longer being supported, causing a regression on some setups. 
Fixes: 5bf01b571cf4 ("mlxsw: spectrum_ethtool: Remove internal speeds from PTYS register") Signed-off-by: Danielle Ratson Reported-by: Eddie Shklaer Tested-by: Eddie Shklaer Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 1 + drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c | 5 +++++ drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 3 ++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 16e2df6ef2f4..c4adc7f740d3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -4430,6 +4430,7 @@ MLXSW_ITEM32(reg, ptys, ext_eth_proto_cap, 0x08, 0, 32); #define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4 BIT(20) #define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 BIT(21) #define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 BIT(22) +#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4 BIT(23) #define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_CR BIT(27) #define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_KR BIT(28) #define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR BIT(29) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c index bd7f873f6290..0bd64169bf81 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c @@ -1169,6 +1169,11 @@ static const struct mlxsw_sp1_port_link_mode mlxsw_sp1_port_link_mode[] = { .mask_ethtool = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, .speed = SPEED_100000, }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4, + .mask_ethtool = ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT, + .speed = SPEED_100000, + }, }; #define MLXSW_SP1_PORT_LINK_MODE_LEN ARRAY_SIZE(mlxsw_sp1_port_link_mode) diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c index 
40e2e79d4517..131b2a53d261 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c +++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c @@ -613,7 +613,8 @@ static const struct mlxsw_sx_port_link_mode mlxsw_sx_port_link_mode[] = { { .mask = MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4 | MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 | - MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4, + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4, .speed = 100000, }, }; From dc860b88ce0a7ed9a048d5042cbb175daf60b657 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 25 Feb 2021 18:57:21 +0200 Subject: [PATCH 033/164] mlxsw: spectrum_router: Ignore routes using a deleted nexthop object Routes are currently processed from a workqueue whereas nexthop objects are processed in system call context. This can result in the driver not finding a suitable nexthop group for a route and issuing a warning [1]. Fix this by ignoring such routes earlier in the process. The subsequent deletion notification will be ignored as well. [1] WARNING: CPU: 2 PID: 7754 at drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:4853 mlxsw_sp_router_fib_event_work+0x1112/0x1e00 [mlxsw_spectrum] [...] CPU: 2 PID: 7754 Comm: kworker/u8:0 Not tainted 5.11.0-rc6-cq-20210207-1 #16 Hardware name: Mellanox Technologies Ltd. 
MSN2100/SA001390, BIOS 5.6.5 05/24/2018 Workqueue: mlxsw_core_ordered mlxsw_sp_router_fib_event_work [mlxsw_spectrum] RIP: 0010:mlxsw_sp_router_fib_event_work+0x1112/0x1e00 [mlxsw_spectrum] Fixes: cdd6cfc54c64 ("mlxsw: spectrum_router: Allow programming routes with nexthop objects") Signed-off-by: Ido Schimmel Reported-by: Alex Veber Tested-by: Alex Veber Reviewed-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 9ce90841f92d..eda99d82766a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5951,6 +5951,10 @@ mlxsw_sp_router_fib4_replace(struct mlxsw_sp *mlxsw_sp, if (mlxsw_sp->router->aborted) return 0; + if (fen_info->fi->nh && + !mlxsw_sp_nexthop_obj_group_lookup(mlxsw_sp, fen_info->fi->nh->id)) + return 0; + fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, fen_info->tb_id, &fen_info->dst, sizeof(fen_info->dst), fen_info->dst_len, @@ -6601,6 +6605,9 @@ static int mlxsw_sp_router_fib6_replace(struct mlxsw_sp *mlxsw_sp, if (mlxsw_sp_fib6_rt_should_ignore(rt)) return 0; + if (rt->nh && !mlxsw_sp_nexthop_obj_group_lookup(mlxsw_sp, rt->nh->id)) + return 0; + fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->fib6_table->tb6_id, &rt->fib6_dst.addr, sizeof(rt->fib6_dst.addr), From d20cd745218cde1b268bef5282095ec6c95a3ea2 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 26 Feb 2021 04:43:09 -0500 Subject: [PATCH 034/164] bnxt_en: Fix race between firmware reset and driver remove. The driver's error recovery reset sequence can take many seconds to complete and only the critical sections are protected by rtnl_lock. A recent change has introduced a regression in this sequence. bnxt_remove_one() may be called while the recovery is in progress. 
Normally, unregister_netdev() would cause bnxt_close_nic() to be called and this would cause the error recovery to safely abort with the BNXT_STATE_ABORT_ERR flag set in bnxt_close_nic(). Recently, we added bnxt_reinit_after_abort() to allow the user to reopen the device after an aborted recovery. This causes the regression in the scenario described above because we would attempt to re-open even after the netdev has been unregistered. Fix it by checking the netdev reg_state in bnxt_reinit_after_abort() and abort if it is unregistered. Fixes: 6882c36cf82e ("bnxt_en: attempt to reinitialize after aborted reset") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a680fd9c68ea..c55189c7bb36 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9890,6 +9890,9 @@ static int bnxt_reinit_after_abort(struct bnxt *bp) if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) return -EBUSY; + if (bp->dev->reg_state == NETREG_UNREGISTERED) + return -ENODEV; + rc = bnxt_fw_init_one(bp); if (!rc) { bnxt_clear_int_mode(bp); From 20d7d1c5c9b11e9f538ed4a2289be106de970d3e Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Fri, 26 Feb 2021 04:43:10 -0500 Subject: [PATCH 035/164] bnxt_en: reliably allocate IRQ table on reset to avoid crash The following trace excerpt corresponds with a NULL pointer dereference of 'bp->irq_tbl' in bnxt_setup_inta() on an Aarch64 system after many device resets: Unable to handle kernel NULL pointer dereference at ... 000000d ... 
pc : string+0x3c/0x80 lr : vsnprintf+0x294/0x7e0 sp : ffff00000f61ba70 pstate : 20000145 x29: ffff00000f61ba70 x28: 000000000000000d x27: ffff0000009c8b5a x26: ffff00000f61bb80 x25: ffff0000009c8b5a x24: 0000000000000012 x23: 00000000ffffffe0 x22: ffff000008990428 x21: ffff00000f61bb80 x20: 000000000000000d x19: 000000000000001f x18: 0000000000000000 x17: 0000000000000000 x16: ffff800b6d0fb400 x15: 0000000000000000 x14: ffff800b7fe31ae8 x13: 00001ed16472c920 x12: ffff000008c6b1c9 x11: ffff000008cf0580 x10: ffff00000f61bb80 x9 : 00000000ffffffd8 x8 : 000000000000000c x7 : ffff800b684b8000 x6 : 0000000000000000 x5 : 0000000000000065 x4 : 0000000000000001 x3 : ffff0a00ffffff04 x2 : 000000000000001f x1 : 0000000000000000 x0 : 000000000000000d Call trace: string+0x3c/0x80 vsnprintf+0x294/0x7e0 snprintf+0x44/0x50 __bnxt_open_nic+0x34c/0x928 [bnxt_en] bnxt_open+0xe8/0x238 [bnxt_en] __dev_open+0xbc/0x130 __dev_change_flags+0x12c/0x168 dev_change_flags+0x20/0x60 ... Ordinarily, a call to bnxt_setup_inta() (not in trace due to inlining) would not be expected on a system supporting MSIX at all. However, if bnxt_init_int_mode() does not end up being called after the call to bnxt_clear_int_mode() in bnxt_fw_reset_close(), then the driver will think that only INTA is supported and bp->irq_tbl will be NULL, causing the above crash. In the error recovery scenario, we call bnxt_clear_int_mode() in bnxt_fw_reset_close() early in the sequence. Ordinarily, we will call bnxt_init_int_mode() in bnxt_hwrm_if_change() after we reestablish communication with the firmware after reset. However, if the sequence has to abort before we call bnxt_init_int_mode() and if the user later attempts to re-open the device, then it will cause the crash above. We fix it in 2 ways: 1. Check for bp->irq_tbl in bnxt_setup_int_mode(). If it is NULL, call bnxt_init_init_mode(). 2. If we need to abort in bnxt_hwrm_if_change() and cannot complete the error recovery sequence, set the BNXT_STATE_ABORT_ERR flag. 
This will cause more drastic recovery at the next attempt to re-open the device, including a call to bnxt_init_int_mode(). Fixes: 3bc7d4a352ef ("bnxt_en: Add BNXT_STATE_IN_FW_RESET state.") Reviewed-by: Scott Branden Signed-off-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c55189c7bb36..b53a0d87371a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8556,10 +8556,18 @@ static void bnxt_setup_inta(struct bnxt *bp) bp->irq_tbl[0].handler = bnxt_inta; } +static int bnxt_init_int_mode(struct bnxt *bp); + static int bnxt_setup_int_mode(struct bnxt *bp) { int rc; + if (!bp->irq_tbl) { + rc = bnxt_init_int_mode(bp); + if (rc || !bp->irq_tbl) + return rc ?: -ENODEV; + } + if (bp->flags & BNXT_FLAG_USING_MSIX) bnxt_setup_msix(bp); else @@ -8744,7 +8752,7 @@ static int bnxt_init_inta(struct bnxt *bp) static int bnxt_init_int_mode(struct bnxt *bp) { - int rc = 0; + int rc = -ENODEV; if (bp->flags & BNXT_FLAG_MSIX_CAP) rc = bnxt_init_msix(bp); @@ -9514,7 +9522,8 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) { struct hwrm_func_drv_if_change_output *resp = bp->hwrm_cmd_resp_addr; struct hwrm_func_drv_if_change_input req = {0}; - bool resc_reinit = false, fw_reset = false; + bool fw_reset = !bp->irq_tbl; + bool resc_reinit = false; int rc, retry = 0; u32 flags = 0; @@ -9557,6 +9566,7 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state) && !fw_reset) { netdev_err(bp->dev, "RESET_DONE not set during FW reset.\n"); + set_bit(BNXT_STATE_ABORT_ERR, &bp->state); return -ENODEV; } if (resc_reinit || fw_reset) { From c33cb0020ee6dd96cc9976d6085a7d8422f6dbed Mon Sep 17 00:00:00 2001 From: "Dmitry V. 
Levin" Date: Mon, 22 Feb 2021 08:00:00 +0000 Subject: [PATCH 036/164] uapi: nfnetlink_cthelper.h: fix userspace compilation error Apparently, linux/netfilter/nfnetlink_cthelper.h and linux/netfilter/nfnetlink_acct.h could not be included into the same compilation unit because of a cut-and-paste typo in the former header. Fixes: 12f7a505331e6 ("netfilter: add user-space connection tracking helper infrastructure") Cc: stable@vger.kernel.org # v3.6 Signed-off-by: Dmitry V. Levin Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_cthelper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/netfilter/nfnetlink_cthelper.h b/include/uapi/linux/netfilter/nfnetlink_cthelper.h index a13137afc429..70af02092d16 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cthelper.h +++ b/include/uapi/linux/netfilter/nfnetlink_cthelper.h @@ -5,7 +5,7 @@ #define NFCT_HELPER_STATUS_DISABLED 0 #define NFCT_HELPER_STATUS_ENABLED 1 -enum nfnl_acct_msg_types { +enum nfnl_cthelper_msg_types { NFNL_MSG_CTHELPER_NEW, NFNL_MSG_CTHELPER_GET, NFNL_MSG_CTHELPER_DEL, From c57ea2d7d81fbaa72c7d0ffbff61ade1039f4a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klemen=20Ko=C5=A1ir?= Date: Sat, 20 Feb 2021 18:29:26 +0900 Subject: [PATCH 037/164] netfilter: conntrack: Remove a double space in a log message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed an extra space in a log message and an extra blank line in code.
Signed-off-by: Klemen Košir Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 118f415928ae..b055187235f8 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -219,7 +219,7 @@ nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) return NULL; pr_info("nf_conntrack: default automatic helper assignment " "has been turned off for security reasons and CT-based " - " firewall rule not found. Use the iptables CT target " + "firewall rule not found. Use the iptables CT target " "to attach helpers instead.\n"); net->ct.auto_assign_helper_warned = 1; return NULL; @@ -228,7 +228,6 @@ nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); } - int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { From 03a3ca37e4c6478e3a84f04c8429dd5889e107fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 24 Feb 2021 17:23:19 +0100 Subject: [PATCH 038/164] netfilter: nf_nat: undo erroneous tcp edemux lookup Under extremely rare conditions TCP early demux will retrieve the wrong socket. 1. local machine establishes a connection to a remote server, S, on port p. This gives: laddr:lport -> S:p ... both in tcp and conntrack. 2. local machine establishes a connection to host H, on port p2. 2a. TCP stack choses same laddr:lport, so we have laddr:lport -> H:p2 from TCP point of view. 2b). There is a destination NAT rewrite in place, translating H:p2 to S:p. This results in following conntrack entries: I) laddr:lport -> S:p (origin) S:p -> laddr:lport (reply) II) laddr:lport -> H:p2 (origin) S:p -> laddr:lport2 (reply) NAT engine has rewritten laddr:lport to laddr:lport2 to map the reply packet to the correct origin. 
When server sends SYN/ACK to laddr:lport2, the PREROUTING hook will undo the SNAT transformation, rewriting IP header to S:p -> laddr:lport. This causes TCP early demux to associate the skb with the TCP socket of the first connection. The INPUT hook will then reverse the DNAT transformation, rewriting the IP header to H:p2 -> laddr:lport. Because the packet ends up with the wrong socket, the new connection never completes: originator stays in SYN_SENT and conntrack entry remains in SYN_RECV until timeout, and responder retransmits SYN/ACK until it gives up. To resolve this, orphan the skb after the input rewrite: Because the source IP address changed, the socket must be incorrect. We can't move the DNAT undo to prerouting due to backwards compatibility: doing so would make iptables/nftables rules no longer match the way they did. After orphan, the packet will be handed to the next protocol layer (tcp, udp, ...) and that will repeat the socket lookup just as if early demux was disabled.
Fixes: 41063e9dd1195 ("ipv4: Early TCP socket demux.") Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1427 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_proto.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index e87b6bd6b3cd..4731d21fc3ad 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -646,8 +646,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, } static unsigned int -nf_nat_ipv4_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; @@ -659,6 +659,23 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb, return ret; } +static unsigned int +nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + __be32 saddr = ip_hdr(skb)->saddr; + struct sock *sk = skb->sk; + unsigned int ret; + + ret = nf_nat_ipv4_fn(priv, skb, state); + + if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr && + !inet_sk_transparent(sk)) + skb_orphan(skb); /* TCP edemux obtained wrong socket */ + + return ret; +} + static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -736,7 +753,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_in, + .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, @@ -757,7 +774,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_fn, + .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, From 
07b5a76e18925a595bfef44531dbf2f397bb5507 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 24 Feb 2021 17:23:20 +0100 Subject: [PATCH 039/164] netfilter: conntrack: avoid misleading 'invalid' in log message The packet is not flagged as invalid: conntrack will accept it and its associated with the conntrack entry. This happens e.g. when receiving a retransmitted SYN in SYN_RECV state. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1d7e1c595546..ec23330687a5 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -982,8 +982,10 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in " - "state %s ", tcp_conntrack_names[old_state]); + nf_ct_l4proto_log_invalid(skb, ct, + "packet (index %d) in dir %d ignored, state %s", + index, dir, + tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or From c2c16ccba2f55d527dd145a5d8c038694b3b343f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 24 Feb 2021 17:23:21 +0100 Subject: [PATCH 040/164] selftests: netfilter: test nat port clash resolution interaction with tcp early demux Convert Antonio Ojeas bug reproducer to a kselftest. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/Makefile | 2 +- .../selftests/netfilter/nf_nat_edemux.sh | 99 +++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/nf_nat_edemux.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 3006a8e5b41a..3171069a6b46 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -4,7 +4,7 @@ TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ - nft_queue.sh nft_meta.sh \ + nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ ipip-conntrack-mtu.sh LDLIBS = -lmnl diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh new file mode 100755 index 000000000000..cfee3b65be0f --- /dev/null +++ b/tools/testing/selftests/netfilter/nf_nat_edemux.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test NAT source port clash resolution +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" + +cleanup() +{ + ip netns del $ns1 + ip netns del $ns2 +} + +iperf3 -v > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iperf3" + exit $ksft_skip +fi + +iptables --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iptables" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add "$ns1" +if [ $? 
-ne 0 ];then + echo "SKIP: Could not create net namespace $ns1" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add $ns2 + +# Connect the namespaces using a veth pair +ip link add name veth2 type veth peer name veth1 +ip link set netns $ns1 dev veth1 +ip link set netns $ns2 dev veth2 + +ip netns exec $ns1 ip link set up dev lo +ip netns exec $ns1 ip link set up dev veth1 +ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 + +ip netns exec $ns2 ip link set up dev lo +ip netns exec $ns2 ip link set up dev veth2 +ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 + +# Create a server in one namespace +ip netns exec $ns1 iperf3 -s > /dev/null 2>&1 & +iperfs=$! + +# Restrict source port to just one so we don't have to exhaust +# all others. +ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" + +# add a virtual IP using DNAT +ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 + +# ... and route it to the other namespace +ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 + +sleep 1 + +# add a persistent connection from the other namespace +ip netns exec $ns2 nc -q 10 -w 10 192.168.1.1 5201 > /dev/null & + +sleep 1 + +# ip daddr:dport will be rewritten to 192.168.1.1 5201 +# NAT must reallocate source port 10000 because +# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use +echo test | ip netns exec $ns2 nc -w 3 -q 3 10.96.0.1 443 >/dev/null +ret=$? + +kill $iperfs + +# Check nc can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). 
+if [ $ret -eq 0 ]; then + echo "PASS: nc can connect via NAT'd address" +else + echo "FAIL: nc cannot connect via NAT'd address" + exit 1 +fi + +exit 0 From 8e24edddad152b998b37a7f583175137ed2e04a5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 27 Feb 2021 11:27:45 +0300 Subject: [PATCH 041/164] netfilter: x_tables: gpf inside xt_find_revision() nested target/match_revfn() calls work with xt[NFPROTO_UNSPEC] lists without taking xt[NFPROTO_UNSPEC].mutex. This can race with module unload and cause host to crash: general protection fault: 0000 [#1] Modules linked in: ... [last unloaded: xt_cluster] CPU: 0 PID: 542455 Comm: iptables RIP: 0010:[] [] strcmp+0x18/0x40 RDX: 0000000000000003 RSI: ffff9a5a5d9abe10 RDI: dead000000000111 R13: ffff9a5a5d9abe10 R14: ffff9a5a5d9abd8c R15: dead000000000100 (VvS: %R15 -- &xt_match, %RDI -- &xt_match.name, xt_cluster unregister match in xt[NFPROTO_UNSPEC].match list) Call Trace: [] match_revfn+0x54/0xc0 [] match_revfn+0xaf/0xc0 [] xt_find_revision+0x6e/0xf0 [] do_ipt_get_ctl+0x100/0x420 [ip_tables] [] nf_getsockopt+0x4f/0x70 [] ip_getsockopt+0xde/0x100 [] raw_getsockopt+0x25/0x50 [] sock_common_getsockopt+0x1a/0x20 [] SyS_getsockopt+0x7d/0xf0 [] system_call_fastpath+0x25/0x2a Fixes: 656caff20e1 ("netfilter 04/09: x_tables: fix match/target revision lookup") Signed-off-by: Vasily Averin Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index acce622582e3..bce6ca203d46 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -330,6 +330,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) const struct xt_match *m; int have_rev = 0; + mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision > *bestp) @@ -338,6 +339,7 @@ static int match_revfn(u8 af, const 
char *name, u8 revision, int *bestp) have_rev = 1; } } + mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); @@ -350,6 +352,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) const struct xt_target *t; int have_rev = 0; + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision > *bestp) @@ -358,6 +361,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); @@ -371,12 +375,10 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, { int have_rev, best = -1; - mutex_lock(&xt[af].mutex); if (target == 1) have_rev = target_revfn(af, name, revision, &best); else have_rev = match_revfn(af, name, revision, &best); - mutex_unlock(&xt[af].mutex); /* Nothing at all? Return 0 to try loading module. */ if (best == -1) { From a4fc088ad4ff4a99d01978aa41065132b574b4b2 Mon Sep 17 00:00:00 2001 From: Yinjun Zhang Date: Thu, 25 Feb 2021 13:51:02 +0100 Subject: [PATCH 042/164] ethtool: fix the check logic of at least one channel for RX/TX The command "ethtool -L combined 0" may clean the RX/TX channel count and skip the error path, since the attrs tb[ETHTOOL_A_CHANNELS_RX_COUNT] and tb[ETHTOOL_A_CHANNELS_TX_COUNT] are NULL in this case when recent ethtool is used. Tested using ethtool v5.10. 
Fixes: 7be92514b99c ("ethtool: check if there is at least one channel for TX/RX in the core") Signed-off-by: Yinjun Zhang Signed-off-by: Simon Horman Signed-off-by: Louis Peens Link: https://lore.kernel.org/r/20210225125102.23989-1-simon.horman@netronome.com Signed-off-by: Jakub Kicinski --- net/ethtool/channels.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 25a9e566ef5c..6a070dc8e4b0 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -116,10 +116,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) struct ethtool_channels channels = {}; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; - const struct nlattr *err_attr; + u32 err_attr, max_rx_in_use = 0; const struct ethtool_ops *ops; struct net_device *dev; - u32 max_rx_in_use = 0; int ret; ret = ethnl_parse_header_dev_get(&req_info, @@ -157,34 +156,35 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) /* ensure new channel counts are within limits */ if (channels.rx_count > channels.max_rx) - err_attr = tb[ETHTOOL_A_CHANNELS_RX_COUNT]; + err_attr = ETHTOOL_A_CHANNELS_RX_COUNT; else if (channels.tx_count > channels.max_tx) - err_attr = tb[ETHTOOL_A_CHANNELS_TX_COUNT]; + err_attr = ETHTOOL_A_CHANNELS_TX_COUNT; else if (channels.other_count > channels.max_other) - err_attr = tb[ETHTOOL_A_CHANNELS_OTHER_COUNT]; + err_attr = ETHTOOL_A_CHANNELS_OTHER_COUNT; else if (channels.combined_count > channels.max_combined) - err_attr = tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT]; + err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; else - err_attr = NULL; + err_attr = 0; if (err_attr) { ret = -EINVAL; - NL_SET_ERR_MSG_ATTR(info->extack, err_attr, + NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel count exceeds maximum"); goto out_ops; } /* ensure there is at least one RX and one TX channel */ if (!channels.combined_count && !channels.rx_count) 
- err_attr = tb[ETHTOOL_A_CHANNELS_RX_COUNT]; + err_attr = ETHTOOL_A_CHANNELS_RX_COUNT; else if (!channels.combined_count && !channels.tx_count) - err_attr = tb[ETHTOOL_A_CHANNELS_TX_COUNT]; + err_attr = ETHTOOL_A_CHANNELS_TX_COUNT; else - err_attr = NULL; + err_attr = 0; if (err_attr) { if (mod_combined) - err_attr = tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT]; + err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; ret = -EINVAL; - NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested channel counts would result in no RX or TX channel being configured"); + NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], + "requested channel counts would result in no RX or TX channel being configured"); goto out_ops; } From d313d16bbaea0f11a2e98f04a6c678b43c208915 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 26 Feb 2021 14:20:38 +0100 Subject: [PATCH 043/164] net: broadcom: bcm4908_enet: enable RX after processing packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When receiving a lot of packets hardware may run out of free descriptors and stop the RX ring. Enable it every time after handling received packets.
Fixes: 4feffeadbcb2 ("net: broadcom: bcm4908enet: add BCM4908 controller driver") Signed-off-by: Rafał Miłecki Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20210226132038.29849-1-zajec5@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcm4908_enet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index 0b70e9e0ddad..98cf82dea3e4 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -592,6 +592,9 @@ static int bcm4908_enet_poll(struct napi_struct *napi, int weight) bcm4908_enet_intrs_on(enet); } + /* Hardware could disable ring if it run out of descriptors */ + bcm4908_enet_dma_rx_ring_enable(enet, &enet->rx_ring); + return handled; } From 89e5c58fc1e2857ccdaae506fb8bc5fed57ee063 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Feb 2021 22:22:48 +0100 Subject: [PATCH 044/164] net: Fix gro aggregation for udp encaps with zero csum We noticed a GRO issue for UDP-based encaps such as vxlan/geneve when the csum for the UDP header itself is 0. In that case, GRO aggregation does not take place on the phys dev, but instead is deferred to the vxlan/geneve driver (see trace below). The reason is essentially that GRO aggregation bails out in udp_gro_receive() for such case when drivers marked the skb with CHECKSUM_UNNECESSARY (ice, i40e, others) where for non-zero csums 2abb7cdc0dc8 ("udp: Add support for doing checksum unnecessary conversion") promotes those skbs to CHECKSUM_COMPLETE and napi context has csum_valid set. This is however not the case for zero UDP csum (here: csum_cnt is still 0 and csum_valid continues to be false). At the same time 57c67ff4bd92 ("udp: additional GRO support") added matches on !uh->check ^ !uh2->check as part to determine candidates for aggregation, so it certainly is expected to handle zero csums in udp_gro_receive(). 
The purpose of the check added via 662880f44203 ("net: Allow GRO to use and set levels of checksum unnecessary") seems to catch bad csum and stop aggregation right away. One way to fix aggregation in the zero case is to only perform the !csum_valid check in udp_gro_receive() if uh->check is infact non-zero. Before: [...] swapper 0 [008] 731.946506: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497100400 len=1500 (1) swapper 0 [008] 731.946507: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497100200 len=1500 swapper 0 [008] 731.946507: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497101100 len=1500 swapper 0 [008] 731.946508: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497101700 len=1500 swapper 0 [008] 731.946508: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497101b00 len=1500 swapper 0 [008] 731.946508: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497100600 len=1500 swapper 0 [008] 731.946508: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497100f00 len=1500 swapper 0 [008] 731.946509: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497100a00 len=1500 swapper 0 [008] 731.946516: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497100500 len=1500 swapper 0 [008] 731.946516: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497100700 len=1500 swapper 0 [008] 731.946516: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497101d00 len=1500 (2) swapper 0 [008] 731.946517: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497101000 len=1500 swapper 0 [008] 731.946517: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497101c00 len=1500 swapper 0 [008] 731.946517: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497101400 len=1500 swapper 0 [008] 731.946518: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497100e00 len=1500 swapper 0 [008] 731.946518: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff966497101600 len=1500 swapper 0 [008] 731.946521: net:netif_receive_skb: 
dev=enp10s0f0 skbaddr=0xffff966497100800 len=774 swapper 0 [008] 731.946530: net:netif_receive_skb: dev=test_vxlan skbaddr=0xffff966497100400 len=14032 (1) swapper 0 [008] 731.946530: net:netif_receive_skb: dev=test_vxlan skbaddr=0xffff966497101d00 len=9112 (2) [...] # netperf -H 10.55.10.4 -t TCP_STREAM -l 20 MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.55.10.4 () port 0 AF_INET : demo Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 20.01 13129.24 After: [...] swapper 0 [026] 521.862641: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff93ab0d479000 len=11286 (1) swapper 0 [026] 521.862643: net:netif_receive_skb: dev=test_vxlan skbaddr=0xffff93ab0d479000 len=11236 (1) swapper 0 [026] 521.862650: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff93ab0d478500 len=2898 (2) swapper 0 [026] 521.862650: net:netif_receive_skb: dev=enp10s0f0 skbaddr=0xffff93ab0d479f00 len=8490 (3) swapper 0 [026] 521.862653: net:netif_receive_skb: dev=test_vxlan skbaddr=0xffff93ab0d478500 len=2848 (2) swapper 0 [026] 521.862653: net:netif_receive_skb: dev=test_vxlan skbaddr=0xffff93ab0d479f00 len=8440 (3) [...] # netperf -H 10.55.10.4 -t TCP_STREAM -l 20 MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.55.10.4 () port 0 AF_INET : demo Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 
10^6bits/sec 87380 16384 16384 20.01 24576.53 Fixes: 57c67ff4bd92 ("udp: additional GRO support") Fixes: 662880f44203 ("net: Allow GRO to use and set levels of checksum unnecessary") Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Jesse Brandeburg Cc: Tom Herbert Acked-by: Willem de Bruijn Acked-by: John Fastabend Link: https://lore.kernel.org/r/20210226212248.8300-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b76c48efd37e..c5b4b586570f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -526,7 +526,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, } if (!sk || NAPI_GRO_CB(skb)->encap_mark || - (skb->ip_summed != CHECKSUM_PARTIAL && + (uh->check && skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid) || !udp_sk(sk)->gro_receive) From ae85ddda0f1b341b2d25f5a5e0eff1d42b6ef3df Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Sat, 27 Feb 2021 15:24:51 +0800 Subject: [PATCH 045/164] net: hns3: fix error mask definition of flow director Currently, some bit field definitions of the flow director TCAM configuration command are incorrect. Since the wrong MSB is always 0 and these fields are assigned in order, it still works. Fix it by redefining them. 
Fixes: 117328680288 ("net: hns3: Add input key and action config support for flow director") Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index ff52a65b4cff..057dda735492 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -1053,16 +1053,16 @@ struct hclge_fd_tcam_config_3_cmd { #define HCLGE_FD_AD_DROP_B 0 #define HCLGE_FD_AD_DIRECT_QID_B 1 #define HCLGE_FD_AD_QID_S 2 -#define HCLGE_FD_AD_QID_M GENMASK(12, 2) +#define HCLGE_FD_AD_QID_M GENMASK(11, 2) #define HCLGE_FD_AD_USE_COUNTER_B 12 #define HCLGE_FD_AD_COUNTER_NUM_S 13 #define HCLGE_FD_AD_COUNTER_NUM_M GENMASK(20, 13) #define HCLGE_FD_AD_NXT_STEP_B 20 #define HCLGE_FD_AD_NXT_KEY_S 21 -#define HCLGE_FD_AD_NXT_KEY_M GENMASK(26, 21) +#define HCLGE_FD_AD_NXT_KEY_M GENMASK(25, 21) #define HCLGE_FD_AD_WR_RULE_ID_B 0 #define HCLGE_FD_AD_RULE_ID_S 1 -#define HCLGE_FD_AD_RULE_ID_M GENMASK(13, 1) +#define HCLGE_FD_AD_RULE_ID_M GENMASK(12, 1) #define HCLGE_FD_AD_TC_OVRD_B 16 #define HCLGE_FD_AD_TC_SIZE_S 17 #define HCLGE_FD_AD_TC_SIZE_M GENMASK(20, 17) From c75ec148a316e8cf52274d16b9b422703b96f5ce Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Sat, 27 Feb 2021 15:24:52 +0800 Subject: [PATCH 046/164] net: hns3: fix query vlan mask value error for flow director Currently, the driver returns VLAN_VID_MASK for vlan mask field, when get flow director rule information for rule doesn't use vlan. 
It may cause the vlan mask value display as 0xf000 in this case, like below: estuary:/$ ethtool -u eth1 50 RX rings available Total 1 rules Filter: 2 Rule Type: TCP over IPv4 Src IP addr: 0.0.0.0 mask: 255.255.255.255 Dest IP addr: 0.0.0.0 mask: 255.255.255.255 TOS: 0x0 mask: 0xff Src port: 0 mask: 0xffff Dest port: 0 mask: 0xffff VLAN EtherType: 0x0 mask: 0xffff VLAN: 0x0 mask: 0xf000 User-defined: 0x1234 mask: 0x0 Action: Direct to queue 3 Fix it by return 0. Fixes: 05c2314fe6a8 ("net: hns3: Add support for rule query of flow director") Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 34b744df6709..932cfd1fb7e9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6330,8 +6330,7 @@ static void hclge_fd_get_ext_info(struct ethtool_rx_flow_spec *fs, fs->h_ext.vlan_tci = cpu_to_be16(rule->tuples.vlan_tag1); fs->m_ext.vlan_tci = rule->unused_tuple & BIT(INNER_VLAN_TAG_FST) ? - cpu_to_be16(VLAN_VID_MASK) : - cpu_to_be16(rule->tuples_mask.vlan_tag1); + 0 : cpu_to_be16(rule->tuples_mask.vlan_tag1); } if (fs->flow_type & FLOW_MAC_EXT) { From b36fc875bcdee56865c444a2cdae17d354a6d5f5 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Sat, 27 Feb 2021 15:24:53 +0800 Subject: [PATCH 047/164] net: hns3: fix bug when calculating the TCAM table info The function hclge_fd_convert_tuple() is used to convert tuples and tuples mask to TCAM x and y. But it misuses the source mac as source mac mask when convert INNER_SRC_MAC, which may cause the flow director rule works unexpectedly. So fix it. 
Fixes: 117328680288 ("net: hns3: Add input key and action config support for flow director") Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 932cfd1fb7e9..e3f81c7e0ce7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -5245,9 +5245,9 @@ static bool hclge_fd_convert_tuple(u32 tuple_bit, u8 *key_x, u8 *key_y, case BIT(INNER_SRC_MAC): for (i = 0; i < ETH_ALEN; i++) { calc_x(key_x[ETH_ALEN - 1 - i], rule->tuples.src_mac[i], - rule->tuples.src_mac[i]); + rule->tuples_mask.src_mac[i]); calc_y(key_y[ETH_ALEN - 1 - i], rule->tuples.src_mac[i], - rule->tuples.src_mac[i]); + rule->tuples_mask.src_mac[i]); } return true; From 449052cfebf624b670faa040245d3feed770d22f Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 18 Feb 2021 19:00:35 +0800 Subject: [PATCH 048/164] can: flexcan: assert FRZ bit in flexcan_chip_freeze() Assert HALT bit to enter freeze mode, there is a premise that FRZ bit is asserted. This patch asserts FRZ bit in flexcan_chip_freeze, although the reset value is 1b'1. This is a prepare patch, later patch will invoke flexcan_chip_freeze() to enter freeze mode, which polling freeze mode acknowledge. 
Fixes: b1aa1c7a2165b ("can: flexcan: fix transition from and to freeze mode in chip_{,un}freeze") Link: https://lore.kernel.org/r/20210218110037.16591-2-qiangqing.zhang@nxp.com Signed-off-by: Joakim Zhang Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 971ada36e37f..ee2d4967d66a 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -701,7 +701,7 @@ static int flexcan_chip_freeze(struct flexcan_priv *priv) u32 reg; reg = priv->read(&regs->mcr); - reg |= FLEXCAN_MCR_HALT; + reg |= FLEXCAN_MCR_FRZ | FLEXCAN_MCR_HALT; priv->write(reg, &regs->mcr); while (timeout-- && !(priv->read(&regs->mcr) & FLEXCAN_MCR_FRZ_ACK)) From ec15e27cc8904605846a354bb1f808ea1432f853 Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 18 Feb 2021 19:00:36 +0800 Subject: [PATCH 049/164] can: flexcan: enable RX FIFO after FRZ/HALT valid RX FIFO enable failed could happen when do system reboot stress test: [ 0.303958] flexcan 5a8d0000.can: 5a8d0000.can supply xceiver not found, using dummy regulator [ 0.304281] flexcan 5a8d0000.can (unnamed net_device) (uninitialized): Could not enable RX FIFO, unsupported core [ 0.314640] flexcan 5a8d0000.can: registering netdev failed [ 0.320728] flexcan 5a8e0000.can: 5a8e0000.can supply xceiver not found, using dummy regulator [ 0.320991] flexcan 5a8e0000.can (unnamed net_device) (uninitialized): Could not enable RX FIFO, unsupported core [ 0.331360] flexcan 5a8e0000.can: registering netdev failed [ 0.337444] flexcan 5a8f0000.can: 5a8f0000.can supply xceiver not found, using dummy regulator [ 0.337716] flexcan 5a8f0000.can (unnamed net_device) (uninitialized): Could not enable RX FIFO, unsupported core [ 0.348117] flexcan 5a8f0000.can: registering netdev failed RX FIFO should be enabled after the FRZ/HALT are valid. But the current code enable RX FIFO and FRZ/HALT at the same time. 
Fixes: e955cead03117 ("CAN: Add Flexcan CAN controller driver") Link: https://lore.kernel.org/r/20210218110037.16591-3-qiangqing.zhang@nxp.com Signed-off-by: Joakim Zhang Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index ee2d4967d66a..e66a51dbea0a 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -1865,10 +1865,14 @@ static int register_flexcandev(struct net_device *dev) if (err) goto out_chip_disable; - /* set freeze, halt and activate FIFO, restrict register access */ + /* set freeze, halt */ + err = flexcan_chip_freeze(priv); + if (err) + goto out_chip_disable; + + /* activate FIFO, restrict register access */ reg = priv->read(&regs->mcr); - reg |= FLEXCAN_MCR_FRZ | FLEXCAN_MCR_HALT | - FLEXCAN_MCR_FEN | FLEXCAN_MCR_SUPV; + reg |= FLEXCAN_MCR_FEN | FLEXCAN_MCR_SUPV; priv->write(reg, &regs->mcr); /* Currently we only support newer versions of this core From c63820045e2000f05657467a08715c18c9f490d9 Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 18 Feb 2021 19:00:37 +0800 Subject: [PATCH 050/164] can: flexcan: invoke flexcan_chip_freeze() to enter freeze mode Invoke flexcan_chip_freeze() to enter freeze mode, since need poll freeze mode acknowledge. 
Fixes: e955cead03117 ("CAN: Add Flexcan CAN controller driver") Link: https://lore.kernel.org/r/20210218110037.16591-4-qiangqing.zhang@nxp.com Signed-off-by: Joakim Zhang Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index e66a51dbea0a..134c05757a3b 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -1480,10 +1480,13 @@ static int flexcan_chip_start(struct net_device *dev) flexcan_set_bittiming(dev); + /* set freeze, halt */ + err = flexcan_chip_freeze(priv); + if (err) + goto out_chip_disable; + /* MCR * - * enable freeze - * halt now * only supervisor access * enable warning int * enable individual RX masking @@ -1492,9 +1495,8 @@ static int flexcan_chip_start(struct net_device *dev) */ reg_mcr = priv->read(&regs->mcr); reg_mcr &= ~FLEXCAN_MCR_MAXMB(0xff); - reg_mcr |= FLEXCAN_MCR_FRZ | FLEXCAN_MCR_HALT | FLEXCAN_MCR_SUPV | - FLEXCAN_MCR_WRN_EN | FLEXCAN_MCR_IRMQ | FLEXCAN_MCR_IDAM_C | - FLEXCAN_MCR_MAXMB(priv->tx_mb_idx); + reg_mcr |= FLEXCAN_MCR_SUPV | FLEXCAN_MCR_WRN_EN | FLEXCAN_MCR_IRMQ | + FLEXCAN_MCR_IDAM_C | FLEXCAN_MCR_MAXMB(priv->tx_mb_idx); /* MCR * From 2afe72ead5ab672c8012bda83cbe65f8145568e0 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 22 Feb 2021 20:46:06 +0100 Subject: [PATCH 051/164] can: mcp251xfd: revert "can: mcp251xfd: add BQL support" In the following 4 patches | 99842c9685ab can: dev: can_rx_offload_get_echo_skb(): extend to return can frame length | 9420e1d495e2 can: dev: can_get_echo_skb(): extend to return can frame length | 1dcb6e57db83 can: dev: can_put_echo_skb(): extend to handle frame_len | f0ef72febc9a can: dev: extend struct can_skb_priv to hold CAN frame length the CAN echo SKB support was extended to hold the CAN frame length (which is the length of the CAN frame on the wire). 
It is meant as a helper for BQL support, to avoid the re-calculation of the frame length before sending it and on TX-completion. However if the CAN frame is send without the request to be looped back the SKB is discarded in can_put_echo_skb() and the subsequent can_get_echo_skb() and can_rx_offload_get_echo_skb() return 0 for the CAN frame length. This results in BQL stalling the TX queue after a few packages. Until the BQL helpers can_get_echo_skb() and can_rx_offload_get_echo_skb() are fixed, revert the BQL support for the mcp251xfd driver. This reverts commit 4162e18e949ba520d5116ac0323500355479a00e. Fixes: 4162e18e949b ("can: mcp251xfd: add BQL support") Cc: Manivannan Sadhasivam Cc: Thomas Kopp Link: https://lore.kernel.org/r/20210228083347.28580-1-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- .../net/can/spi/mcp251xfd/mcp251xfd-core.c | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 3c5b92911d46..799e9d5d3481 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -335,8 +335,6 @@ static void mcp251xfd_ring_init(struct mcp251xfd_priv *priv) u8 len; int i, j; - netdev_reset_queue(priv->ndev); - /* TEF */ tef_ring = priv->tef; tef_ring->head = 0; @@ -1249,8 +1247,7 @@ mcp251xfd_handle_tefif_recover(const struct mcp251xfd_priv *priv, const u32 seq) static int mcp251xfd_handle_tefif_one(struct mcp251xfd_priv *priv, - const struct mcp251xfd_hw_tef_obj *hw_tef_obj, - unsigned int *frame_len_ptr) + const struct mcp251xfd_hw_tef_obj *hw_tef_obj) { struct net_device_stats *stats = &priv->ndev->stats; u32 seq, seq_masked, tef_tail_masked; @@ -1272,8 +1269,7 @@ mcp251xfd_handle_tefif_one(struct mcp251xfd_priv *priv, stats->tx_bytes += can_rx_offload_get_echo_skb(&priv->offload, mcp251xfd_get_tef_tail(priv), - hw_tef_obj->ts, - frame_len_ptr); + hw_tef_obj->ts, 
NULL); stats->tx_packets++; priv->tef->tail++; @@ -1331,7 +1327,6 @@ mcp251xfd_tef_obj_read(const struct mcp251xfd_priv *priv, static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) { struct mcp251xfd_hw_tef_obj hw_tef_obj[MCP251XFD_TX_OBJ_NUM_MAX]; - unsigned int total_frame_len = 0; u8 tef_tail, len, l; int err, i; @@ -1353,9 +1348,7 @@ static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) } for (i = 0; i < len; i++) { - unsigned int frame_len; - - err = mcp251xfd_handle_tefif_one(priv, &hw_tef_obj[i], &frame_len); + err = mcp251xfd_handle_tefif_one(priv, &hw_tef_obj[i]); /* -EAGAIN means the Sequence Number in the TEF * doesn't match our tef_tail. This can happen if we * read the TEF objects too early. Leave loop let the @@ -1365,8 +1358,6 @@ static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) goto out_netif_wake_queue; if (err) return err; - - total_frame_len += frame_len; } out_netif_wake_queue: @@ -1397,7 +1388,6 @@ static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) return err; tx_ring->tail += len; - netdev_completed_queue(priv->ndev, len, total_frame_len); err = mcp251xfd_check_tef_tail(priv); if (err) @@ -2443,7 +2433,6 @@ static netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb, struct mcp251xfd_priv *priv = netdev_priv(ndev); struct mcp251xfd_tx_ring *tx_ring = priv->tx; struct mcp251xfd_tx_obj *tx_obj; - unsigned int frame_len; u8 tx_head; int err; @@ -2462,9 +2451,7 @@ static netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb, if (mcp251xfd_get_tx_free(tx_ring) == 0) netif_stop_queue(ndev); - frame_len = can_skb_get_frame_len(skb); - can_put_echo_skb(skb, ndev, tx_head, frame_len); - netdev_sent_queue(priv->ndev, frame_len); + can_put_echo_skb(skb, ndev, tx_head, 0); err = mcp251xfd_tx_obj_write(priv, tx_obj); if (err) From e940e0895a82c6fbaa259f2615eb52b57ee91a7e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 26 Feb 2021 10:24:56 +0100 Subject: [PATCH 052/164] can: skb: can_skb_set_owner(): fix 
ref counting if socket was closed before setting skb ownership There are two ref count variables controlling the free()ing of a socket: - struct sock::sk_refcnt - which is changed by sock_hold()/sock_put() - struct sock::sk_wmem_alloc - which accounts the memory allocated by the skbs in the send path. In case there are still TX skbs on the fly and the socket() is closed, the struct sock::sk_refcnt reaches 0. In the TX-path the CAN stack clones an "echo" skb, calls sock_hold() on the original socket and references it. This produces the following back trace: | WARNING: CPU: 0 PID: 280 at lib/refcount.c:25 refcount_warn_saturate+0x114/0x134 | refcount_t: addition on 0; use-after-free. | Modules linked in: coda_vpu(E) v4l2_jpeg(E) videobuf2_vmalloc(E) imx_vdoa(E) | CPU: 0 PID: 280 Comm: test_can.sh Tainted: G E 5.11.0-04577-gf8ff6603c617 #203 | Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) | Backtrace: | [<80bafea4>] (dump_backtrace) from [<80bb0280>] (show_stack+0x20/0x24) r7:00000000 r6:600f0113 r5:00000000 r4:81441220 | [<80bb0260>] (show_stack) from [<80bb593c>] (dump_stack+0xa0/0xc8) | [<80bb589c>] (dump_stack) from [<8012b268>] (__warn+0xd4/0x114) r9:00000019 r8:80f4a8c2 r7:83e4150c r6:00000000 r5:00000009 r4:80528f90 | [<8012b194>] (__warn) from [<80bb09c4>] (warn_slowpath_fmt+0x88/0xc8) r9:83f26400 r8:80f4a8d1 r7:00000009 r6:80528f90 r5:00000019 r4:80f4a8c2 | [<80bb0940>] (warn_slowpath_fmt) from [<80528f90>] (refcount_warn_saturate+0x114/0x134) r8:00000000 r7:00000000 r6:82b44000 r5:834e5600 r4:83f4d540 | [<80528e7c>] (refcount_warn_saturate) from [<8079a4c8>] (__refcount_add.constprop.0+0x4c/0x50) | [<8079a47c>] (__refcount_add.constprop.0) from [<8079a57c>] (can_put_echo_skb+0xb0/0x13c) | [<8079a4cc>] (can_put_echo_skb) from [<8079ba98>] (flexcan_start_xmit+0x1c4/0x230) r9:00000010 r8:83f48610 r7:0fdc0000 r6:0c080000 r5:82b44000 r4:834e5600 | [<8079b8d4>] (flexcan_start_xmit) from [<80969078>] (netdev_start_xmit+0x44/0x70) r9:814c0ba0 
r8:80c8790c r7:00000000 r6:834e5600 r5:82b44000 r4:82ab1f00 | [<80969034>] (netdev_start_xmit) from [<809725a4>] (dev_hard_start_xmit+0x19c/0x318) r9:814c0ba0 r8:00000000 r7:82ab1f00 r6:82b44000 r5:00000000 r4:834e5600 | [<80972408>] (dev_hard_start_xmit) from [<809c6584>] (sch_direct_xmit+0xcc/0x264) r10:834e5600 r9:00000000 r8:00000000 r7:82b44000 r6:82ab1f00 r5:834e5600 r4:83f27400 | [<809c64b8>] (sch_direct_xmit) from [<809c6c0c>] (__qdisc_run+0x4f0/0x534) To fix this problem, only set skb ownership to sockets which have still a ref count > 0. Fixes: 0ae89beb283a ("can: add destructor for self generated skbs") Cc: Oliver Hartkopp Cc: Andre Naujoks Link: https://lore.kernel.org/r/20210226092456.27126-1-o.rempel@pengutronix.de Suggested-by: Eric Dumazet Signed-off-by: Oleksij Rempel Reviewed-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 685f34cfba20..d438eb058069 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -65,8 +65,12 @@ static inline void can_skb_reserve(struct sk_buff *skb) static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) { - if (sk) { - sock_hold(sk); + /* If the socket has already been closed by user space, the + * refcount may already be 0 (and the socket will be freed + * after the last TX skb has been freed). So only increase + * socket refcount if the refcount is > 0. + */ + if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb->destructor = sock_efree; skb->sk = sk; } From 2712625200ed69c642b9abc3a403830c4643364c Mon Sep 17 00:00:00 2001 From: Torin Cooper-Bennun Date: Fri, 26 Feb 2021 16:34:41 +0000 Subject: [PATCH 053/164] can: tcan4x5x: tcan4x5x_init(): fix initialization - clear MRAM before entering Normal Mode This patch prevents a potentially destructive race condition. 
The device is fully operational on the bus after entering Normal Mode, so zeroing the MRAM after entering this mode may lead to loss of information, e.g. newly received messages. This patch fixes the problem by first initializing the MRAM, then bringing the device into Normal Mode. Fixes: 5443c226ba91 ("can: tcan4x5x: Add tcan4x5x driver to the kernel") Link: https://lore.kernel.org/r/20210226163440.313628-1-torin@maxiluxsystems.com Suggested-by: Marc Kleine-Budde Signed-off-by: Torin Cooper-Bennun Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/tcan4x5x-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/m_can/tcan4x5x-core.c b/drivers/net/can/m_can/tcan4x5x-core.c index b7caec769ddb..4147cecfbbd6 100644 --- a/drivers/net/can/m_can/tcan4x5x-core.c +++ b/drivers/net/can/m_can/tcan4x5x-core.c @@ -237,14 +237,14 @@ static int tcan4x5x_init(struct m_can_classdev *cdev) if (ret) return ret; + /* Zero out the MCAN buffers */ + m_can_init_ram(cdev); + ret = regmap_update_bits(tcan4x5x->regmap, TCAN4X5X_CONFIG, TCAN4X5X_MODE_SEL_MASK, TCAN4X5X_MODE_NORMAL); if (ret) return ret; - /* Zero out the MCAN buffers */ - m_can_init_ram(cdev); - return ret; } From 73f476aa1975bae6a792b340f5b26ffcfba869a6 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Fri, 26 Feb 2021 17:30:20 +0200 Subject: [PATCH 054/164] net: phy: ti: take into account all possible interrupt sources The previous implementation of .handle_interrupt() did not take into account the fact that all the interrupt status registers should be acknowledged since multiple interrupt sources could be asserted. Fix this by reading all the status registers before exiting with IRQ_NONE or triggering the PHY state machine. 
Fixes: 1d1ae3c6ca3f ("net: phy: ti: implement generic .handle_interrupt() callback") Reported-by: Sven Schuchmann Signed-off-by: Ioana Ciornei Link: https://lore.kernel.org/r/20210226153020.867852-1-ciorneiioana@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 9 +++++---- drivers/net/phy/dp83tc811.c | 11 ++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index be1224b4447b..f7a2ec150e54 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -290,6 +290,7 @@ static int dp83822_config_intr(struct phy_device *phydev) static irqreturn_t dp83822_handle_interrupt(struct phy_device *phydev) { + bool trigger_machine = false; int irq_status; /* The MISR1 and MISR2 registers are holding the interrupt status in @@ -305,7 +306,7 @@ static irqreturn_t dp83822_handle_interrupt(struct phy_device *phydev) return IRQ_NONE; } if (irq_status & ((irq_status & GENMASK(7, 0)) << 8)) - goto trigger_machine; + trigger_machine = true; irq_status = phy_read(phydev, MII_DP83822_MISR2); if (irq_status < 0) { @@ -313,11 +314,11 @@ static irqreturn_t dp83822_handle_interrupt(struct phy_device *phydev) return IRQ_NONE; } if (irq_status & ((irq_status & GENMASK(7, 0)) << 8)) - goto trigger_machine; + trigger_machine = true; - return IRQ_NONE; + if (!trigger_machine) + return IRQ_NONE; -trigger_machine: phy_trigger_machine(phydev); return IRQ_HANDLED; diff --git a/drivers/net/phy/dp83tc811.c b/drivers/net/phy/dp83tc811.c index 688fadffb249..7ea32fb77190 100644 --- a/drivers/net/phy/dp83tc811.c +++ b/drivers/net/phy/dp83tc811.c @@ -264,6 +264,7 @@ static int dp83811_config_intr(struct phy_device *phydev) static irqreturn_t dp83811_handle_interrupt(struct phy_device *phydev) { + bool trigger_machine = false; int irq_status; /* The INT_STAT registers 1, 2 and 3 are holding the interrupt status @@ -279,7 +280,7 @@ static irqreturn_t dp83811_handle_interrupt(struct phy_device *phydev) 
return IRQ_NONE; } if (irq_status & ((irq_status & GENMASK(7, 0)) << 8)) - goto trigger_machine; + trigger_machine = true; irq_status = phy_read(phydev, MII_DP83811_INT_STAT2); if (irq_status < 0) { @@ -287,7 +288,7 @@ static irqreturn_t dp83811_handle_interrupt(struct phy_device *phydev) return IRQ_NONE; } if (irq_status & ((irq_status & GENMASK(7, 0)) << 8)) - goto trigger_machine; + trigger_machine = true; irq_status = phy_read(phydev, MII_DP83811_INT_STAT3); if (irq_status < 0) { @@ -295,11 +296,11 @@ static irqreturn_t dp83811_handle_interrupt(struct phy_device *phydev) return IRQ_NONE; } if (irq_status & ((irq_status & GENMASK(7, 0)) << 8)) - goto trigger_machine; + trigger_machine = true; - return IRQ_NONE; + if (!trigger_machine) + return IRQ_NONE; -trigger_machine: phy_trigger_machine(phydev); return IRQ_HANDLED; From 826d82170b539f16e1955ab940222543c012044e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 25 Feb 2021 16:39:01 +0100 Subject: [PATCH 055/164] xen-netback: use local var in xenvif_tx_check_gop() instead of re-calculating shinfo already holds the result of skb_shinfo(skb) at this point - no need to re-invoke the construct even twice. Signed-off-by: Jan Beulich Signed-off-by: David S. 
Miller --- drivers/net/xen-netback/netback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index e5c73f819662..aff5ac1f002a 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -557,8 +557,8 @@ check_frags: } if (skb_has_frag_list(skb) && !first_shinfo) { - first_shinfo = skb_shinfo(skb); - shinfo = skb_shinfo(skb_shinfo(skb)->frag_list); + first_shinfo = shinfo; + shinfo = skb_shinfo(shinfo->frag_list); nr_frags = shinfo->nr_frags; goto check_frags; From 9eb8bc593a5eed167dac2029abef343854c5ba75 Mon Sep 17 00:00:00 2001 From: DENG Qingfang Date: Mon, 1 Mar 2021 01:08:23 +0800 Subject: [PATCH 056/164] net: dsa: tag_rtl4_a: fix egress tags Commit 86dd9868b878 has several issues, but was accepted too soon before anyone could take a look. - Double free. dsa_slave_xmit() will free the skb if the xmit function returns NULL, but the skb is already freed by eth_skb_pad(). Use __skb_put_padto() to avoid that. - Unnecessary allocation. It has been done by DSA core since commit a3b0b6479700. - A u16 pointer points to skb data. It should be __be16 for network byte order. - Typo in comments. "numer" -> "number". Fixes: 86dd9868b878 ("net: dsa: tag_rtl4_a: Support also egress tags") Signed-off-by: DENG Qingfang Reviewed-by: Florian Fainelli Reviewed-by: Linus Walleij Signed-off-by: David S. 
Miller --- net/dsa/tag_rtl4_a.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index c17d39b4a1a0..e9176475bac8 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -35,14 +35,12 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); + __be16 *p; u8 *tag; - u16 *p; u16 out; /* Pad out to at least 60 bytes */ - if (unlikely(eth_skb_pad(skb))) - return NULL; - if (skb_cow_head(skb, RTL4_A_HDR_LEN) < 0) + if (unlikely(__skb_put_padto(skb, ETH_ZLEN, false))) return NULL; netdev_dbg(dev, "add realtek tag to package to port %d\n", @@ -53,13 +51,13 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, tag = skb->data + 2 * ETH_ALEN; /* Set Ethertype */ - p = (u16 *)tag; + p = (__be16 *)tag; *p = htons(RTL4_A_ETHERTYPE); out = (RTL4_A_PROTOCOL_RTL8366RB << 12) | (2 << 8); - /* The lower bits is the port numer */ + /* The lower bits is the port number */ out |= (u8)dp->index; - p = (u16 *)(tag + 2); + p = (__be16 *)(tag + 2); *p = htons(out); return skb; From 4372339efc06bc2a796f4cc9d0a7a929dfda4967 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 27 Feb 2021 01:40:19 +0100 Subject: [PATCH 057/164] net: always use icmp{,v6}_ndo_send from ndo_start_xmit There were a few remaining tunnel drivers that didn't receive the prior conversion to icmp{,v6}_ndo_send. Knowing now that this could lead to memory corruption (see ee576c47db60 ("net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before sending") for details), it is even more imperative to have these all converted. So this commit goes through the remaining cases that I could find and does a boring translation to the ndo variety. The Fixes: line below is the merge that originally added icmp{,v6}_ndo_send and converted the first batch of icmp{,v6}_send users. The rationale then for the change applies equally to this patch. 
It's just that these drivers were left out of the initial conversion because these network devices are hiding in net/ rather than in drivers/net/. Cc: Florian Westphal Cc: Willem de Bruijn Cc: David S. Miller Cc: Hideaki YOSHIFUJI Cc: David Ahern Cc: Jakub Kicinski Cc: Steffen Klassert Fixes: 803381f9f117 ("Merge branch 'icmp-account-for-NAT-when-sending-icmps-from-ndo-layer'") Signed-off-by: Jason A. Donenfeld Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 5 ++--- net/ipv4/ip_vti.c | 6 +++--- net/ipv6/ip6_gre.c | 16 ++++++++-------- net/ipv6/ip6_tunnel.c | 10 +++++----- net/ipv6/ip6_vti.c | 6 +++--- net/ipv6/sit.c | 2 +- 6 files changed, 22 insertions(+), 23 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 76a420c76f16..f6cc26de5ed3 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -502,8 +502,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, if (!skb_is_gso(skb) && (inner_iph->frag_off & htons(IP_DF)) && mtu < pkt_size) { - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -E2BIG; } } @@ -527,7 +526,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && mtu < pkt_size) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -E2BIG; } } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index abc171e79d3e..eb207089ece0 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -238,13 +238,13 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); 
} else { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } dst_release(dst); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c3bc89b6b1a1..1baf43aacb2e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -678,8 +678,8 @@ static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); + icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); return -1; } *encap_limit = tel->encap_limit - 1; @@ -805,8 +805,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); return -1; } @@ -837,7 +837,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) &mtu, skb->protocol); if (err != 0) { if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -1; } @@ -1063,10 +1063,10 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) { if (skb->protocol == htons(ETH_P_IP)) - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_FRAG_NEEDED, htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); else - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } goto tx_err; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a7950baa05e5..3fa0eca5a06f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1332,8 +1332,8 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); + icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; @@ -1385,11 +1385,11 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_FRAG_NEEDED, htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0225fd694192..f10e7a72ea62 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -521,10 +521,10 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } else { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); } err = -EMSGSIZE; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 93636867aee2..63ccd9f2dccc 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -987,7 +987,7 @@ static netdev_tx_t 
ipip6_tunnel_xmit(struct sk_buff *skb, skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; } From d9032dba5a2b2bbf0fdce67c8795300ec9923b43 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Sat, 27 Feb 2021 11:05:58 +0800 Subject: [PATCH 058/164] net: phy: fix save wrong speed and duplex problem if autoneg is on If phy uses generic driver and autoneg is on, enter command "ethtool -s eth0 speed 50" will not change phy speed actually, but command "ethtool eth0" shows speed is 50Mb/s because phydev->speed has been set to 50 and no update later. And duplex setting has same problem too. However, if autoneg is on, phy only changes speed and duplex according to phydev->advertising, but not phydev->speed and phydev->duplex. So in this case, phydev->speed and phydev->duplex don't need to be set in function phy_ethtool_ksettings_set() if autoneg is on. Fixes: 51e2a3846eab ("PHY: Avoid unnecessary aneg restarts") Signed-off-by: Guangbin Huang Signed-off-by: Huazhong Tan Signed-off-by: David S. 
Miller --- drivers/net/phy/phy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 1be07e45d314..fc2e7cb5b2e5 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -276,14 +276,16 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->autoneg = autoneg; - phydev->speed = speed; + if (autoneg == AUTONEG_DISABLE) { + phydev->speed = speed; + phydev->duplex = duplex; + } linkmode_copy(phydev->advertising, advertising); linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, phydev->advertising, autoneg == AUTONEG_ENABLE); - phydev->duplex = duplex; phydev->master_slave_set = cmd->base.master_slave_cfg; phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; From 4deb550bc3b698a1f03d0332cde3df154d1b6c1e Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sat, 27 Feb 2021 16:15:06 -0500 Subject: [PATCH 059/164] atm: eni: dont release is never initialized label err_eni_release is reachable when eni_start() fail. In eni_start() it calls dev->phy->start() in the last step, if start() fail we don't need to call phy->stop(), if start() is never called, we neither need to call phy->stop(), otherwise null-ptr-deref will happen. In order to fix this issue, don't call phy->stop() in label err_eni_release [ 4.875714] ================================================================== [ 4.876091] BUG: KASAN: null-ptr-deref in suni_stop+0x47/0x100 [suni] [ 4.876433] Read of size 8 at addr 0000000000000030 by task modprobe/95 [ 4.876778] [ 4.876862] CPU: 0 PID: 95 Comm: modprobe Not tainted 5.11.0-rc7-00090-gdcc0b49040c7 #2 [ 4.877290] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd94 [ 4.877876] Call Trace: [ 4.878009] dump_stack+0x7d/0xa3 [ 4.878191] kasan_report.cold+0x10c/0x10e [ 4.878410] ? __slab_free+0x2f0/0x340 [ 4.878612] ? 
suni_stop+0x47/0x100 [suni] [ 4.878832] suni_stop+0x47/0x100 [suni] [ 4.879043] eni_do_release+0x3b/0x70 [eni] [ 4.879269] eni_init_one.cold+0x1152/0x1747 [eni] [ 4.879528] ? _raw_spin_lock_irqsave+0x7b/0xd0 [ 4.879768] ? eni_ioctl+0x270/0x270 [eni] [ 4.879990] ? __mutex_lock_slowpath+0x10/0x10 [ 4.880226] ? eni_ioctl+0x270/0x270 [eni] [ 4.880448] local_pci_probe+0x6f/0xb0 [ 4.880650] pci_device_probe+0x171/0x240 [ 4.880864] ? pci_device_remove+0xe0/0xe0 [ 4.881086] ? kernfs_create_link+0xb6/0x110 [ 4.881315] ? sysfs_do_create_link_sd.isra.0+0x76/0xe0 [ 4.881594] really_probe+0x161/0x420 [ 4.881791] driver_probe_device+0x6d/0xd0 [ 4.882010] device_driver_attach+0x82/0x90 [ 4.882233] ? device_driver_attach+0x90/0x90 [ 4.882465] __driver_attach+0x60/0x100 [ 4.882671] ? device_driver_attach+0x90/0x90 [ 4.882903] bus_for_each_dev+0xe1/0x140 [ 4.883114] ? subsys_dev_iter_exit+0x10/0x10 [ 4.883346] ? klist_node_init+0x61/0x80 [ 4.883557] bus_add_driver+0x254/0x2a0 [ 4.883764] driver_register+0xd3/0x150 [ 4.883971] ? 0xffffffffc0038000 [ 4.884149] do_one_initcall+0x84/0x250 [ 4.884355] ? trace_event_raw_event_initcall_finish+0x150/0x150 [ 4.884674] ? unpoison_range+0xf/0x30 [ 4.884875] ? ____kasan_kmalloc.constprop.0+0x84/0xa0 [ 4.885150] ? unpoison_range+0xf/0x30 [ 4.885352] ? unpoison_range+0xf/0x30 [ 4.885557] do_init_module+0xf8/0x350 [ 4.885760] load_module+0x3fe6/0x4340 [ 4.885960] ? vm_unmap_ram+0x1d0/0x1d0 [ 4.886166] ? ____kasan_kmalloc.constprop.0+0x84/0xa0 [ 4.886441] ? module_frob_arch_sections+0x20/0x20 [ 4.886697] ? __do_sys_finit_module+0x108/0x170 [ 4.886941] __do_sys_finit_module+0x108/0x170 [ 4.887178] ? __ia32_sys_init_module+0x40/0x40 [ 4.887419] ? file_open_root+0x200/0x200 [ 4.887634] ? do_sys_open+0x85/0xe0 [ 4.887826] ? filp_open+0x50/0x50 [ 4.888009] ? fpregs_assert_state_consistent+0x4d/0x60 [ 4.888287] ? 
exit_to_user_mode_prepare+0x2f/0x130 [ 4.888547] do_syscall_64+0x33/0x40 [ 4.888739] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 4.889010] RIP: 0033:0x7ff62fcf1cf7 [ 4.889202] Code: 48 89 57 30 48 8b 04 24 48 89 47 38 e9 1d a0 02 00 48 89 f8 48 89 f71 [ 4.890172] RSP: 002b:00007ffe6644ade8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 4.890570] RAX: ffffffffffffffda RBX: 0000000000f2ca70 RCX: 00007ff62fcf1cf7 [ 4.890944] RDX: 0000000000000000 RSI: 0000000000f2b9e0 RDI: 0000000000000003 [ 4.891318] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000001 [ 4.891691] R10: 00007ff62fd55300 R11: 0000000000000246 R12: 0000000000f2b9e0 [ 4.892064] R13: 0000000000000000 R14: 0000000000f2bdd0 R15: 0000000000000001 [ 4.892439] ================================================================== Signed-off-by: Tong Zhang Signed-off-by: David S. Miller --- drivers/atm/eni.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 316a9947541f..b574cce98dc3 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -2260,7 +2260,8 @@ out: return rc; err_eni_release: - eni_do_release(dev); + dev->phy = NULL; + iounmap(ENI_DEV(dev)->ioaddr); err_unregister: atm_dev_deregister(dev); err_free_consistent: From a2bd45834e83d6c5a04d397bde13d744a4812dfc Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sat, 27 Feb 2021 22:55:50 -0500 Subject: [PATCH 060/164] atm: lanai: dont run lanai_dev_close if not open lanai_dev_open() can fail. When it fail, lanai->base is unmapped and the pci device is disabled. The caller, lanai_init_one(), then tries to run atm_dev_deregister(). This will subsequently call lanai_dev_close() and use the already released MMIO area. To fix this issue, set the lanai->base to NULL if open fail, and test the flag in lanai_dev_close(). 
[ 8.324153] lanai: lanai_start() failed, err=19 [ 8.324819] lanai(itf 0): shutting down interface [ 8.325211] BUG: unable to handle page fault for address: ffffc90000180024 [ 8.325781] #PF: supervisor write access in kernel mode [ 8.326215] #PF: error_code(0x0002) - not-present page [ 8.326641] PGD 100000067 P4D 100000067 PUD 100139067 PMD 10013a067 PTE 0 [ 8.327206] Oops: 0002 [#1] SMP KASAN NOPTI [ 8.327557] CPU: 0 PID: 95 Comm: modprobe Not tainted 5.11.0-rc7-00090-gdcc0b49040c7 #12 [ 8.328229] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-4 [ 8.329145] RIP: 0010:lanai_dev_close+0x4f/0xe5 [lanai] [ 8.329587] Code: 00 48 c7 c7 00 d3 01 c0 e8 49 4e 0a c2 48 8d bd 08 02 00 00 e8 6e 52 14 c1 48 80 [ 8.330917] RSP: 0018:ffff8881029ef680 EFLAGS: 00010246 [ 8.331196] RAX: 000000000003fffe RBX: ffff888102fb4800 RCX: ffffffffc001a98a [ 8.331572] RDX: ffffc90000180000 RSI: 0000000000000246 RDI: ffff888102fb4000 [ 8.331948] RBP: ffff888102fb4000 R08: ffffffff8115da8a R09: ffffed102053deaa [ 8.332326] R10: 0000000000000003 R11: ffffed102053dea9 R12: ffff888102fb48a4 [ 8.332701] R13: ffffffffc00123c0 R14: ffff888102fb4b90 R15: ffff888102fb4b88 [ 8.333077] FS: 00007f08eb9056a0(0000) GS:ffff88815b400000(0000) knlGS:0000000000000000 [ 8.333502] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8.333806] CR2: ffffc90000180024 CR3: 0000000102a28000 CR4: 00000000000006f0 [ 8.334182] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 8.334557] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 8.334932] Call Trace: [ 8.335066] atm_dev_deregister+0x161/0x1a0 [atm] [ 8.335324] lanai_init_one.cold+0x20c/0x96d [lanai] [ 8.335594] ? lanai_send+0x2a0/0x2a0 [lanai] [ 8.335831] local_pci_probe+0x6f/0xb0 [ 8.336039] pci_device_probe+0x171/0x240 [ 8.336255] ? pci_device_remove+0xe0/0xe0 [ 8.336475] ? kernfs_create_link+0xb6/0x110 [ 8.336704] ? 
sysfs_do_create_link_sd.isra.0+0x76/0xe0 [ 8.336983] really_probe+0x161/0x420 [ 8.337181] driver_probe_device+0x6d/0xd0 [ 8.337401] device_driver_attach+0x82/0x90 [ 8.337626] ? device_driver_attach+0x90/0x90 [ 8.337859] __driver_attach+0x60/0x100 [ 8.338065] ? device_driver_attach+0x90/0x90 [ 8.338298] bus_for_each_dev+0xe1/0x140 [ 8.338511] ? subsys_dev_iter_exit+0x10/0x10 [ 8.338745] ? klist_node_init+0x61/0x80 [ 8.338956] bus_add_driver+0x254/0x2a0 [ 8.339164] driver_register+0xd3/0x150 [ 8.339370] ? 0xffffffffc0028000 [ 8.339550] do_one_initcall+0x84/0x250 [ 8.339755] ? trace_event_raw_event_initcall_finish+0x150/0x150 [ 8.340076] ? free_vmap_area_noflush+0x1a5/0x5c0 [ 8.340329] ? unpoison_range+0xf/0x30 [ 8.340532] ? ____kasan_kmalloc.constprop.0+0x84/0xa0 [ 8.340806] ? unpoison_range+0xf/0x30 [ 8.341014] ? unpoison_range+0xf/0x30 [ 8.341217] do_init_module+0xf8/0x350 [ 8.341419] load_module+0x3fe6/0x4340 [ 8.341621] ? vm_unmap_ram+0x1d0/0x1d0 [ 8.341826] ? ____kasan_kmalloc.constprop.0+0x84/0xa0 [ 8.342101] ? module_frob_arch_sections+0x20/0x20 [ 8.342358] ? __do_sys_finit_module+0x108/0x170 [ 8.342604] __do_sys_finit_module+0x108/0x170 [ 8.342841] ? __ia32_sys_init_module+0x40/0x40 [ 8.343083] ? file_open_root+0x200/0x200 [ 8.343298] ? do_sys_open+0x85/0xe0 [ 8.343491] ? filp_open+0x50/0x50 [ 8.343675] ? 
exit_to_user_mode_prepare+0xfc/0x130 [ 8.343935] do_syscall_64+0x33/0x40 [ 8.344132] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 8.344401] RIP: 0033:0x7f08eb887cf7 [ 8.344594] Code: 48 89 57 30 48 8b 04 24 48 89 47 38 e9 1d a0 02 00 48 89 f8 48 89 f7 48 89 d6 41 [ 8.345565] RSP: 002b:00007ffcd5c98ad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 8.345962] RAX: ffffffffffffffda RBX: 00000000008fea70 RCX: 00007f08eb887cf7 [ 8.346336] RDX: 0000000000000000 RSI: 00000000008fd9e0 RDI: 0000000000000003 [ 8.346711] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000001 [ 8.347085] R10: 00007f08eb8eb300 R11: 0000000000000246 R12: 00000000008fd9e0 [ 8.347460] R13: 0000000000000000 R14: 00000000008fddd0 R15: 0000000000000001 [ 8.347836] Modules linked in: lanai(+) atm [ 8.348065] CR2: ffffc90000180024 [ 8.348244] ---[ end trace 7fdc1c668f2003e5 ]--- [ 8.348490] RIP: 0010:lanai_dev_close+0x4f/0xe5 [lanai] [ 8.348772] Code: 00 48 c7 c7 00 d3 01 c0 e8 49 4e 0a c2 48 8d bd 08 02 00 00 e8 6e 52 14 c1 48 80 [ 8.349745] RSP: 0018:ffff8881029ef680 EFLAGS: 00010246 [ 8.350022] RAX: 000000000003fffe RBX: ffff888102fb4800 RCX: ffffffffc001a98a [ 8.350397] RDX: ffffc90000180000 RSI: 0000000000000246 RDI: ffff888102fb4000 [ 8.350772] RBP: ffff888102fb4000 R08: ffffffff8115da8a R09: ffffed102053deaa [ 8.351151] R10: 0000000000000003 R11: ffffed102053dea9 R12: ffff888102fb48a4 [ 8.351525] R13: ffffffffc00123c0 R14: ffff888102fb4b90 R15: ffff888102fb4b88 [ 8.351918] FS: 00007f08eb9056a0(0000) GS:ffff88815b400000(0000) knlGS:0000000000000000 [ 8.352343] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8.352647] CR2: ffffc90000180024 CR3: 0000000102a28000 CR4: 00000000000006f0 [ 8.353022] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 8.353397] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 8.353958] modprobe (95) used greatest stack depth: 26216 bytes left Signed-off-by: Tong Zhang Signed-off-by: David S. 
Miller --- drivers/atm/lanai.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c index d7277c26e423..32d7aa141d96 100644 --- a/drivers/atm/lanai.c +++ b/drivers/atm/lanai.c @@ -2233,6 +2233,7 @@ static int lanai_dev_open(struct atm_dev *atmdev) conf1_write(lanai); #endif iounmap(lanai->base); + lanai->base = NULL; error_pci: pci_disable_device(lanai->pci); error: @@ -2245,6 +2246,8 @@ static int lanai_dev_open(struct atm_dev *atmdev) static void lanai_dev_close(struct atm_dev *atmdev) { struct lanai_dev *lanai = (struct lanai_dev *) atmdev->dev_data; + if (lanai->base==NULL) + return; printk(KERN_INFO DEV_LABEL "(itf %d): shutting down interface\n", lanai->number); lanai_timed_poll_stop(lanai); @@ -2552,7 +2555,7 @@ static int lanai_init_one(struct pci_dev *pci, struct atm_dev *atmdev; int result; - lanai = kmalloc(sizeof(*lanai), GFP_KERNEL); + lanai = kzalloc(sizeof(*lanai), GFP_KERNEL); if (lanai == NULL) { printk(KERN_ERR DEV_LABEL ": couldn't allocate dev_data structure!\n"); From 8c91bc3d44dfef8284af384877fbe61117e8b7d1 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sun, 28 Feb 2021 23:25:43 +0300 Subject: [PATCH 061/164] sh_eth: fix TRSCER mask for SH771x According to the SH7710, SH7712, SH7713 Group User's Manual: Hardware, Rev. 3.00, the TRSCER register actually has only bit 7 valid (and named differently), with all the other bits reserved. Apparently, this was not the case with some early revisions of the manual as we have the other bits declared (and set) in the original driver. Follow the suit and add the explicit sh_eth_cpu_data::trscer_err_mask initializer for SH771x... Fixes: 86a74ff21a7a ("net: sh_eth: add support for Renesas SuperH Ethernet") Signed-off-by: Sergey Shtylyov Signed-off-by: David S. 
Miller --- drivers/net/ethernet/renesas/sh_eth.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 590b088bc4c7..e79bb0a3ced5 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -1089,6 +1089,9 @@ static struct sh_eth_cpu_data sh771x_data = { EESIPR_CEEFIP | EESIPR_CELFIP | EESIPR_RRFIP | EESIPR_RTLFIP | EESIPR_RTSFIP | EESIPR_PREIP | EESIPR_CERFIP, + + .trscer_err_mask = DESC_I_RINT8, + .tsu = 1, .dual_port = 1, }; From 75be7fb7f978202c4c3a1a713af4485afb2ff5f6 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sun, 28 Feb 2021 23:26:34 +0300 Subject: [PATCH 062/164] sh_eth: fix TRSCER mask for R7S72100 According to the RZ/A1H Group, RZ/A1M Group User's Manual: Hardware, Rev. 4.00, the TRSCER register has bit 9 reserved, hence we can't use the driver's default TRSCER mask. Add the explicit initializer for sh_eth_cpu_data::trscer_err_mask for R7S72100. Fixes: db893473d313 ("sh_eth: Add support for r7s72100") Signed-off-by: Sergey Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index e79bb0a3ced5..7f14d4aa5b3e 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -560,6 +560,8 @@ static struct sh_eth_cpu_data r7s72100_data = { EESR_TDE, .fdr_value = 0x0000070f, + .trscer_err_mask = DESC_I_RINT8 | DESC_I_RINT5, + .no_psr = 1, .apr = 1, .mpr = 1, From 165bc5a4f30eee4735845aa7dbd6b738643f2603 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sun, 28 Feb 2021 23:27:32 +0300 Subject: [PATCH 063/164] sh_eth: fix TRSCER mask for R7S9210 According to the RZ/A2M Group User's Manual: Hardware, Rev. 2.00, the TRSCER register has bit 9 reserved, hence we can't use the driver's default TRSCER mask. 
Add the explicit initializer for sh_eth_cpu_data:: trscer_err_mask for R7S9210. Fixes: 6e0bb04d0e4f ("sh_eth: Add R7S9210 support") Signed-off-by: Sergey Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 7f14d4aa5b3e..f029c7c03804 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -782,6 +782,8 @@ static struct sh_eth_cpu_data r7s9210_data = { .fdr_value = 0x0000070f, + .trscer_err_mask = DESC_I_RINT8 | DESC_I_RINT5, + .apr = 1, .mpr = 1, .tpauser = 1, From 093b036aa94e01a0bea31a38d7f0ee28a2749023 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Mon, 1 Mar 2021 02:22:40 +0300 Subject: [PATCH 064/164] net/qrtr: fix __netdev_alloc_skb call syzbot found WARNING in __alloc_pages_nodemask()[1] when order >= MAX_ORDER. It was caused by a huge length value passed from userspace to qrtr_tun_write_iter(), which tries to allocate skb. Since the value comes from the untrusted source there is no need to raise a warning in __alloc_pages_nodemask(). 
[1] WARNING in __alloc_pages_nodemask+0x5f8/0x730 mm/page_alloc.c:5014 Call Trace: __alloc_pages include/linux/gfp.h:511 [inline] __alloc_pages_node include/linux/gfp.h:524 [inline] alloc_pages_node include/linux/gfp.h:538 [inline] kmalloc_large_node+0x60/0x110 mm/slub.c:3999 __kmalloc_node_track_caller+0x319/0x3f0 mm/slub.c:4496 __kmalloc_reserve net/core/skbuff.c:150 [inline] __alloc_skb+0x4e4/0x5a0 net/core/skbuff.c:210 __netdev_alloc_skb+0x70/0x400 net/core/skbuff.c:446 netdev_alloc_skb include/linux/skbuff.h:2832 [inline] qrtr_endpoint_post+0x84/0x11b0 net/qrtr/qrtr.c:442 qrtr_tun_write_iter+0x11f/0x1a0 net/qrtr/tun.c:98 call_write_iter include/linux/fs.h:1901 [inline] new_sync_write+0x426/0x650 fs/read_write.c:518 vfs_write+0x791/0xa30 fs/read_write.c:605 ksys_write+0x12d/0x250 fs/read_write.c:658 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported-by: syzbot+80dccaee7c6630fa9dcf@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Acked-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index b34358282f37..82d2eb8c21d1 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -439,7 +439,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (len == 0 || len & 3) return -EINVAL; - skb = netdev_alloc_skb(NULL, len); + skb = __netdev_alloc_skb(NULL, len, GFP_ATOMIC | __GFP_NOWARN); if (!skb) return -ENOMEM; From 8bd2a05527349c8627d2b9795d3c7a6f76033676 Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Mon, 1 Mar 2021 14:05:48 +0800 Subject: [PATCH 065/164] inetpeer: use div64_ul() and clamp_val() calculate inet_peer_threshold In inet_initpeers(), struct inet_peer on IA32 uses 128 bytes nowadays. Get rid of the cascade and use a div64_ul() and clamp_val() calculation that will not need to be adjusted in the future, as suggested by Eric Dumazet. 
Suggested-by: Eric Dumazet Signed-off-by: Yejune Deng Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index ff327a62c9ce..da21dfce24d7 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -65,7 +65,7 @@ EXPORT_SYMBOL_GPL(inet_peer_base_init); #define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4. */ -int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more +int inet_peer_threshold __read_mostly; /* start to throw entries more * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ @@ -73,20 +73,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { - struct sysinfo si; + u64 nr_entries; - /* Use the straight interface to information about memory. */ - si_meminfo(&si); - /* The values below were suggested by Alexey Kuznetsov - * . I don't have any opinion about the values - * myself. 
--SAW - */ - if (si.totalram <= (32768*1024)/PAGE_SIZE) - inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */ - if (si.totalram <= (16384*1024)/PAGE_SIZE) - inet_peer_threshold >>= 1; /* about 512KB */ - if (si.totalram <= (8192*1024)/PAGE_SIZE) - inet_peer_threshold >>= 2; /* about 128KB */ + /* 1% of physical memory */ + nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT, + 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer))); + + inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); peer_cachep = kmem_cache_create("inet_peer_cache", sizeof(struct inet_peer), From c646d10dda2dcde82c6ce5a474522621ab2b8b19 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 1 Mar 2021 13:18:11 +0200 Subject: [PATCH 066/164] net: enetc: don't overwrite the RSS indirection table when initializing After the blamed patch, all RX traffic gets hashed to CPU 0 because the hashing indirection table set up in: enetc_pf_probe -> enetc_alloc_si_resources -> enetc_configure_si -> enetc_setup_default_rss_table is overwritten later in: enetc_pf_probe -> enetc_init_port_rss_memory which zero-initializes the entire port RSS table in order to avoid ECC errors. The trouble really is that enetc_init_port_rss_memory really needs enetc_alloc_si_resources to be called, because it depends upon enetc_alloc_cbdr and enetc_setup_cbdr. But that whole enetc_configure_si thing could have been better thought out, it has nothing to do in a function called "alloc_si_resources", especially since its counterpart, "free_si_resources", does nothing to unwind the configuration of the SI. The point is, we need to pull enetc_configure_si out of enetc_alloc_si_resources, and move it after enetc_init_port_rss_memory. This allows us to set up the default RSS indirection table after initializing the memory. Fixes: 07bf34a50e32 ("net: enetc: initialize the RFS and RSS memories") Cc: Jesse Brandeburg Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 11 +++-------- drivers/net/ethernet/freescale/enetc/enetc.h | 1 + drivers/net/ethernet/freescale/enetc/enetc_pf.c | 7 +++++++ drivers/net/ethernet/freescale/enetc/enetc_vf.c | 7 +++++++ 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index c78d12229730..fdb6b9e8da78 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1058,13 +1058,12 @@ static int enetc_setup_default_rss_table(struct enetc_si *si, int num_groups) return 0; } -static int enetc_configure_si(struct enetc_ndev_priv *priv) +int enetc_configure_si(struct enetc_ndev_priv *priv) { struct enetc_si *si = priv->si; struct enetc_hw *hw = &si->hw; int err; - enetc_setup_cbdr(hw, &si->cbd_ring); /* set SI cache attributes */ enetc_wr(hw, ENETC_SICAR0, ENETC_SICAR_RD_COHERENT | ENETC_SICAR_WR_COHERENT); @@ -1112,6 +1111,8 @@ int enetc_alloc_si_resources(struct enetc_ndev_priv *priv) if (err) return err; + enetc_setup_cbdr(&si->hw, &si->cbd_ring); + priv->cls_rules = kcalloc(si->num_fs_entries, sizeof(*priv->cls_rules), GFP_KERNEL); if (!priv->cls_rules) { @@ -1119,14 +1120,8 @@ int enetc_alloc_si_resources(struct enetc_ndev_priv *priv) goto err_alloc_cls; } - err = enetc_configure_si(priv); - if (err) - goto err_config_si; - return 0; -err_config_si: - kfree(priv->cls_rules); err_alloc_cls: enetc_clear_cbdr(&si->hw); enetc_free_cbdr(priv->dev, &si->cbd_ring); diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 8532d23b54f5..f8275cef3b5c 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -292,6 +292,7 @@ void enetc_get_si_caps(struct enetc_si *si); void enetc_init_si_rings_params(struct enetc_ndev_priv *priv); int enetc_alloc_si_resources(struct enetc_ndev_priv *priv); void 
enetc_free_si_resources(struct enetc_ndev_priv *priv); +int enetc_configure_si(struct enetc_ndev_priv *priv); int enetc_open(struct net_device *ndev); int enetc_close(struct net_device *ndev); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 515c5b29d7aa..d02ecb2e46ae 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -1108,6 +1108,12 @@ static int enetc_pf_probe(struct pci_dev *pdev, goto err_init_port_rss; } + err = enetc_configure_si(priv); + if (err) { + dev_err(&pdev->dev, "Failed to configure SI\n"); + goto err_config_si; + } + err = enetc_alloc_msix(priv); if (err) { dev_err(&pdev->dev, "MSIX alloc failed\n"); @@ -1136,6 +1142,7 @@ err_phylink_create: enetc_mdiobus_destroy(pf); err_mdiobus_create: enetc_free_msix(priv); +err_config_si: err_init_port_rss: err_init_port_rfs: err_alloc_msix: diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c index 39c1a09e69a9..9b755a84c2d6 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c @@ -171,6 +171,12 @@ static int enetc_vf_probe(struct pci_dev *pdev, goto err_alloc_si_res; } + err = enetc_configure_si(priv); + if (err) { + dev_err(&pdev->dev, "Failed to configure SI\n"); + goto err_config_si; + } + err = enetc_alloc_msix(priv); if (err) { dev_err(&pdev->dev, "MSIX alloc failed\n"); @@ -187,6 +193,7 @@ static int enetc_vf_probe(struct pci_dev *pdev, err_reg_netdev: enetc_free_msix(priv); +err_config_si: err_alloc_msix: enetc_free_si_resources(priv); err_alloc_si_res: From 3222b5b613db558e9a494bbf53f3c984d90f71ea Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 1 Mar 2021 13:18:12 +0200 Subject: [PATCH 067/164] net: enetc: initialize RFS/RSS memories for unused ports too Michael reports that since linux-next-20210211, the AER messages for ECC errors have 
started reappearing, and this time they can be reliably reproduced with the first ping on one of his LS1028A boards. $ ping 1[ 33.258069] pcieport 0000:00:1f.0: AER: Multiple Corrected error received: 0000:00:00.0 72.16.0.1 PING [ 33.267050] pcieport 0000:00:1f.0: AER: can't find device of ID0000 172.16.0.1 (172.16.0.1): 56 data bytes 64 bytes from 172.16.0.1: seq=0 ttl=64 time=17.124 ms 64 bytes from 172.16.0.1: seq=1 ttl=64 time=0.273 ms $ devmem 0x1f8010e10 32 0xC0000006 It isn't clear why this is necessary, but it seems that for the errors to go away, we must clear the entire RFS and RSS memory, not just for the ports in use. Sadly the code is structured in such a way that we can't have unified logic for the used and unused ports. For the minimal initialization of an unused port, we need just to enable and ioremap the PF memory space, and a control buffer descriptor ring. Unused ports must then free the CBDR because the driver will exit, but used ports can not pick up from where that code path left, since the CBDR API does not reinitialize a ring when setting it up, so its producer and consumer indices are out of sync between the software and hardware state. So a separate enetc_init_unused_port function was created, and it gets called right after the PF memory space is enabled. Fixes: 07bf34a50e32 ("net: enetc: initialize the RFS and RSS memories") Reported-by: Michael Walle Cc: Jesse Brandeburg Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 8 ++--- drivers/net/ethernet/freescale/enetc/enetc.h | 4 +++ .../net/ethernet/freescale/enetc/enetc_pf.c | 33 ++++++++++++++++--- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index fdb6b9e8da78..eb45830a1667 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -984,7 +984,7 @@ static void enetc_free_rxtx_rings(struct enetc_ndev_priv *priv) enetc_free_tx_ring(priv->tx_ring[i]); } -static int enetc_alloc_cbdr(struct device *dev, struct enetc_cbdr *cbdr) +int enetc_alloc_cbdr(struct device *dev, struct enetc_cbdr *cbdr) { int size = cbdr->bd_count * sizeof(struct enetc_cbd); @@ -1005,7 +1005,7 @@ static int enetc_alloc_cbdr(struct device *dev, struct enetc_cbdr *cbdr) return 0; } -static void enetc_free_cbdr(struct device *dev, struct enetc_cbdr *cbdr) +void enetc_free_cbdr(struct device *dev, struct enetc_cbdr *cbdr) { int size = cbdr->bd_count * sizeof(struct enetc_cbd); @@ -1013,7 +1013,7 @@ static void enetc_free_cbdr(struct device *dev, struct enetc_cbdr *cbdr) cbdr->bd_base = NULL; } -static void enetc_setup_cbdr(struct enetc_hw *hw, struct enetc_cbdr *cbdr) +void enetc_setup_cbdr(struct enetc_hw *hw, struct enetc_cbdr *cbdr) { /* set CBDR cache attributes */ enetc_wr(hw, ENETC_SICAR2, @@ -1033,7 +1033,7 @@ static void enetc_setup_cbdr(struct enetc_hw *hw, struct enetc_cbdr *cbdr) cbdr->cir = hw->reg + ENETC_SICBDRCIR; } -static void enetc_clear_cbdr(struct enetc_hw *hw) +void enetc_clear_cbdr(struct enetc_hw *hw) { enetc_wr(hw, ENETC_SICBDRMR, 0); } diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index f8275cef3b5c..8b380fc13314 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -310,6 +310,10 @@ int enetc_setup_tc(struct 
net_device *ndev, enum tc_setup_type type, void enetc_set_ethtool_ops(struct net_device *ndev); /* control buffer descriptor ring (CBDR) */ +int enetc_alloc_cbdr(struct device *dev, struct enetc_cbdr *cbdr); +void enetc_free_cbdr(struct device *dev, struct enetc_cbdr *cbdr); +void enetc_setup_cbdr(struct enetc_hw *hw, struct enetc_cbdr *cbdr); +void enetc_clear_cbdr(struct enetc_hw *hw); int enetc_set_mac_flt_entry(struct enetc_si *si, int index, char *mac_addr, int si_map); int enetc_clear_mac_flt_entry(struct enetc_si *si, int index); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index d02ecb2e46ae..62ba4bf56f0d 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -1041,6 +1041,26 @@ static int enetc_init_port_rss_memory(struct enetc_si *si) return err; } +static void enetc_init_unused_port(struct enetc_si *si) +{ + struct device *dev = &si->pdev->dev; + struct enetc_hw *hw = &si->hw; + int err; + + si->cbd_ring.bd_count = ENETC_CBDR_DEFAULT_SIZE; + err = enetc_alloc_cbdr(dev, &si->cbd_ring); + if (err) + return; + + enetc_setup_cbdr(hw, &si->cbd_ring); + + enetc_init_port_rfs_memory(si); + enetc_init_port_rss_memory(si); + + enetc_clear_cbdr(hw); + enetc_free_cbdr(dev, &si->cbd_ring); +} + static int enetc_pf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -1051,11 +1071,6 @@ static int enetc_pf_probe(struct pci_dev *pdev, struct enetc_pf *pf; int err; - if (node && !of_device_is_available(node)) { - dev_info(&pdev->dev, "device is disabled, skipping\n"); - return -ENODEV; - } - err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(*pf)); if (err) { dev_err(&pdev->dev, "PCI probing failed\n"); @@ -1069,6 +1084,13 @@ static int enetc_pf_probe(struct pci_dev *pdev, goto err_map_pf_space; } + if (node && !of_device_is_available(node)) { + enetc_init_unused_port(si); + dev_info(&pdev->dev, "device is disabled, skipping\n"); 
+ err = -ENODEV; + goto err_device_disabled; + } + pf = enetc_si_priv(si); pf->si = si; pf->total_vfs = pci_sriov_get_totalvfs(pdev); @@ -1151,6 +1173,7 @@ err_alloc_si_res: si->ndev = NULL; free_netdev(ndev); err_alloc_netdev: +err_device_disabled: err_map_pf_space: enetc_pci_remove(pdev); From 6d36ecdbc4410e61a0e02adc5d3abeee22a8ffd3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 1 Mar 2021 13:18:13 +0200 Subject: [PATCH 068/164] net: enetc: take the MDIO lock only once per NAPI poll cycle The workaround for the ENETC MDIO erratum caused a performance degradation of 82 Kpps (seen with IP forwarding of two 1Gbps streams of 64B packets). This is due to excessive locking and unlocking in the fast path, which can be avoided. By taking the MDIO read-side lock only once per NAPI poll cycle, we are able to regain 54 Kpps (65%) of the performance hit. The rest of the performance degradation comes from the TX data path, but unfortunately it doesn't look like we can optimize that away easily, even with netdev_xmit_more(), there just isn't any skb batching done, to help with taking the MDIO lock less often than once per packet. We need to change the register accessor type for enetc_get_tx_tstamp, because it now runs under the enetc_lock_mdio as per the new call path detailed below: enetc_msix -> napi_schedule -> enetc_poll -> enetc_lock_mdio -> enetc_clean_tx_ring -> enetc_get_tx_tstamp -> enetc_clean_rx_ring -> enetc_unlock_mdio Fixes: fd5736bf9f23 ("enetc: Workaround for MDIO register access issue") Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 31 ++++++------------- .../net/ethernet/freescale/enetc/enetc_hw.h | 2 ++ 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index eb45830a1667..9bcceb74fb9c 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -281,6 +281,8 @@ static int enetc_poll(struct napi_struct *napi, int budget) int work_done; int i; + enetc_lock_mdio(); + for (i = 0; i < v->count_tx_rings; i++) if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) complete = false; @@ -291,8 +293,10 @@ static int enetc_poll(struct napi_struct *napi, int budget) if (work_done) v->rx_napi_work = true; - if (!complete) + if (!complete) { + enetc_unlock_mdio(); return budget; + } napi_complete_done(napi, work_done); @@ -301,8 +305,6 @@ static int enetc_poll(struct napi_struct *napi, int budget) v->rx_napi_work = false; - enetc_lock_mdio(); - /* enable interrupts */ enetc_wr_reg_hot(v->rbier, ENETC_RBIER_RXTIE); @@ -327,8 +329,8 @@ static void enetc_get_tx_tstamp(struct enetc_hw *hw, union enetc_tx_bd *txbd, { u32 lo, hi, tstamp_lo; - lo = enetc_rd(hw, ENETC_SICTR0); - hi = enetc_rd(hw, ENETC_SICTR1); + lo = enetc_rd_hot(hw, ENETC_SICTR0); + hi = enetc_rd_hot(hw, ENETC_SICTR1); tstamp_lo = le32_to_cpu(txbd->wb.tstamp); if (lo <= tstamp_lo) hi -= 1; @@ -358,9 +360,7 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) i = tx_ring->next_to_clean; tx_swbd = &tx_ring->tx_swbd[i]; - enetc_lock_mdio(); bds_to_clean = enetc_bd_ready_count(tx_ring, i); - enetc_unlock_mdio(); do_tstamp = false; @@ -403,8 +403,6 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) tx_swbd = tx_ring->tx_swbd; } - enetc_lock_mdio(); - /* BD iteration loop end */ if (is_eof) { tx_frm_cnt++; @@ -415,8 +413,6 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int 
napi_budget) if (unlikely(!bds_to_clean)) bds_to_clean = enetc_bd_ready_count(tx_ring, i); - - enetc_unlock_mdio(); } tx_ring->next_to_clean = i; @@ -660,8 +656,6 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, u32 bd_status; u16 size; - enetc_lock_mdio(); - if (cleaned_cnt >= ENETC_RXBD_BUNDLE) { int count = enetc_refill_rx_ring(rx_ring, cleaned_cnt); @@ -672,19 +666,15 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, rxbd = enetc_rxbd(rx_ring, i); bd_status = le32_to_cpu(rxbd->r.lstatus); - if (!bd_status) { - enetc_unlock_mdio(); + if (!bd_status) break; - } enetc_wr_reg_hot(rx_ring->idr, BIT(rx_ring->index)); dma_rmb(); /* for reading other rxbd fields */ size = le16_to_cpu(rxbd->r.buf_len); skb = enetc_map_rx_buff_to_skb(rx_ring, i, size); - if (!skb) { - enetc_unlock_mdio(); + if (!skb) break; - } enetc_get_offloads(rx_ring, rxbd, skb); @@ -696,7 +686,6 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, if (unlikely(bd_status & ENETC_RXBD_LSTATUS(ENETC_RXBD_ERR_MASK))) { - enetc_unlock_mdio(); dev_kfree_skb(skb); while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { dma_rmb(); @@ -736,8 +725,6 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, enetc_process_skb(rx_ring, skb); - enetc_unlock_mdio(); - napi_gro_receive(napi, skb); rx_frm_cnt++; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index c71fe8d751d5..8b54562f5da6 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -453,6 +453,8 @@ static inline u64 _enetc_rd_reg64_wa(void __iomem *reg) #define enetc_wr_reg(reg, val) _enetc_wr_reg_wa((reg), (val)) #define enetc_rd(hw, off) enetc_rd_reg((hw)->reg + (off)) #define enetc_wr(hw, off, val) enetc_wr_reg((hw)->reg + (off), val) +#define enetc_rd_hot(hw, off) enetc_rd_reg_hot((hw)->reg + (off)) +#define enetc_wr_hot(hw, off, val) enetc_wr_reg_hot((hw)->reg + (off), val) #define enetc_rd64(hw, 
off) _enetc_rd_reg64_wa((hw)->reg + (off)) /* port register accessors - PF only */ #define enetc_port_rd(hw, off) enetc_rd_reg((hw)->port + (off)) From 827b6fd046516af605e190c872949f22208b5d41 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 1 Mar 2021 13:18:14 +0200 Subject: [PATCH 069/164] net: enetc: fix incorrect TPID when receiving 802.1ad tagged packets When the enetc ports have rx-vlan-offload enabled, they report a TPID of ETH_P_8021Q regardless of what was actually in the packet. When rx-vlan-offload is disabled, packets have the proper TPID. Fix this inconsistency by finishing the TODO left in the code. Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 34 ++++++++++++++----- .../net/ethernet/freescale/enetc/enetc_hw.h | 3 ++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 9bcceb74fb9c..8ddf0cdc37a5 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -523,9 +523,8 @@ static void enetc_get_rx_tstamp(struct net_device *ndev, static void enetc_get_offloads(struct enetc_bdr *rx_ring, union enetc_rx_bd *rxbd, struct sk_buff *skb) { -#ifdef CONFIG_FSL_ENETC_PTP_CLOCK struct enetc_ndev_priv *priv = netdev_priv(rx_ring->ndev); -#endif + /* TODO: hashing */ if (rx_ring->ndev->features & NETIF_F_RXCSUM) { u16 inet_csum = le16_to_cpu(rxbd->r.inet_csum); @@ -534,12 +533,31 @@ static void enetc_get_offloads(struct enetc_bdr *rx_ring, skb->ip_summed = CHECKSUM_COMPLETE; } - /* copy VLAN to skb, if one is extracted, for now we assume it's a - * standard TPID, but HW also supports custom values - */ - if (le16_to_cpu(rxbd->r.flags) & ENETC_RXBD_FLAG_VLAN) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), - le16_to_cpu(rxbd->r.vlan_opt)); + if 
(le16_to_cpu(rxbd->r.flags) & ENETC_RXBD_FLAG_VLAN) { + __be16 tpid = 0; + + switch (le16_to_cpu(rxbd->r.flags) & ENETC_RXBD_FLAG_TPID) { + case 0: + tpid = htons(ETH_P_8021Q); + break; + case 1: + tpid = htons(ETH_P_8021AD); + break; + case 2: + tpid = htons(enetc_port_rd(&priv->si->hw, + ENETC_PCVLANR1)); + break; + case 3: + tpid = htons(enetc_port_rd(&priv->si->hw, + ENETC_PCVLANR2)); + break; + default: + break; + } + + __vlan_hwaccel_put_tag(skb, tpid, le16_to_cpu(rxbd->r.vlan_opt)); + } + #ifdef CONFIG_FSL_ENETC_PTP_CLOCK if (priv->active_offloads & ENETC_F_RX_TSTAMP) enetc_get_rx_tstamp(rx_ring->ndev, rxbd, skb); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index 8b54562f5da6..a62604a1e54e 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -172,6 +172,8 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_PSIPMAR0(n) (0x0100 + (n) * 0x8) /* n = SI index */ #define ENETC_PSIPMAR1(n) (0x0104 + (n) * 0x8) #define ENETC_PVCLCTR 0x0208 +#define ENETC_PCVLANR1 0x0210 +#define ENETC_PCVLANR2 0x0214 #define ENETC_VLAN_TYPE_C BIT(0) #define ENETC_VLAN_TYPE_S BIT(1) #define ENETC_PVCLCTR_OVTPIDL(bmp) ((bmp) & 0xff) /* VLAN_TYPE */ @@ -570,6 +572,7 @@ union enetc_rx_bd { #define ENETC_RXBD_LSTATUS(flags) ((flags) << 16) #define ENETC_RXBD_FLAG_VLAN BIT(9) #define ENETC_RXBD_FLAG_TSTMP BIT(10) +#define ENETC_RXBD_FLAG_TPID GENMASK(1, 0) #define ENETC_MAC_ADDR_FILT_CNT 8 /* # of supported entries per port */ #define EMETC_MAC_ADDR_FILT_RES 3 /* # of reserved entries at the beginning */ From a74dbce9d4541888fe0d39afe69a3a95004669b4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 1 Mar 2021 13:18:15 +0200 Subject: [PATCH 070/164] net: enetc: don't disable VLAN filtering in IFF_PROMISC mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Quoting from the blamed commit: In promiscuous mode, it is 
more intuitive that all traffic is received, including VLAN tagged traffic. It appears that it is necessary to set the flag in PSIPVMR for that to be the case, so VLAN promiscuous mode is also temporarily enabled. On exit from promiscuous mode, the setting made by ethtool is restored. Intuitive or not, there isn't any definition issued by a standards body which says that promiscuity has anything to do with VLAN filtering - it only has to do with accepting packets regardless of destination MAC address. In fact people are already trying to use this misunderstanding/bug of the enetc driver as a justification to transform promiscuity into something it never was about: accepting every packet (maybe that would be the "rx-all" netdev feature?): https://lore.kernel.org/netdev/20201110153958.ci5ekor3o2ekg3ky@ipetronik.com/ This is relevant because there are use cases in the kernel (such as tc-flower rules with the protocol 802.1Q and a vlan_id key) which do not (yet) use the vlan_vid_add API to be compatible with VLAN-filtering NICs such as enetc, so for those, disabling rx-vlan-filter is currently the only right solution to make these setups work: https://lore.kernel.org/netdev/CA+h21hoxwRdhq4y+w8Kwgm74d4cA0xLeiHTrmT-VpSaM7obhkg@mail.gmail.com/ The blamed patch has unintentionally introduced one more way for this to work, which is to enable IFF_PROMISC, however this is non-portable because port promiscuity is not meant to disable VLAN filtering. Therefore, it could invite people to write broken scripts for enetc, and then wonder why they are broken when migrating to other drivers that don't handle promiscuity in the same way. Fixes: 7070eea5e95a ("enetc: permit configuration of rx-vlan-filter with ethtool") Cc: Markus Blöchl Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/enetc/enetc_pf.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 62ba4bf56f0d..49681a0566ed 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -190,7 +190,6 @@ static void enetc_pf_set_rx_mode(struct net_device *ndev) { struct enetc_ndev_priv *priv = netdev_priv(ndev); struct enetc_pf *pf = enetc_si_priv(priv->si); - char vlan_promisc_simap = pf->vlan_promisc_simap; struct enetc_hw *hw = &priv->si->hw; bool uprom = false, mprom = false; struct enetc_mac_filter *filter; @@ -203,16 +202,12 @@ static void enetc_pf_set_rx_mode(struct net_device *ndev) psipmr = ENETC_PSIPMR_SET_UP(0) | ENETC_PSIPMR_SET_MP(0); uprom = true; mprom = true; - /* Enable VLAN promiscuous mode for SI0 (PF) */ - vlan_promisc_simap |= BIT(0); } else if (ndev->flags & IFF_ALLMULTI) { /* enable multi cast promisc mode for SI0 (PF) */ psipmr = ENETC_PSIPMR_SET_MP(0); mprom = true; } - enetc_set_vlan_promisc(&pf->si->hw, vlan_promisc_simap); - /* first 2 filter entries belong to PF */ if (!uprom) { /* Update unicast filters */ From c76a97218dcbb2cb7cec1404ace43ef96c87d874 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 1 Mar 2021 13:18:16 +0200 Subject: [PATCH 071/164] net: enetc: force the RGMII speed and duplex instead of operating in inband mode The ENETC port 0 MAC supports in-band status signaling coming from a PHY when operating in RGMII mode, and this feature is enabled by default. It has been reported that RGMII is broken in fixed-link, and that is not surprising considering the fact that no PHY is attached to the MAC in that case, but a switch. This brings us to the topic of the patch: the enetc driver should not have enabled the optional in-band status signaling for RGMII unconditionally, but should have forced the speed and duplex to what was resolved by phylink.
Note that phylink does not accept the RGMII modes as valid for in-band signaling, and these operate a bit differently than 1000base-x and SGMII (notably there is no clause 37 state machine so no ACK required from the MAC, instead the PHY sends extra code words on RXD[3:0] whenever it is not transmitting something else, so it should be safe to leave a PHY with this option unconditionally enabled even if we ignore it). The spec talks about this here: https://e2e.ti.com/cfs-file/__key/communityserver-discussions-components-files/138/RGMIIv1_5F00_3.pdf Fixes: 71b77a7a27a3 ("enetc: Migrate to PHYLINK and PCS_LYNX") Cc: Florian Fainelli Cc: Andrew Lunn Cc: Russell King Signed-off-by: Vladimir Oltean Acked-by: Russell King Signed-off-by: David S. Miller --- .../net/ethernet/freescale/enetc/enetc_hw.h | 13 +++-- .../net/ethernet/freescale/enetc/enetc_pf.c | 53 ++++++++++++++++--- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index a62604a1e54e..de0d20b0f489 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -238,10 +238,17 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_PM_IMDIO_BASE 0x8030 #define ENETC_PM0_IF_MODE 0x8300 -#define ENETC_PMO_IFM_RG BIT(2) +#define ENETC_PM0_IFM_RG BIT(2) #define ENETC_PM0_IFM_RLP (BIT(5) | BIT(11)) -#define ENETC_PM0_IFM_RGAUTO (BIT(15) | ENETC_PMO_IFM_RG | BIT(1)) -#define ENETC_PM0_IFM_XGMII BIT(12) +#define ENETC_PM0_IFM_EN_AUTO BIT(15) +#define ENETC_PM0_IFM_SSP_MASK GENMASK(14, 13) +#define ENETC_PM0_IFM_SSP_1000 (2 << 13) +#define ENETC_PM0_IFM_SSP_100 (0 << 13) +#define ENETC_PM0_IFM_SSP_10 (1 << 13) +#define ENETC_PM0_IFM_FULL_DPX BIT(12) +#define ENETC_PM0_IFM_IFMODE_MASK GENMASK(1, 0) +#define ENETC_PM0_IFM_IFMODE_XGMII 0 +#define ENETC_PM0_IFM_IFMODE_GMII 2 #define ENETC_PSIDCAPR 0x1b08 #define ENETC_PSIDCAPR_MSK GENMASK(15, 0) #define ENETC_PSFCAPR 
0x1b18 diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 49681a0566ed..ca02f033bea2 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -315,7 +315,7 @@ static void enetc_set_loopback(struct net_device *ndev, bool en) u32 reg; reg = enetc_port_rd(hw, ENETC_PM0_IF_MODE); - if (reg & ENETC_PMO_IFM_RG) { + if (reg & ENETC_PM0_IFM_RG) { /* RGMII mode */ reg = (reg & ~ENETC_PM0_IFM_RLP) | (en ? ENETC_PM0_IFM_RLP : 0); @@ -494,13 +494,20 @@ static void enetc_configure_port_mac(struct enetc_hw *hw) static void enetc_mac_config(struct enetc_hw *hw, phy_interface_t phy_mode) { - /* set auto-speed for RGMII */ - if (enetc_port_rd(hw, ENETC_PM0_IF_MODE) & ENETC_PMO_IFM_RG || - phy_interface_mode_is_rgmii(phy_mode)) - enetc_port_wr(hw, ENETC_PM0_IF_MODE, ENETC_PM0_IFM_RGAUTO); + u32 val; - if (phy_mode == PHY_INTERFACE_MODE_USXGMII) - enetc_port_wr(hw, ENETC_PM0_IF_MODE, ENETC_PM0_IFM_XGMII); + if (phy_interface_mode_is_rgmii(phy_mode)) { + val = enetc_port_rd(hw, ENETC_PM0_IF_MODE); + val &= ~ENETC_PM0_IFM_EN_AUTO; + val &= ENETC_PM0_IFM_IFMODE_MASK; + val |= ENETC_PM0_IFM_IFMODE_GMII | ENETC_PM0_IFM_RG; + enetc_port_wr(hw, ENETC_PM0_IF_MODE, val); + } + + if (phy_mode == PHY_INTERFACE_MODE_USXGMII) { + val = ENETC_PM0_IFM_FULL_DPX | ENETC_PM0_IFM_IFMODE_XGMII; + enetc_port_wr(hw, ENETC_PM0_IF_MODE, val); + } } static void enetc_mac_enable(struct enetc_hw *hw, bool en) @@ -932,6 +939,34 @@ static void enetc_pl_mac_config(struct phylink_config *config, phylink_set_pcs(priv->phylink, &pf->pcs->pcs); } +static void enetc_force_rgmii_mac(struct enetc_hw *hw, int speed, int duplex) +{ + u32 old_val, val; + + old_val = val = enetc_port_rd(hw, ENETC_PM0_IF_MODE); + + if (speed == SPEED_1000) { + val &= ~ENETC_PM0_IFM_SSP_MASK; + val |= ENETC_PM0_IFM_SSP_1000; + } else if (speed == SPEED_100) { + val &= ~ENETC_PM0_IFM_SSP_MASK; + val |= 
ENETC_PM0_IFM_SSP_100; + } else if (speed == SPEED_10) { + val &= ~ENETC_PM0_IFM_SSP_MASK; + val |= ENETC_PM0_IFM_SSP_10; + } + + if (duplex == DUPLEX_FULL) + val |= ENETC_PM0_IFM_FULL_DPX; + else + val &= ~ENETC_PM0_IFM_FULL_DPX; + + if (val == old_val) + return; + + enetc_port_wr(hw, ENETC_PM0_IF_MODE, val); +} + static void enetc_pl_mac_link_up(struct phylink_config *config, struct phy_device *phy, unsigned int mode, phy_interface_t interface, int speed, @@ -944,6 +979,10 @@ static void enetc_pl_mac_link_up(struct phylink_config *config, if (priv->active_offloads & ENETC_F_QBV) enetc_sched_speed_set(priv, speed); + if (!phylink_autoneg_inband(mode) && + phy_interface_mode_is_rgmii(interface)) + enetc_force_rgmii_mac(&pf->si->hw, speed, duplex); + enetc_mac_enable(&pf->si->hw, true); } From 96a5223b918c8b79270fc0fec235a7ebad459098 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 1 Mar 2021 13:18:17 +0200 Subject: [PATCH 072/164] net: enetc: remove bogus write to SIRXIDR from enetc_setup_rxbdr The Station Interface Receive Interrupt Detect Register (SIRXIDR) contains a 16-bit wide mask of 'interrupt detected' events for each ring associated with a port. Bit i is write-1-to-clear for RX ring i. I have no explanation whatsoever how this line of code came to be inserted in the blamed commit. I checked the downstream versions of that patch and none of them have it. The somewhat comical aspect of it is that we're writing a binary number to the SIRXIDR register, which is derived from enetc_bd_unused(rx_ring). Since the RX rings have 512 buffer descriptors, we end up writing 511 to this register, which is 0x1ff, so we are effectively clearing the 'interrupt detected' event for rings 0-8. This register is not what is used for interrupt handling though - it only provides a summary for the entire SI. The hardware provides one separate Interrupt Detect Register per RX ring, which auto-clears upon read.
So there doesn't seem to be any adverse effect caused by this bogus write. There is, however, one reason why this should be handled as a bugfix: next_to_clean _should_ be committed to hardware, just not to that register, and this was obscuring the fact that it wasn't. This is fixed in the next patch, and removing the bogus line now allows the fix patch to be backported beyond that point. Fixes: fd5736bf9f23 ("enetc: Workaround for MDIO register access issue") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 8ddf0cdc37a5..abb29ee81463 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1212,7 +1212,6 @@ static void enetc_setup_rxbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) rx_ring->idr = hw->reg + ENETC_SIRXIDR; enetc_refill_rx_ring(rx_ring, enetc_bd_unused(rx_ring)); - enetc_wr(hw, ENETC_SIRXIDR, rx_ring->next_to_use); /* enable ring */ enetc_rxbdr_wr(hw, idx, ENETC_RBMR, rbmr); From 3a5d12c9be6f30080600c8bacaf310194e37d029 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 1 Mar 2021 13:18:18 +0200 Subject: [PATCH 073/164] net: enetc: keep RX ring consumer index in sync with hardware The RX rings have a producer index owned by hardware, where newly received frame buffers are placed, and a consumer index owned by software, where newly allocated buffers are placed, in expectation of hardware being able to place frame data in them. Hardware increments the producer index when a frame is received, however it is not allowed to increment the producer index to match the consumer index (RBCIR) since the ring can hold at most RBLENR[LENGTH]-1 received BDs. 
Whenever the producer index matches the value of the consumer index, the ring has no unprocessed received frames and all BDs in the ring have been initialized/prepared by software, i.e. hardware owns all BDs in the ring. The code uses the next_to_clean variable to keep track of the producer index, and the next_to_use variable to keep track of the consumer index. The RX rings are seeded from enetc_refill_rx_ring, which is called from two places: 1. initially the ring is seeded until full with enetc_bd_unused(rx_ring), i.e. with 511 buffers. This will make next_to_clean=0 and next_to_use=511: .ndo_open -> enetc_open -> enetc_setup_bdrs -> enetc_setup_rxbdr -> enetc_refill_rx_ring 2. then during the data path processing, it is refilled with 16 buffers at a time: enetc_msix -> napi_schedule -> enetc_poll -> enetc_clean_rx_ring -> enetc_refill_rx_ring There is just one problem: the initial seeding done during .ndo_open updates just the producer index (ENETC_RBPIR) with 0, and the software next_to_clean and next_to_use variables. Notably, it will not update the consumer index to make the hardware aware of the newly added buffers. Wait, what? So how does it work? Well, the reset values of the producer index and of the consumer index of a ring are both zero. As per the description in the second paragraph, it means that the ring is full of buffers waiting for hardware to put frames in them, which by coincidence is almost true, because we have in fact seeded 511 buffers into the ring. But will the hardware attempt to access the 512th entry of the ring, which has an invalid BD in it? Well, no, because in order to do that, it would have to first populate the first 511 entries, and the NAPI enetc_poll will kick in by then. Eventually, after 16 processed slots have become available in the RX ring, enetc_clean_rx_ring will call enetc_refill_rx_ring and then will [ finally ] update the consumer index with the new software next_to_use variable. 
From now on, the next_to_clean and next_to_use variables are in sync with the producer and consumer ring indices. So the day is saved, right? Well, not quite. Freeing the memory allocated for the rings is done in: enetc_close -> enetc_clear_bdrs -> enetc_clear_rxbdr -> this just disables the ring -> enetc_free_rxtx_rings -> enetc_free_rx_ring -> sets next_to_clean and next_to_use to 0 but again, nothing is committed to the hardware producer and consumer indices (yay!). The assumption is that the ring is disabled, so the indices don't matter anyway, and it's the responsibility of the "open" code path to set those up. .. Except that the "open" code path does not set those up properly. While initially, things almost work, during subsequent enetc_close -> enetc_open sequences, we have problems. To be precise, the enetc_open that is subsequent to enetc_close will again refill the ring with 511 entries, but it will leave the consumer index untouched. Untouched means, of course, equal to the value it had before disabling the ring and draining the old buffers in enetc_close. But as mentioned, enetc_setup_rxbdr will at least update the producer index though, through this line of code: enetc_rxbdr_wr(hw, idx, ENETC_RBPIR, 0); so at this stage we'll have: next_to_clean=0 (in hardware 0) next_to_use=511 (in hardware we'll have the refill index prior to enetc_close) Again, the next_to_clean and producer index are in sync and set to correct values, so the driver manages to limp on. Eventually, 16 ring entries will be consumed by enetc_poll, and the savior enetc_clean_rx_ring will come and call enetc_refill_rx_ring, and then update the hardware consumer ring based upon the new next_to_use. So.. it works? Well, by coincidence, it almost does, but there's a circumstance where enetc_clean_rx_ring won't be there to save us. 
If the previous value of the consumer index was 15, there's a problem, because the NAPI poll sequence will only issue a refill when 16 or more buffers have been consumed. It's easiest to illustrate this with an example: ip link set eno0 up ip addr add 192.168.100.1/24 dev eno0 ping 192.168.100.1 -c 20 # ping this port from another board ip link set eno0 down ip link set eno0 up ping 192.168.100.1 -c 20 # ping it again from the same other board One by one: 1. ip link set eno0 up -> calls enetc_setup_rxbdr: -> calls enetc_refill_rx_ring(511 buffers) -> next_to_clean=0 (in hw 0) -> next_to_use=511 (in hw 0) 2. ping 192.168.100.1 -c 20 # ping this port from another board enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=1 next_to_clean 0 (in hw 1) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=2 next_to_clean 1 (in hw 2) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=3 next_to_clean 2 (in hw 3) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=4 next_to_clean 3 (in hw 4) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=5 next_to_clean 4 (in hw 5) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=6 next_to_clean 5 (in hw 6) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=7 next_to_clean 6 (in hw 7) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=8 next_to_clean 7 (in hw 8) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=9 next_to_clean 8 (in hw 9) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=10 next_to_clean 9 (in hw 10) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=11 next_to_clean 10 (in hw 11) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=12 next_to_clean 11 (in hw 12) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=13 next_to_clean 12 (in hw 13) next_to_use 511 (in hw 0) 
enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=14 next_to_clean 13 (in hw 14) next_to_use 511 (in hw 0) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=15 next_to_clean 14 (in hw 15) next_to_use 511 (in hw 0) enetc_clean_rx_ring: enetc_refill_rx_ring(16) increments next_to_use by 16 (mod 512) and writes it to hw enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=0 next_to_clean 15 (in hw 16) next_to_use 15 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=1 next_to_clean 16 (in hw 17) next_to_use 15 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=2 next_to_clean 17 (in hw 18) next_to_use 15 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=3 next_to_clean 18 (in hw 19) next_to_use 15 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=4 next_to_clean 19 (in hw 20) next_to_use 15 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=5 next_to_clean 20 (in hw 21) next_to_use 15 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=6 next_to_clean 21 (in hw 22) next_to_use 15 (in hw 15) 20 packets transmitted, 20 packets received, 0% packet loss 3. ip link set eno0 down enetc_free_rx_ring: next_to_clean 0 (in hw 22), next_to_use 0 (in hw 15) 4. ip link set eno0 up -> calls enetc_setup_rxbdr: -> calls enetc_refill_rx_ring(511 buffers) -> next_to_clean=0 (in hw 0) -> next_to_use=511 (in hw 15) 5. 
ping 192.168.100.1 -c 20 # ping it again from the same other board enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=1 next_to_clean 0 (in hw 1) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=2 next_to_clean 1 (in hw 2) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=3 next_to_clean 2 (in hw 3) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=4 next_to_clean 3 (in hw 4) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=5 next_to_clean 4 (in hw 5) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=6 next_to_clean 5 (in hw 6) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=7 next_to_clean 6 (in hw 7) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=8 next_to_clean 7 (in hw 8) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=9 next_to_clean 8 (in hw 9) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=10 next_to_clean 9 (in hw 10) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=11 next_to_clean 10 (in hw 11) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=12 next_to_clean 11 (in hw 12) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=13 next_to_clean 12 (in hw 13) next_to_use 511 (in hw 15) enetc_clean_rx_ring: rx_frm_cnt=1 cleaned_cnt=14 next_to_clean 13 (in hw 14) next_to_use 511 (in hw 15) 20 packets transmitted, 12 packets received, 40% packet loss And there it dies. No enetc_refill_rx_ring (because cleaned_cnt must be equal to 15 for that to happen), no nothing. 
The hardware enters the condition where the producer (14) + 1 is equal to the consumer (15) index, which makes it believe it has no more free buffers to put packets in, so it starts discarding them: ip netns exec ns0 ethtool -S eno0 | grep -v ': 0' NIC statistics: Rx ring 0 discarded frames: 8 Summarized, if the interface receives between 16 and 32 (mod 512) frames and then there is a link flap, then the port will eventually die with no way to recover. If it receives less than 16 (mod 512) frames, then the initial NAPI poll [ before the link flap ] will not update the consumer index in hardware (it will remain zero) which will be ok when the buffers are later reinitialized. If more than 32 (mod 512) frames are received, the initial NAPI poll has the chance to refill the ring twice, updating the consumer index to at least 32. So after the link flap, the consumer index is still wrong, but the post-flap NAPI poll gets a chance to refill the ring once (because it passes through cleaned_cnt=15) and makes the consumer index be again back in sync with next_to_use. The solution to this problem is actually simple, we just need to write next_to_use into the hardware consumer index at enetc_open time, which always brings it back in sync after an initial buffer seeding process. The simpler thing would be to put the write to the consumer index into enetc_refill_rx_ring directly, but there are issues with the MDIO locking: in the NAPI poll code we have the enetc_lock_mdio() taken from top-level and we use the unlocked enetc_wr_reg_hot, whereas in enetc_open, the enetc_lock_mdio() is not taken at the top level, but instead by each individual enetc_wr_reg, so we are forced to put an additional enetc_wr_reg in enetc_setup_rxbdr. Better organization of the code is left as a refactoring exercise. Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index abb29ee81463..30d7d4e83900 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1212,6 +1212,8 @@ static void enetc_setup_rxbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) rx_ring->idr = hw->reg + ENETC_SIRXIDR; enetc_refill_rx_ring(rx_ring, enetc_bd_unused(rx_ring)); + /* update ENETC's consumer index */ + enetc_rxbdr_wr(hw, idx, ENETC_RBCIR, rx_ring->next_to_use); /* enable ring */ enetc_rxbdr_wr(hw, idx, ENETC_RBMR, rbmr); From 2353db75c3db1dd26ff9c8feccfd3543a9cb73be Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 1 Mar 2021 21:28:23 +0900 Subject: [PATCH 074/164] docs: networking: bonding.rst Fix a typo in bonding.rst This patch fixes a spelling typo in bonding.rst. Signed-off-by: Masanari Iida Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 5f690f0ad0e4..62f2aab8eaec 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -1988,7 +1988,7 @@ netif_carrier. If use_carrier is 0, then the MII monitor will first query the device's (via ioctl) MII registers and check the link state. If that request fails (not just that it returns carrier down), then the MII -monitor will make an ethtool ETHOOL_GLINK request to attempt to obtain +monitor will make an ethtool ETHTOOL_GLINK request to attempt to obtain the same information. 
If both methods fail (i.e., the driver either does not support or had some error in processing both the MII register and ethtool requests), then the MII monitor will assume the link is From b228c9b058760500fda5edb3134527f629fc2dc3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 1 Mar 2021 15:09:44 +0000 Subject: [PATCH 075/164] net: expand textsearch ts_state to fit skb_seq_state The referenced commit expands the skb_seq_state used by skb_find_text with a 4B frag_off field, growing it to 48B. This exceeds container ts_state->cb, causing a stack corruption: [ 73.238353] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: skb_find_text+0xc5/0xd0 [ 73.247384] CPU: 1 PID: 376 Comm: nping Not tainted 5.11.0+ #4 [ 73.252613] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 [ 73.260078] Call Trace: [ 73.264677] dump_stack+0x57/0x6a [ 73.267866] panic+0xf6/0x2b7 [ 73.270578] ? skb_find_text+0xc5/0xd0 [ 73.273964] __stack_chk_fail+0x10/0x10 [ 73.277491] skb_find_text+0xc5/0xd0 [ 73.280727] string_mt+0x1f/0x30 [ 73.283639] ipt_do_table+0x214/0x410 The struct is passed between skb_find_text and its callbacks skb_prepare_seq_read, skb_seq_read and skb_abort_seq_read through the textsearch interface using TS_SKB_CB. I assumed that this mapped to skb->cb like other .._SKB_CB wrappers. skb->cb is 48B. But it maps to ts_state->cb, which is only 40B. skb->cb was increased from 40B to 48B after ts_state was introduced, in commit 3e3850e989c5 ("[NETFILTER]: Fix xfrm lookup in ip_route_me_harder/ip6_route_me_harder"). Increase ts_state.cb[] to 48 to fit the struct. Also add a BUILD_BUG_ON to avoid a repeat. The alternative is to directly add a dependency from textsearch onto linux/skbuff.h, but I think the intent is for textsearch to have no such dependencies on its callers.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=211911 Fixes: 97550f6fa592 ("net: compound page support in skb_seq_read") Reported-by: Kris Karas Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/textsearch.h | 2 +- net/core/skbuff.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h index 13770cfe33ad..6673e4d4ac2e 100644 --- a/include/linux/textsearch.h +++ b/include/linux/textsearch.h @@ -23,7 +23,7 @@ struct ts_config; struct ts_state { unsigned int offset; - char cb[40]; + char cb[48]; }; /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 545a472273a5..c421c8f80925 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3659,6 +3659,8 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, struct ts_state state; unsigned int ret; + BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); + config->get_next_block = skb_ts_get_next_block; config->finish = skb_ts_finish; From 9200f515c41f4cbaeffd8fdd1d8b6373a18b1b67 Mon Sep 17 00:00:00 2001 From: DENG Qingfang Date: Tue, 2 Mar 2021 00:01:59 +0800 Subject: [PATCH 076/164] net: dsa: tag_mtk: fix 802.1ad VLAN egress A different TPID bit is used for 802.1ad VLAN frames. Reported-by: Ilario Gelmetti Fixes: f0af34317f4b ("net: dsa: mediatek: combine MediaTek tag with VLAN tag") Signed-off-by: DENG Qingfang Signed-off-by: David S. 
Miller --- net/dsa/tag_mtk.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 38dcdded74c0..59748487664f 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -13,6 +13,7 @@ #define MTK_HDR_LEN 4 #define MTK_HDR_XMIT_UNTAGGED 0 #define MTK_HDR_XMIT_TAGGED_TPID_8100 1 +#define MTK_HDR_XMIT_TAGGED_TPID_88A8 2 #define MTK_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0) #define MTK_HDR_XMIT_DP_BIT_MASK GENMASK(5, 0) #define MTK_HDR_XMIT_SA_DIS BIT(6) @@ -21,8 +22,8 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); + u8 xmit_tpid; u8 *mtk_tag; - bool is_vlan_skb = true; unsigned char *dest = eth_hdr(skb)->h_dest; bool is_multicast_skb = is_multicast_ether_addr(dest) && !is_broadcast_ether_addr(dest); @@ -33,10 +34,17 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, * the both special and VLAN tag at the same time and then look up VLAN * table with VID. */ - if (!skb_vlan_tagged(skb)) { + switch (skb->protocol) { + case htons(ETH_P_8021Q): + xmit_tpid = MTK_HDR_XMIT_TAGGED_TPID_8100; + break; + case htons(ETH_P_8021AD): + xmit_tpid = MTK_HDR_XMIT_TAGGED_TPID_88A8; + break; + default: + xmit_tpid = MTK_HDR_XMIT_UNTAGGED; skb_push(skb, MTK_HDR_LEN); memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); - is_vlan_skb = false; } mtk_tag = skb->data + 2 * ETH_ALEN; @@ -44,8 +52,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, /* Mark tag attribute on special tag insertion to notify hardware * whether that's a combined special tag with 802.1Q header. */ - mtk_tag[0] = is_vlan_skb ? 
MTK_HDR_XMIT_TAGGED_TPID_8100 : - MTK_HDR_XMIT_UNTAGGED; + mtk_tag[0] = xmit_tpid; mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; /* Disable SA learning for multicast frames */ @@ -53,7 +60,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, mtk_tag[1] |= MTK_HDR_XMIT_SA_DIS; /* Tag control information is kept for 802.1Q */ - if (!is_vlan_skb) { + if (xmit_tpid == MTK_HDR_XMIT_UNTAGGED) { mtk_tag[2] = 0; mtk_tag[3] = 0; } From 3946688edbc5b629110c339b3babf10aa9e7adad Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 1 Mar 2021 19:25:30 +0100 Subject: [PATCH 077/164] hv_netvsc: Fix validation in netvsc_linkstatus_callback() Contrary to the RNDIS protocol specification, certain (pre-Fe) implementations of Hyper-V's vSwitch did not account for the status buffer field in the length of an RNDIS packet; the bug was fixed in newer implementations. Validate the status buffer fields using the length of the 'vmtransfer_page' packet (all implementations), that is known/validated to be less than or equal to the receive section size and not smaller than the length of the RNDIS message. Reported-by: Dexuan Cui Suggested-by: Haiyang Zhang Signed-off-by: Andrea Parri (Microsoft) Fixes: 505e3f00c3f36 ("hv_netvsc: Add (more) validation for untrusted Hyper-V values") Signed-off-by: David S. 
Miller --- drivers/net/hyperv/hyperv_net.h | 2 +- drivers/net/hyperv/netvsc_drv.c | 13 +++++++++---- drivers/net/hyperv/rndis_filter.c | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index e1a497d3c9ba..59ac04a610ad 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -229,7 +229,7 @@ int netvsc_send(struct net_device *net, bool xdp_tx); void netvsc_linkstatus_callback(struct net_device *net, struct rndis_message *resp, - void *data); + void *data, u32 data_buflen); int netvsc_recv_callback(struct net_device *net, struct netvsc_device *nvdev, struct netvsc_channel *nvchan); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 8176fa0c8b16..15f262b70489 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -744,7 +744,7 @@ static netdev_tx_t netvsc_start_xmit(struct sk_buff *skb, */ void netvsc_linkstatus_callback(struct net_device *net, struct rndis_message *resp, - void *data) + void *data, u32 data_buflen) { struct rndis_indicate_status *indicate = &resp->msg.indicate_status; struct net_device_context *ndev_ctx = netdev_priv(net); @@ -765,11 +765,16 @@ void netvsc_linkstatus_callback(struct net_device *net, if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) { u32 speed; - /* Validate status_buf_offset */ + /* Validate status_buf_offset and status_buflen. + * + * Certain (pre-Fe) implementations of Hyper-V's vSwitch didn't account + * for the status buffer field in resp->msg_len; perform the validation + * using data_buflen (>= resp->msg_len). 
+ */ if (indicate->status_buflen < sizeof(speed) || indicate->status_buf_offset < sizeof(*indicate) || - resp->msg_len - RNDIS_HEADER_SIZE < indicate->status_buf_offset || - resp->msg_len - RNDIS_HEADER_SIZE - indicate->status_buf_offset + data_buflen - RNDIS_HEADER_SIZE < indicate->status_buf_offset || + data_buflen - RNDIS_HEADER_SIZE - indicate->status_buf_offset < indicate->status_buflen) { netdev_err(net, "invalid rndis_indicate_status packet\n"); return; diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 123cc9d25f5e..c0e89e107d57 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -620,7 +620,7 @@ int rndis_filter_receive(struct net_device *ndev, case RNDIS_MSG_INDICATE: /* notification msgs */ - netvsc_linkstatus_callback(ndev, rndis_msg, data); + netvsc_linkstatus_callback(ndev, rndis_msg, data, buflen); break; default: netdev_err(ndev, From 8811f4a9836e31c14ecdf79d9f3cb7c5d463265d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Mar 2021 10:29:17 -0800 Subject: [PATCH 078/164] tcp: add sanity tests to TCP_QUEUE_SEQ Qingyu Li reported a syzkaller bug where the repro changes RCV SEQ _after_ restoring data in the receive queue. 
mprotect(0x4aa000, 12288, PROT_READ) = 0 mmap(0x1ffff000, 4096, PROT_NONE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x1ffff000 mmap(0x20000000, 16777216, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x20000000 mmap(0x21000000, 4096, PROT_NONE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x21000000 socket(AF_INET6, SOCK_STREAM, IPPROTO_IP) = 3 setsockopt(3, SOL_TCP, TCP_REPAIR, [1], 4) = 0 connect(3, {sa_family=AF_INET6, sin6_port=htons(0), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_scope_id=0}, 28) = 0 setsockopt(3, SOL_TCP, TCP_REPAIR_QUEUE, [1], 4) = 0 sendmsg(3, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="0x0000000000000003\0\0", iov_len=20}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20 setsockopt(3, SOL_TCP, TCP_REPAIR, [0], 4) = 0 setsockopt(3, SOL_TCP, TCP_QUEUE_SEQ, [128], 4) = 0 recvfrom(3, NULL, 20, 0, NULL, NULL) = -1 ECONNRESET (Connection reset by peer) syslog shows: [ 111.205099] TCP recvmsg seq # bug 2: copied 80, seq 0, rcvnxt 80, fl 0 [ 111.207894] WARNING: CPU: 1 PID: 356 at net/ipv4/tcp.c:2343 tcp_recvmsg_locked+0x90e/0x29a0 This should not be allowed. TCP_QUEUE_SEQ should only be used when queues are empty. This patch fixes this case, and the tx path as well. Fixes: ee9952831cfd ("tcp: Initial repair mode") Signed-off-by: Eric Dumazet Cc: Pavel Emelyanov Link: https://bugzilla.kernel.org/show_bug.cgi?id=212005 Reported-by: Qingyu Li Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dfb6f286c1de..de7cc8445ac0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3469,16 +3469,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case TCP_QUEUE_SEQ: - if (sk->sk_state != TCP_CLOSE) + if (sk->sk_state != TCP_CLOSE) { err = -EPERM; - else if (tp->repair_queue == TCP_SEND_QUEUE) - WRITE_ONCE(tp->write_seq, val); - else if (tp->repair_queue == TCP_RECV_QUEUE) { - WRITE_ONCE(tp->rcv_nxt, val); - WRITE_ONCE(tp->copied_seq, val); - } - else + } else if (tp->repair_queue == TCP_SEND_QUEUE) { + if (!tcp_rtx_queue_empty(sk)) + err = -EPERM; + else + WRITE_ONCE(tp->write_seq, val); + } else if (tp->repair_queue == TCP_RECV_QUEUE) { + if (tp->rcv_nxt != tp->copied_seq) { + err = -EPERM; + } else { + WRITE_ONCE(tp->rcv_nxt, val); + WRITE_ONCE(tp->copied_seq, val); + } + } else { err = -EINVAL; + } break; case TCP_REPAIR_OPTIONS: From 42a382a466a967dc053c73b969cd2ac2fec502cf Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Sat, 27 Feb 2021 06:17:26 +0100 Subject: [PATCH 079/164] selftests/bpf: Use the last page in test_snprintf_btf on s390 test_snprintf_btf fails on s390, because NULL points to a readable struct lowcore there. Fix by using the last page instead. 
Error message example: printing fffffffffffff000 should generate error, got (361) Fixes: 076a95f5aff2 ("selftests/bpf: Add bpf_snprintf_btf helper tests") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Acked-by: Heiko Carstens Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210227051726.121256-1-iii@linux.ibm.com --- .../testing/selftests/bpf/progs/netif_receive_skb.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/netif_receive_skb.c b/tools/testing/selftests/bpf/progs/netif_receive_skb.c index 6b670039ea67..1d8918dfbd3f 100644 --- a/tools/testing/selftests/bpf/progs/netif_receive_skb.c +++ b/tools/testing/selftests/bpf/progs/netif_receive_skb.c @@ -16,6 +16,13 @@ bool skip = false; #define STRSIZE 2048 #define EXPECTED_STRSIZE 256 +#if defined(bpf_target_s390) +/* NULL points to a readable struct lowcore on s390, so take the last page */ +#define BADPTR ((void *)0xFFFFFFFFFFFFF000ULL) +#else +#define BADPTR 0 +#endif + #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif @@ -113,11 +120,11 @@ int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb) } /* Check invalid ptr value */ - p.ptr = 0; + p.ptr = BADPTR; __ret = bpf_snprintf_btf(str, STRSIZE, &p, sizeof(p), 0); if (__ret >= 0) { - bpf_printk("printing NULL should generate error, got (%d)", - __ret); + bpf_printk("printing %llx should generate error, got (%d)", + (unsigned long long)BADPTR, __ret); ret = -ERANGE; } From 6185266c5a853bb0f2a459e3ff594546f277609b Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Sun, 28 Feb 2021 12:30:17 +0200 Subject: [PATCH 080/164] selftests/bpf: Mask bpf_csum_diff() return value to 16 bits in test_verifier The verifier test labelled "valid read map access into a read-only array 2" calls the bpf_csum_diff() helper and checks its return value. 
However, architecture implementations of csum_partial() (which is what the helper uses) differ in whether they fold the return value to 16 bit or not. For example, x86 version has ... if (unlikely(odd)) { result = from32to16(result); result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); } ... while generic lib/checksum.c does: result = from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); This makes the helper return different values on different architectures, breaking the test on non-x86. To fix this, add an additional instruction to always mask the return value to 16 bits, and update the expected return value accordingly. Fixes: fb2abb73e575 ("bpf, selftest: test {rd, wr}only flags and direct value access") Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210228103017.320240-1-yauheni.kaliuta@redhat.com --- tools/testing/selftests/bpf/verifier/array_access.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index bed53b561e04..1b138cd2b187 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -250,12 +250,13 @@ BPF_MOV64_IMM(BPF_REG_5, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffff), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_array_ro = { 3 }, .result = ACCEPT, - .retval = -29, + .retval = 65507, }, { "invalid write map access into a read-only array 1", From 9cc0001a18b4e5f46ec481201c88ae16f0a69bb0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 27 Feb 2021 22:31:27 +0100 Subject: [PATCH 081/164] netfilter: nftables: disallow updates on table ownership Disallow updating the ownership bit on an existing table: Do not allow to grab ownership on an existing table. 
Do not allow to drop ownership on an existing table. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1eb5cdb3033..b07703e19108 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -916,6 +916,12 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if (flags == ctx->table->flags) return 0; + if ((nft_table_has_owner(ctx->table) && + !(flags & NFT_TABLE_F_OWNER)) || + (!nft_table_has_owner(ctx->table) && + flags & NFT_TABLE_F_OWNER)) + return -EOPNOTSUPP; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) From fa706dce2f2d7012654e2eab40da2b526c1424b3 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Tue, 2 Mar 2021 16:57:21 +0800 Subject: [PATCH 082/164] stmmac: intel: Fix mdio bus registration issue for TGL-H/ADL-S On Intel platforms which consist of two Ethernet Controllers such as TGL-H and ADL-S, a unique MDIO bus id is required for MDIO bus to be successfully registered: [ 13.076133] sysfs: cannot create duplicate filename '/class/mdio_bus/stmmac-1' [ 13.083404] CPU: 8 PID: 1898 Comm: systemd-udevd Tainted: G U 5.11.0-net-next #106 [ 13.092410] Hardware name: Intel Corporation Alder Lake Client Platform/AlderLake-S ADP-S DRR4 CRB, BIOS ADLIFSI1.R00.1494.B00.2012031421 12/03/2020 [ 13.105709] Call Trace: [ 13.108176] dump_stack+0x64/0x7c [ 13.111553] sysfs_warn_dup+0x56/0x70 [ 13.115273] sysfs_do_create_link_sd.isra.2+0xbd/0xd0 [ 13.120371] device_add+0x4df/0x840 [ 13.123917] ? complete_all+0x2a/0x40 [ 13.127636] __mdiobus_register+0x98/0x310 [libphy] [ 13.132572] stmmac_mdio_register+0x1c5/0x3f0 [stmmac] [ 13.137771] ? 
stmmac_napi_add+0xa5/0xf0 [stmmac] [ 13.142493] stmmac_dvr_probe+0x806/0xee0 [stmmac] [ 13.147341] intel_eth_pci_probe+0x1cb/0x250 [dwmac_intel] [ 13.152884] pci_device_probe+0xd2/0x150 [ 13.156897] really_probe+0xf7/0x4d0 [ 13.160527] driver_probe_device+0x5d/0x140 [ 13.164761] device_driver_attach+0x4f/0x60 [ 13.168996] __driver_attach+0xa2/0x140 [ 13.172891] ? device_driver_attach+0x60/0x60 [ 13.177300] bus_for_each_dev+0x76/0xc0 [ 13.181188] bus_add_driver+0x189/0x230 [ 13.185083] ? 0xffffffffc0795000 [ 13.188446] driver_register+0x5b/0xf0 [ 13.192249] ? 0xffffffffc0795000 [ 13.195577] do_one_initcall+0x4d/0x210 [ 13.199467] ? kmem_cache_alloc_trace+0x2ff/0x490 [ 13.204228] do_init_module+0x5b/0x21c [ 13.208031] load_module+0x2a0c/0x2de0 [ 13.211838] ? __do_sys_finit_module+0xb1/0x110 [ 13.216420] __do_sys_finit_module+0xb1/0x110 [ 13.220825] do_syscall_64+0x33/0x40 [ 13.224451] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 13.229515] RIP: 0033:0x7fc2b1919ccd [ 13.233113] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 31 0c 00 f7 d8 64 89 01 48 [ 13.251912] RSP: 002b:00007ffcea2e5b98 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 13.259527] RAX: ffffffffffffffda RBX: 0000560558920f10 RCX: 00007fc2b1919ccd [ 13.266706] RDX: 0000000000000000 RSI: 00007fc2b1a881e3 RDI: 0000000000000012 [ 13.273887] RBP: 0000000000020000 R08: 0000000000000000 R09: 0000000000000000 [ 13.281036] R10: 0000000000000012 R11: 0000000000000246 R12: 00007fc2b1a881e3 [ 13.288183] R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffcea2e5d58 [ 13.295389] libphy: mii_bus stmmac-1 failed to register Fixes: 88af9bd4efbd ("stmmac: intel: Add ADL-S 1Gbps PCI IDs") Fixes: 8450e23f142f ("stmmac: intel: Add PCI IDs for TGL-H platform") Signed-off-by: Wong Vee Khee Signed-off-by: David S. 
Miller --- .../net/ethernet/stmicro/stmmac/dwmac-intel.c | 54 ++++++++++++++----- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 751dfdeec41c..f2896872a86c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -446,8 +446,8 @@ static int tgl_common_data(struct pci_dev *pdev, return intel_mgbe_common_data(pdev, plat); } -static int tgl_sgmii_data(struct pci_dev *pdev, - struct plat_stmmacenet_data *plat) +static int tgl_sgmii_phy0_data(struct pci_dev *pdev, + struct plat_stmmacenet_data *plat) { plat->bus_id = 1; plat->phy_interface = PHY_INTERFACE_MODE_SGMII; @@ -456,12 +456,26 @@ static int tgl_sgmii_data(struct pci_dev *pdev, return tgl_common_data(pdev, plat); } -static struct stmmac_pci_info tgl_sgmii1g_info = { - .setup = tgl_sgmii_data, +static struct stmmac_pci_info tgl_sgmii1g_phy0_info = { + .setup = tgl_sgmii_phy0_data, }; -static int adls_sgmii_data(struct pci_dev *pdev, - struct plat_stmmacenet_data *plat) +static int tgl_sgmii_phy1_data(struct pci_dev *pdev, + struct plat_stmmacenet_data *plat) +{ + plat->bus_id = 2; + plat->phy_interface = PHY_INTERFACE_MODE_SGMII; + plat->serdes_powerup = intel_serdes_powerup; + plat->serdes_powerdown = intel_serdes_powerdown; + return tgl_common_data(pdev, plat); +} + +static struct stmmac_pci_info tgl_sgmii1g_phy1_info = { + .setup = tgl_sgmii_phy1_data, +}; + +static int adls_sgmii_phy0_data(struct pci_dev *pdev, + struct plat_stmmacenet_data *plat) { plat->bus_id = 1; plat->phy_interface = PHY_INTERFACE_MODE_SGMII; @@ -471,10 +485,24 @@ static int adls_sgmii_data(struct pci_dev *pdev, return tgl_common_data(pdev, plat); } -static struct stmmac_pci_info adls_sgmii1g_info = { - .setup = adls_sgmii_data, +static struct stmmac_pci_info adls_sgmii1g_phy0_info = { + .setup = adls_sgmii_phy0_data, }; +static int 
adls_sgmii_phy1_data(struct pci_dev *pdev, + struct plat_stmmacenet_data *plat) +{ + plat->bus_id = 2; + plat->phy_interface = PHY_INTERFACE_MODE_SGMII; + + /* SerDes power up and power down are done in BIOS for ADL */ + + return tgl_common_data(pdev, plat); +} + +static struct stmmac_pci_info adls_sgmii1g_phy1_info = { + .setup = adls_sgmii_phy1_data, +}; static const struct stmmac_pci_func_data galileo_stmmac_func_data[] = { { .func = 6, @@ -756,11 +784,11 @@ static const struct pci_device_id intel_eth_pci_id_table[] = { { PCI_DEVICE_DATA(INTEL, EHL_PSE1_RGMII1G_ID, &ehl_pse1_rgmii1g_info) }, { PCI_DEVICE_DATA(INTEL, EHL_PSE1_SGMII1G_ID, &ehl_pse1_sgmii1g_info) }, { PCI_DEVICE_DATA(INTEL, EHL_PSE1_SGMII2G5_ID, &ehl_pse1_sgmii1g_info) }, - { PCI_DEVICE_DATA(INTEL, TGL_SGMII1G_ID, &tgl_sgmii1g_info) }, - { PCI_DEVICE_DATA(INTEL, TGLH_SGMII1G_0_ID, &tgl_sgmii1g_info) }, - { PCI_DEVICE_DATA(INTEL, TGLH_SGMII1G_1_ID, &tgl_sgmii1g_info) }, - { PCI_DEVICE_DATA(INTEL, ADLS_SGMII1G_0_ID, &adls_sgmii1g_info) }, - { PCI_DEVICE_DATA(INTEL, ADLS_SGMII1G_1_ID, &adls_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, TGL_SGMII1G_ID, &tgl_sgmii1g_phy0_info) }, + { PCI_DEVICE_DATA(INTEL, TGLH_SGMII1G_0_ID, &tgl_sgmii1g_phy0_info) }, + { PCI_DEVICE_DATA(INTEL, TGLH_SGMII1G_1_ID, &tgl_sgmii1g_phy1_info) }, + { PCI_DEVICE_DATA(INTEL, ADLS_SGMII1G_0_ID, &adls_sgmii1g_phy0_info) }, + { PCI_DEVICE_DATA(INTEL, ADLS_SGMII1G_1_ID, &adls_sgmii1g_phy1_info) }, {} }; MODULE_DEVICE_TABLE(pci, intel_eth_pci_id_table); From 95b39f07a17faef3a9b225248ba449b976e529c8 Mon Sep 17 00:00:00 2001 From: Biao Huang Date: Tue, 2 Mar 2021 11:33:23 +0800 Subject: [PATCH 083/164] net: ethernet: mtk-star-emac: fix wrong unmap in RX handling mtk_star_dma_unmap_rx() should unmap the dma_addr of old skb rather than that of new skb. Assign new_dma_addr to desc_data.dma_addr after all handling of old skb ends to avoid unexpected receive side error. 
Fixes: f96e9641e92b ("net: ethernet: mtk-star-emac: fix error path in RX handling") Signed-off-by: Biao Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index a8641a407c06..96d2891f1675 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -1225,8 +1225,6 @@ static int mtk_star_receive_packet(struct mtk_star_priv *priv) goto push_new_skb; } - desc_data.dma_addr = new_dma_addr; - /* We can't fail anymore at this point: it's safe to unmap the skb. */ mtk_star_dma_unmap_rx(priv, &desc_data); @@ -1236,6 +1234,9 @@ static int mtk_star_receive_packet(struct mtk_star_priv *priv) desc_data.skb->dev = ndev; netif_receive_skb(desc_data.skb); + /* update dma_addr for new skb */ + desc_data.dma_addr = new_dma_addr; + push_new_skb: desc_data.len = skb_tailroom(new_skb); desc_data.skb = new_skb; From a22549f12767fce49c74c53a853595f82b727935 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 23 Feb 2021 14:00:39 +0000 Subject: [PATCH 084/164] iwlwifi: mvm: add terminate entry for dmi_system_id tables Make sure dmi_system_id tables are NULL terminated. This crashed when LTO was enabled: BUG: KASAN: global-out-of-bounds in dmi_check_system+0x5a/0x70 Read of size 1 at addr ffffffffc16af750 by task NetworkManager/1913 CPU: 4 PID: 1913 Comm: NetworkManager Not tainted 5.12.0-rc1+ #10057 Hardware name: LENOVO 20THCTO1WW/20THCTO1WW, BIOS N2VET27W (1.12 ) 12/21/2020 Call Trace: dump_stack+0x90/0xbe print_address_description.constprop.0+0x1d/0x140 ? dmi_check_system+0x5a/0x70 ? dmi_check_system+0x5a/0x70 kasan_report.cold+0x7b/0xd4 ? dmi_check_system+0x5a/0x70 __asan_load1+0x4d/0x50 dmi_check_system+0x5a/0x70 iwl_mvm_up+0x1360/0x1690 [iwlmvm] ? iwl_mvm_send_recovery_cmd+0x270/0x270 [iwlmvm] ? setup_object.isra.0+0x27/0xd0 ? 
kasan_poison+0x20/0x50 ? ___slab_alloc.constprop.0+0x483/0x5b0 ? mempool_kmalloc+0x17/0x20 ? ftrace_graph_ret_addr+0x2a/0xb0 ? kasan_poison+0x3c/0x50 ? cfg80211_iftype_allowed+0x2e/0x90 [cfg80211] ? __kasan_check_write+0x14/0x20 ? mutex_lock+0x86/0xe0 ? __mutex_lock_slowpath+0x20/0x20 __iwl_mvm_mac_start+0x49/0x290 [iwlmvm] iwl_mvm_mac_start+0x37/0x50 [iwlmvm] drv_start+0x73/0x1b0 [mac80211] ieee80211_do_open+0x53e/0xf10 [mac80211] ? ieee80211_check_concurrent_iface+0x266/0x2e0 [mac80211] ieee80211_open+0xb9/0x100 [mac80211] __dev_open+0x1b8/0x280 Fixes: a2ac0f48a07c ("iwlwifi: mvm: implement approved list for the PPAG feature") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Reviewed-by: Nathan Chancellor Tested-by: Victor Michel Acked-by: Luca Coelho [kvalo@codeaurora.org: improve commit log] Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210223140039.1708534-1-weiyongjun1@huawei.com --- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 15e2773ce7e7..5ee64f7f3c85 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -1083,6 +1083,7 @@ static const struct dmi_system_id dmi_ppag_approved_list[] = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTek COMPUTER INC."), }, }, + {} }; static int iwl_mvm_ppag_init(struct iwl_mvm *mvm) From 436b265671d653787eed9bc716f44882d2a458cb Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 1 Mar 2021 19:16:37 -0600 Subject: [PATCH 085/164] iwlwifi: fix ARCH=i386 compilation warnings An unsigned long variable should rely on '%lu' format strings, not '%zd' Fixes: a1a6a4cf49ece ("iwlwifi: pnvm: implement reading PNVM from UEFI") Signed-off-by: Pierre-Louis Bossart Acked-by: Luca Coelho Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210302011640.1276636-1-pierre-louis.bossart@linux.intel.com --- 
drivers/net/wireless/intel/iwlwifi/fw/pnvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c b/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c index fd070ca5e517..40f2109a097f 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c @@ -271,12 +271,12 @@ static int iwl_pnvm_get_from_efi(struct iwl_trans *trans, err = efivar_entry_get(pnvm_efivar, NULL, &package_size, package); if (err) { IWL_DEBUG_FW(trans, - "PNVM UEFI variable not found %d (len %zd)\n", + "PNVM UEFI variable not found %d (len %lu)\n", err, package_size); goto out; } - IWL_DEBUG_FW(trans, "Read PNVM fro UEFI with size %zd\n", package_size); + IWL_DEBUG_FW(trans, "Read PNVM fro UEFI with size %lu\n", package_size); *data = kmemdup(package->data, *len, GFP_KERNEL); if (!*data) From 295d4cd82b0181dd36b145fd535c13d623d7a335 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 2 Mar 2021 11:34:51 +0100 Subject: [PATCH 086/164] iwlwifi: don't call netif_napi_add() with rxq->lock held (was Re: Lockdep warning in iwl_pcie_rx_handle()) We can't call netif_napi_add() with rxq->lock held, as there is a potential for deadlock as spotted by lockdep (see below). rxq->lock is not protecting anything over the netif_napi_add() codepath anyway, so let's drop it just before calling into NAPI. ======================================================== WARNING: possible irq lock inversion dependency detected 5.12.0-rc1-00002-gbada49429032 #5 Not tainted -------------------------------------------------------- irq/136-iwlwifi/565 just changed the state of lock: ffff89f28433b0b0 (&rxq->lock){+.-.}-{2:2}, at: iwl_pcie_rx_handle+0x7f/0x960 [iwlwifi] but this lock took another, SOFTIRQ-unsafe lock in the past: (napi_hash_lock){+.+.}-{2:2} and interrupts could create inverse lock ordering between them. 
other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(napi_hash_lock); local_irq_disable(); lock(&rxq->lock); lock(napi_hash_lock); lock(&rxq->lock); *** DEADLOCK *** 1 lock held by irq/136-iwlwifi/565: #0: ffff89f2b1440170 (sync_cmd_lockdep_map){+.+.}-{0:0}, at: iwl_pcie_irq_handler+0x5/0xb30 the shortest dependencies between 2nd lock and 1st lock: -> (napi_hash_lock){+.+.}-{2:2} { HARDIRQ-ON-W at: lock_acquire+0x277/0x3d0 _raw_spin_lock+0x2c/0x40 netif_napi_add+0x14b/0x270 e1000_probe+0x2fe/0xee0 [e1000e] local_pci_probe+0x42/0x90 pci_device_probe+0x10b/0x1c0 really_probe+0xef/0x4b0 driver_probe_device+0xde/0x150 device_driver_attach+0x4f/0x60 __driver_attach+0x9c/0x140 bus_for_each_dev+0x79/0xc0 bus_add_driver+0x18d/0x220 driver_register+0x5b/0xf0 do_one_initcall+0x5b/0x300 do_init_module+0x5b/0x21c load_module+0x1dae/0x22c0 __do_sys_finit_module+0xad/0x110 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae SOFTIRQ-ON-W at: lock_acquire+0x277/0x3d0 _raw_spin_lock+0x2c/0x40 netif_napi_add+0x14b/0x270 e1000_probe+0x2fe/0xee0 [e1000e] local_pci_probe+0x42/0x90 pci_device_probe+0x10b/0x1c0 really_probe+0xef/0x4b0 driver_probe_device+0xde/0x150 device_driver_attach+0x4f/0x60 __driver_attach+0x9c/0x140 bus_for_each_dev+0x79/0xc0 bus_add_driver+0x18d/0x220 driver_register+0x5b/0xf0 do_one_initcall+0x5b/0x300 do_init_module+0x5b/0x21c load_module+0x1dae/0x22c0 __do_sys_finit_module+0xad/0x110 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae INITIAL USE at: lock_acquire+0x277/0x3d0 _raw_spin_lock+0x2c/0x40 netif_napi_add+0x14b/0x270 e1000_probe+0x2fe/0xee0 [e1000e] local_pci_probe+0x42/0x90 pci_device_probe+0x10b/0x1c0 really_probe+0xef/0x4b0 driver_probe_device+0xde/0x150 device_driver_attach+0x4f/0x60 __driver_attach+0x9c/0x140 bus_for_each_dev+0x79/0xc0 bus_add_driver+0x18d/0x220 driver_register+0x5b/0xf0 do_one_initcall+0x5b/0x300 do_init_module+0x5b/0x21c load_module+0x1dae/0x22c0 
__do_sys_finit_module+0xad/0x110 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae } ... key at: [] napi_hash_lock+0x18/0x40 ... acquired at: _raw_spin_lock+0x2c/0x40 netif_napi_add+0x14b/0x270 _iwl_pcie_rx_init+0x1f4/0x710 [iwlwifi] iwl_pcie_rx_init+0x1b/0x3b0 [iwlwifi] iwl_trans_pcie_start_fw+0x2ac/0x6a0 [iwlwifi] iwl_mvm_load_ucode_wait_alive+0x116/0x460 [iwlmvm] iwl_run_init_mvm_ucode+0xa4/0x3a0 [iwlmvm] iwl_op_mode_mvm_start+0x9ed/0xbf0 [iwlmvm] _iwl_op_mode_start.isra.4+0x42/0x80 [iwlwifi] iwl_opmode_register+0x71/0xe0 [iwlwifi] iwl_mvm_init+0x34/0x1000 [iwlmvm] do_one_initcall+0x5b/0x300 do_init_module+0x5b/0x21c load_module+0x1dae/0x22c0 __do_sys_finit_module+0xad/0x110 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae [ ... lockdep output trimmed .... ] Fixes: 25edc8f259c7106 ("iwlwifi: pcie: properly implement NAPI") Signed-off-by: Jiri Kosina Acked-by: Luca Coelho Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/nycvar.YFH.7.76.2103021134060.12405@cbobk.fhfr.pm --- drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c index 42426e25cac6..2bec97133119 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c @@ -1129,6 +1129,8 @@ static int _iwl_pcie_rx_init(struct iwl_trans *trans) iwl_pcie_rx_init_rxb_lists(rxq); + spin_unlock_bh(&rxq->lock); + if (!rxq->napi.poll) { int (*poll)(struct napi_struct *, int) = iwl_pcie_napi_poll; @@ -1149,7 +1151,6 @@ static int _iwl_pcie_rx_init(struct iwl_trans *trans) napi_enable(&rxq->napi); } - spin_unlock_bh(&rxq->lock); } /* move the pool to the default queue and allocator ownerships */ From 2378b2c9ecf437b918dff246b81b5b624ec14f80 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 2 Mar 2021 14:21:54 +0300 Subject: [PATCH 087/164] octeontx2-af: cn10k: fix an array overflow 
in is_lmac_valid() The value of "lmac_id" can be controlled by the user and if it is larger than the number of bits in long then it reads outside the bitmap. The highest valid value is less than MAX_LMAC_PER_CGX (4). Fixes: 91c6945ea1f9 ("octeontx2-af: cn10k: Add RPM MAC support") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 9caa375d01b1..68deae529bc9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -56,7 +56,9 @@ static bool is_dev_rpm(void *cgxd) bool is_lmac_valid(struct cgx *cgx, int lmac_id) { - return cgx && test_bit(lmac_id, &cgx->lmac_bmap); + if (!cgx || lmac_id < 0 || lmac_id >= MAX_LMAC_PER_CGX) + return false; + return test_bit(lmac_id, &cgx->lmac_bmap); } struct mac_ops *get_mac_ops(void *cgxd) From 6881b07fdd24850def1f03761c66042b983ff86e Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Tue, 2 Mar 2021 20:47:47 +0100 Subject: [PATCH 088/164] ibmvnic: Fix possibly uninitialized old_num_tx_queues variable warning. GCC 7.5 reports: ../drivers/net/ethernet/ibm/ibmvnic.c: In function 'ibmvnic_reset_init': ../drivers/net/ethernet/ibm/ibmvnic.c:5373:51: warning: 'old_num_tx_queues' may be used uninitialized in this function [-Wmaybe-uninitialized] ../drivers/net/ethernet/ibm/ibmvnic.c:5373:6: warning: 'old_num_rx_queues' may be used uninitialized in this function [-Wmaybe-uninitialized] The variable is initialized only if(reset) and used only if(reset && something) so this is a false positive. However, there is no reason to not initialize the variables unconditionally avoiding the warning. Fixes: 635e442f4a48 ("ibmvnic: merge ibmvnic_reset_init and ibmvnic_init") Signed-off-by: Michal Suchanek Reviewed-by: Sukadev Bhattiprolu Signed-off-by: David S.
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 118a4bd3f877..3bad762083c5 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5219,16 +5219,14 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset) { struct device *dev = &adapter->vdev->dev; unsigned long timeout = msecs_to_jiffies(20000); - u64 old_num_rx_queues, old_num_tx_queues; + u64 old_num_rx_queues = adapter->req_rx_queues; + u64 old_num_tx_queues = adapter->req_tx_queues; int rc; adapter->from_passive_init = false; - if (reset) { - old_num_rx_queues = adapter->req_rx_queues; - old_num_tx_queues = adapter->req_tx_queues; + if (reset) reinit_completion(&adapter->init_done); - } adapter->init_done_rc = 0; rc = ibmvnic_send_crq_init(adapter); From 879c348c35bb5fb758dd881d8a97409c1862dae8 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Wed, 3 Mar 2021 20:38:40 +0530 Subject: [PATCH 089/164] net: stmmac: fix incorrect DMA channel intr enable setting of EQoS v4.10 We introduce dwmac410_dma_init_channel() here for both EQoS v4.10 and above which use different DMA_CH(n)_Interrupt_Enable bit definitions for NIE and AIE. Fixes: 48863ce5940f ("stmmac: add DMA support for GMAC 4.xx") Signed-off-by: Ong Boon Leong Signed-off-by: Ramesh Babu B Signed-off-by: David S. 
Miller --- .../net/ethernet/stmicro/stmmac/dwmac4_dma.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index bb29bfcd62c3..62aa0e95beb7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -124,6 +124,23 @@ static void dwmac4_dma_init_channel(void __iomem *ioaddr, ioaddr + DMA_CHAN_INTR_ENA(chan)); } +static void dwmac410_dma_init_channel(void __iomem *ioaddr, + struct stmmac_dma_cfg *dma_cfg, u32 chan) +{ + u32 value; + + /* common channel control register config */ + value = readl(ioaddr + DMA_CHAN_CONTROL(chan)); + if (dma_cfg->pblx8) + value = value | DMA_BUS_MODE_PBL; + + writel(value, ioaddr + DMA_CHAN_CONTROL(chan)); + + /* Mask interrupts by writing to CSR7 */ + writel(DMA_CHAN_INTR_DEFAULT_MASK_4_10, + ioaddr + DMA_CHAN_INTR_ENA(chan)); +} + static void dwmac4_dma_init(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg, int atds) { @@ -523,7 +540,7 @@ const struct stmmac_dma_ops dwmac4_dma_ops = { const struct stmmac_dma_ops dwmac410_dma_ops = { .reset = dwmac4_dma_reset, .init = dwmac4_dma_init, - .init_chan = dwmac4_dma_init_channel, + .init_chan = dwmac410_dma_init_channel, .init_rx_chan = dwmac4_dma_init_rx_chan, .init_tx_chan = dwmac4_dma_init_tx_chan, .axi = dwmac4_dma_axi, From dbbe7c962c3a8163bf724dbc3c9fdfc9b16d3117 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Mar 2021 18:46:43 -0800 Subject: [PATCH 090/164] docs: networking: drop special stable handling Leave it to Greg. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- Documentation/networking/netdev-FAQ.rst | 72 ++----------------- Documentation/process/stable-kernel-rules.rst | 6 -- Documentation/process/submitting-patches.rst | 5 -- 3 files changed, 6 insertions(+), 77 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index a64c01b52b4c..91b2cf712801 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -142,73 +142,13 @@ Please send incremental versions on top of what has been merged in order to fix the patches the way they would look like if your latest patch series was to be merged. -How can I tell what patches are queued up for backporting to the various stable releases? ------------------------------------------------------------------------------------------ -Normally Greg Kroah-Hartman collects stable commits himself, but for -networking, Dave collects up patches he deems critical for the -networking subsystem, and then hands them off to Greg. - -There is a patchworks queue that you can see here: - - https://patchwork.kernel.org/bundle/netdev/stable/?state=* - -It contains the patches which Dave has selected, but not yet handed off -to Greg. If Greg already has the patch, then it will be here: - - https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git - -A quick way to find whether the patch is in this stable-queue is to -simply clone the repo, and then git grep the mainline commit ID, e.g. -:: - - stable-queue$ git grep -l 284041ef21fdf2e - releases/3.0.84/ipv6-fix-possible-crashes-in-ip6_cork_release.patch - releases/3.4.51/ipv6-fix-possible-crashes-in-ip6_cork_release.patch - releases/3.9.8/ipv6-fix-possible-crashes-in-ip6_cork_release.patch - stable/stable-queue$ - -I see a network patch and I think it should be backported to stable. Should I request it via stable@vger.kernel.org like the references in the kernel's Documentation/process/stable-kernel-rules.rst file say? 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -No, not for networking. Check the stable queues as per above first -to see if it is already queued. If not, then send a mail to netdev, -listing the upstream commit ID and why you think it should be a stable -candidate. - -Before you jump to go do the above, do note that the normal stable rules -in :ref:`Documentation/process/stable-kernel-rules.rst ` -still apply. So you need to explicitly indicate why it is a critical -fix and exactly what users are impacted. In addition, you need to -convince yourself that you *really* think it has been overlooked, -vs. having been considered and rejected. - -Generally speaking, the longer it has had a chance to "soak" in -mainline, the better the odds that it is an OK candidate for stable. So -scrambling to request a commit be added the day after it appears should -be avoided. - -I have created a network patch and I think it should be backported to stable. Should I add a Cc: stable@vger.kernel.org like the references in the kernel's Documentation/ directory say? ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ -No. See above answer. In short, if you think it really belongs in -stable, then ensure you write a decent commit log that describes who -gets impacted by the bug fix and how it manifests itself, and when the -bug was introduced. If you do that properly, then the commit will get -handled appropriately and most likely get put in the patchworks stable -queue if it really warrants it. 
- -If you think there is some valid information relating to it being in -stable that does *not* belong in the commit log, then use the three dash -marker line as described in -:ref:`Documentation/process/submitting-patches.rst ` -to temporarily embed that information into the patch that you send. - -Are all networking bug fixes backported to all stable releases? +Are there special rules regarding stable submissions on netdev? --------------------------------------------------------------- -Due to capacity, Dave could only take care of the backports for the -last two stable releases. For earlier stable releases, each stable -branch maintainer is supposed to take care of them. If you find any -patch is missing from an earlier stable branch, please notify -stable@vger.kernel.org with either a commit ID or a formal patch -backported, and CC Dave and other relevant networking developers. +While it used to be the case that netdev submissions were not supposed +to carry explicit ``CC: stable@vger.kernel.org`` tags that is no longer +the case today. Please follow the standard stable rules in +:ref:`Documentation/process/stable-kernel-rules.rst `, +and make sure you include appropriate Fixes tags! Is the comment style convention different for the networking content? 
--------------------------------------------------------------------- diff --git a/Documentation/process/stable-kernel-rules.rst b/Documentation/process/stable-kernel-rules.rst index 3973556250e1..003c865e9c21 100644 --- a/Documentation/process/stable-kernel-rules.rst +++ b/Documentation/process/stable-kernel-rules.rst @@ -35,12 +35,6 @@ Rules on what kind of patches are accepted, and which ones are not, into the Procedure for submitting patches to the -stable tree ---------------------------------------------------- - - If the patch covers files in net/ or drivers/net please follow netdev stable - submission guidelines as described in - :ref:`Documentation/networking/netdev-FAQ.rst ` - after first checking the stable networking queue at - https://patchwork.kernel.org/bundle/netdev/stable/?state=* - to ensure the requested patch is not already queued up. - Security patches should not be handled (solely) by the -stable review process but should follow the procedures in :ref:`Documentation/admin-guide/security-bugs.rst `. diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index 8c991c863628..91de63b201c1 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -250,11 +250,6 @@ should also read :ref:`Documentation/process/stable-kernel-rules.rst ` in addition to this file. -Note, however, that some subsystem maintainers want to come to their own -conclusions on which patches should go to the stable trees. The networking -maintainer, in particular, would rather not see individual developers -adding lines like the above to their patches. 
- If changes affect userland-kernel interfaces, please send the MAN-PAGES maintainer (as listed in the MAINTAINERS file) a man-pages patch, or at least a notification of the change, so that some information makes its way From b12422362ce947098ac420ac3c975fc006af4c02 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 3 Mar 2021 11:55:49 -0800 Subject: [PATCH 091/164] net: macb: Add default usrio config to default gem config There is no usrio config defined for default gem config leading to a kernel panic on devices that don't define data. This issue can be reproduced with microchip polar fire soc where compatible string is defined as "cdns,macb". Fixes: edac63861db7 ("add userio bits as platform configuration") Signed-off-by: Atish Patra Acked-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 472bf8f220bc..15362d016a87 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3954,6 +3954,13 @@ static int macb_init(struct platform_device *pdev) return 0; } +static const struct macb_usrio_config macb_default_usrio = { + .mii = MACB_BIT(MII), + .rmii = MACB_BIT(RMII), + .rgmii = GEM_BIT(RGMII), + .refclk = MACB_BIT(CLKEN), +}; + #if defined(CONFIG_OF) /* 1518 rounded up */ #define AT91ETHER_MAX_RBUFF_SZ 0x600 @@ -4439,13 +4446,6 @@ static int fu540_c000_init(struct platform_device *pdev) return macb_init(pdev); } -static const struct macb_usrio_config macb_default_usrio = { - .mii = MACB_BIT(MII), - .rmii = MACB_BIT(RMII), - .rgmii = GEM_BIT(RGMII), - .refclk = MACB_BIT(CLKEN), -}; - static const struct macb_usrio_config sama7g5_usrio = { .mii = 0, .rmii = 1, @@ -4594,6 +4594,7 @@ static const struct macb_config default_gem_config = { .dma_burst_length = 16, .clk_init = macb_clk_init, .init = macb_init, +
.usrio = &macb_default_usrio, .jumbo_max_len = 10240, }; From 3e59e8856758eb5a2dfe1f831ef53b168fd58105 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 3 Mar 2021 16:50:49 +0100 Subject: [PATCH 092/164] net: l2tp: reduce log level of messages in receive path, add counter instead Commit 5ee759cda51b ("l2tp: use standard API for warning log messages") changed a number of warnings about invalid packets in the receive path so that they are always shown, instead of only when a special L2TP debug flag is set. Even with rate limiting these warnings can easily cause significant log spam - potentially triggered by a malicious party sending invalid packets on purpose. In addition these warnings were noticed by projects like Tunneldigger [1], which uses L2TP for its data path, but implements its own control protocol (which is sufficiently different from L2TP data packets that it would always be passed up to userspace even with future extensions of L2TP). Some of the warnings were already redundant, as l2tp_stats has a counter for these packets. This commit adds one additional counter for invalid packets that are passed up to userspace. Packets with unknown session are not counted as invalid, as there is nothing wrong with the format of these packets. With the additional counter, all of these messages are either redundant or benign, so we reduce them to pr_debug_ratelimited(). [1] https://github.com/wlanslovenija/tunneldigger/issues/160 Fixes: 5ee759cda51b ("l2tp: use standard API for warning log messages") Signed-off-by: Matthias Schiffer Signed-off-by: David S. 
Miller --- include/uapi/linux/l2tp.h | 1 + net/l2tp/l2tp_core.c | 41 +++++++++++++++++++++------------------ net/l2tp/l2tp_core.h | 1 + net/l2tp/l2tp_netlink.c | 6 ++++++ 4 files changed, 30 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 30c80d5ba4bf..bab8c9708611 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -145,6 +145,7 @@ enum { L2TP_ATTR_RX_ERRORS, /* u64 */ L2TP_ATTR_STATS_PAD, L2TP_ATTR_RX_COOKIE_DISCARDS, /* u64 */ + L2TP_ATTR_RX_INVALID, /* u64 */ __L2TP_ATTR_STATS_MAX, }; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 7be5103ff2a8..203890e378cb 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -649,9 +649,9 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, /* Parse and check optional cookie */ if (session->peer_cookie_len > 0) { if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) { - pr_warn_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n", - tunnel->name, tunnel->tunnel_id, - session->session_id); + pr_debug_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n", + tunnel->name, tunnel->tunnel_id, + session->session_id); atomic_long_inc(&session->stats.rx_cookie_discards); goto discard; } @@ -702,8 +702,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * If user has configured mandatory sequence numbers, discard. */ if (session->recv_seq) { - pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", - session->name); + pr_debug_ratelimited("%s: recv data has no seq numbers when required. 
Discarding.\n", + session->name); atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } @@ -718,8 +718,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, session->send_seq = 0; l2tp_session_set_header_len(session, tunnel->version); } else if (session->send_seq) { - pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", - session->name); + pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", + session->name); atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } @@ -809,9 +809,9 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Short packet? */ if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) { - pr_warn_ratelimited("%s: recv short packet (len=%d)\n", - tunnel->name, skb->len); - goto error; + pr_debug_ratelimited("%s: recv short packet (len=%d)\n", + tunnel->name, skb->len); + goto invalid; } /* Point to L2TP header */ @@ -824,9 +824,9 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Check protocol version */ version = hdrflags & L2TP_HDR_VER_MASK; if (version != tunnel->version) { - pr_warn_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n", - tunnel->name, version, tunnel->version); - goto error; + pr_debug_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n", + tunnel->name, version, tunnel->version); + goto invalid; } /* Get length of L2TP packet */ @@ -834,7 +834,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* If type is control packet, it is handled by userspace. */ if (hdrflags & L2TP_HDRFLAG_T) - goto error; + goto pass; /* Skip flags */ ptr += 2; @@ -863,21 +863,24 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) l2tp_session_dec_refcount(session); /* Not found? Pass to userspace to deal with */ - pr_warn_ratelimited("%s: no session found (%u/%u). 
Passing up.\n", - tunnel->name, tunnel_id, session_id); - goto error; + pr_debug_ratelimited("%s: no session found (%u/%u). Passing up.\n", + tunnel->name, tunnel_id, session_id); + goto pass; } if (tunnel->version == L2TP_HDR_VER_3 && l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) - goto error; + goto invalid; l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); return 0; -error: +invalid: + atomic_long_inc(&tunnel->stats.rx_invalid); + +pass: /* Put UDP header back */ __skb_push(skb, sizeof(struct udphdr)); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index cb21d906343e..98ea98eb9567 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -39,6 +39,7 @@ struct l2tp_stats { atomic_long_t rx_oos_packets; atomic_long_t rx_errors; atomic_long_t rx_cookie_discards; + atomic_long_t rx_invalid; }; struct l2tp_tunnel; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 83956c9ee1fc..96eb91be9238 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -428,6 +428,9 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, atomic_long_read(&tunnel->stats.rx_errors), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID, + atomic_long_read(&tunnel->stats.rx_invalid), L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); @@ -771,6 +774,9 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, atomic_long_read(&session->stats.rx_errors), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID, + atomic_long_read(&session->stats.rx_invalid), L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); From 4b5dc1a94d4f92b5845e98bd9ae344b26d933aad Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Wed, 3 Mar 2021 16:39:47 +0800 
Subject: [PATCH 093/164] Revert "r8152: adjust the settings about MAC clock speed down for RTL8153" This reverts commit 134f98bcf1b898fb9d6f2b91bc85dd2e5478b4b8. The r8153_mac_clk_spd() is used for RTL8153A only, because the register table of RTL8153B is different from RTL8153A. However, this function would be called when RTL8153B calls r8153_first_init() and r8153_enter_oob(). That causes RTL8153B to become unstable when suspending and resuming. In the worst case the device may stop working. Besides, revert this commit to disable MAC clock speed down for RTL8153A. It would avoid the known issue when enabling U1. The data of the first control transfer may be wrong when exiting U1. Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index b246817f3405..90f1c0200042 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3021,29 +3021,6 @@ static void __rtl_set_wol(struct r8152 *tp, u32 wolopts) device_set_wakeup_enable(&tp->udev->dev, false); } -static void r8153_mac_clk_spd(struct r8152 *tp, bool enable) -{ - /* MAC clock speed down */ - if (enable) { - ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL, - ALDPS_SPDWN_RATIO); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL2, - EEE_SPDWN_RATIO); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, - PKT_AVAIL_SPDWN_EN | SUSPEND_SPDWN_EN | - U1U2_SPDWN_EN | L1_SPDWN_EN); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL4, - PWRSAVE_SPDWN_EN | RXDV_SPDWN_EN | TX10MIDLE_EN | - TP100_SPDWN_EN | TP500_SPDWN_EN | EEE_SPDWN_EN | - TP1000_SPDWN_EN); - } else { - ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL, 0); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL2, 0); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, 0); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL4, 0); - } -} - static void
r8153_u1u2en(struct r8152 *tp, bool enable) { u8 u1u2[8]; @@ -3338,11 +3315,9 @@ static void rtl8153_runtime_enable(struct r8152 *tp, bool enable) if (enable) { r8153_u1u2en(tp, false); r8153_u2p3en(tp, false); - r8153_mac_clk_spd(tp, true); rtl_runtime_suspend_enable(tp, true); } else { rtl_runtime_suspend_enable(tp, false); - r8153_mac_clk_spd(tp, false); switch (tp->version) { case RTL_VER_03: @@ -4718,7 +4693,6 @@ static void r8153_first_init(struct r8152 *tp) { u32 ocp_data; - r8153_mac_clk_spd(tp, false); rxdy_gated_en(tp, true); r8153_teredo_off(tp); @@ -4769,8 +4743,6 @@ static void r8153_enter_oob(struct r8152 *tp) { u32 ocp_data; - r8153_mac_clk_spd(tp, true); - ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, PLA_OOB_CTRL); ocp_data &= ~NOW_IS_OOB; ocp_write_byte(tp, MCU_TYPE_PLA, PLA_OOB_CTRL, ocp_data); @@ -5496,10 +5468,15 @@ static void r8153_init(struct r8152 *tp) ocp_write_word(tp, MCU_TYPE_USB, USB_CONNECT_TIMER, 0x0001); + /* MAC clock speed down */ + ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL, 0); + ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL2, 0); + ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, 0); + ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL4, 0); + r8153_power_cut_en(tp, false); rtl_runtime_suspend_enable(tp, false); r8153_u1u2en(tp, true); - r8153_mac_clk_spd(tp, false); usb_enable_lpm(tp->udev); ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, PLA_CONFIG6); From d65614a01d24704b016635abf5cc028a54e45a62 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 2 Mar 2021 17:19:32 +0800 Subject: [PATCH 094/164] net: 9p: advance iov on empty read I met below warning when cating a small size(about 80bytes) txt file on 9pfs(msize=2097152 is passed to 9p mount option), the reason is we miss iov_iter_advance() if the read count is 0 for zerocopy case, so we didn't truncate the pipe, then iov_iter_pipe() thinks the pipe is full. 
Fix it by removing the exception for 0 to ensure to call iov_iter_advance() even on empty read for zerocopy case. [ 8.279568] WARNING: CPU: 0 PID: 39 at lib/iov_iter.c:1203 iov_iter_pipe+0x31/0x40 [ 8.280028] Modules linked in: [ 8.280561] CPU: 0 PID: 39 Comm: cat Not tainted 5.11.0+ #6 [ 8.281260] RIP: 0010:iov_iter_pipe+0x31/0x40 [ 8.281974] Code: 2b 42 54 39 42 5c 76 22 c7 07 20 00 00 00 48 89 57 18 8b 42 50 48 c7 47 08 b [ 8.283169] RSP: 0018:ffff888000cbbd80 EFLAGS: 00000246 [ 8.283512] RAX: 0000000000000010 RBX: ffff888000117d00 RCX: 0000000000000000 [ 8.283876] RDX: ffff88800031d600 RSI: 0000000000000000 RDI: ffff888000cbbd90 [ 8.284244] RBP: ffff888000cbbe38 R08: 0000000000000000 R09: ffff8880008d2058 [ 8.284605] R10: 0000000000000002 R11: ffff888000375510 R12: 0000000000000050 [ 8.284964] R13: ffff888000cbbe80 R14: 0000000000000050 R15: ffff88800031d600 [ 8.285439] FS: 00007f24fd8af600(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000 [ 8.285844] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8.286150] CR2: 00007f24fd7d7b90 CR3: 0000000000c97000 CR4: 00000000000406b0 [ 8.286710] Call Trace: [ 8.288279] generic_file_splice_read+0x31/0x1a0 [ 8.289273] ? do_splice_to+0x2f/0x90 [ 8.289511] splice_direct_to_actor+0xcc/0x220 [ 8.289788] ? 
pipe_to_sendpage+0xa0/0xa0 [ 8.290052] do_splice_direct+0x8b/0xd0 [ 8.290314] do_sendfile+0x1ad/0x470 [ 8.290576] do_syscall_64+0x2d/0x40 [ 8.290818] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 8.291409] RIP: 0033:0x7f24fd7dca0a [ 8.292511] Code: c3 0f 1f 80 00 00 00 00 4c 89 d2 4c 89 c6 e9 bd fd ff ff 0f 1f 44 00 00 31 8 [ 8.293360] RSP: 002b:00007ffc20932818 EFLAGS: 00000206 ORIG_RAX: 0000000000000028 [ 8.293800] RAX: ffffffffffffffda RBX: 0000000001000000 RCX: 00007f24fd7dca0a [ 8.294153] RDX: 0000000000000000 RSI: 0000000000000003 RDI: 0000000000000001 [ 8.294504] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 [ 8.294867] R10: 0000000001000000 R11: 0000000000000206 R12: 0000000000000003 [ 8.295217] R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000 [ 8.295782] ---[ end trace 63317af81b3ca24b ]--- Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- net/9p/client.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 4f62f299da0c..0a9019da18f3 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1623,10 +1623,6 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, } p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); - if (!count) { - p9_tag_remove(clnt, req); - return 0; - } if (non_zc) { int n = copy_to_iter(dataptr, count, to); From a9ecb0cbf03746b17a7c13bd8e3464e6789f73e8 Mon Sep 17 00:00:00 2001 From: zhang kai Date: Tue, 2 Mar 2021 18:16:07 +0800 Subject: [PATCH 095/164] rtnetlink: using dev_base_seq from target net Signed-off-by: zhang kai Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0edc0b2baaa4..1bdcb33fb561 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2147,7 +2147,7 @@ out: out_err: cb->args[1] = idx; cb->args[0] = h; - cb->seq = net->dev_base_seq; + cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) put_net(tgt_net); From 2888b080d05c819205bbfe52c624a639f44c266a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 3 Mar 2021 23:58:27 +0100 Subject: [PATCH 096/164] netfilter: nftables: fix possible double hook unregistration with table owner Skip hook unregistration of owner tables from the netns exit path, nft_rcv_nl_event() unregisters the table hooks before tearing down the table content. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b07703e19108..796ce86ef7eb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9028,8 +9028,12 @@ static void __nft_release_hooks(struct net *net) { struct nft_table *table; - list_for_each_entry(table, &net->nft.tables, list) + list_for_each_entry(table, &net->nft.tables, list) { + if (nft_table_has_owner(table)) + continue; + __nft_release_hook(net, table); + } } static void __nft_release_table(struct net *net, struct nft_table *table) From bd1777b3a88f98e223392221b330668458aac7f1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 4 Mar 2021 04:00:09 +0100 Subject: [PATCH 097/164] netfilter: nftables: bogus check for netlink portID with table owner The existing branch checks for 0 != table->nlpid which always evaluates true for tables that have an owner. 
Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 796ce86ef7eb..224c8e537cb3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9083,13 +9083,12 @@ static void __nft_release_table(struct net *net, struct nft_table *table) nf_tables_table_destroy(&ctx); } -static void __nft_release_tables(struct net *net, u32 nlpid) +static void __nft_release_tables(struct net *net) { struct nft_table *table, *nt; list_for_each_entry_safe(table, nt, &net->nft.tables, list) { - if (nft_table_has_owner(table) && - nlpid != table->nlpid) + if (nft_table_has_owner(table)) continue; __nft_release_table(net, table); @@ -9155,7 +9154,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); - __nft_release_tables(net, 0); + __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); WARN_ON_ONCE(!list_empty(&net->nft.module_list)); From c95c34f01bbda4421c25fdc9b04a4a4aab10d36c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 3 Mar 2021 19:56:34 +0100 Subject: [PATCH 098/164] xsk: Remove dangling function declaration from header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xdp_umem_query() is dead for a long time, drop the declaration from include/linux/netdevice.h Fixes: c9b47cc1fabc ("xsk: fix bug when trying to use both copy and zero-copy on one queue id") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210303185636.18070-2-maciej.fijalkowski@intel.com --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) diff 
--git a/include/linux/netdevice.h b/include/linux/netdevice.h index f06fbee8638e..5b67ea89d5f2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3959,8 +3959,6 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); -int xdp_umem_query(struct net_device *dev, u16 queue_id); - int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb); From 6bc6699881012b5bd5d49fa861a69a37fc01b49c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 3 Mar 2021 19:56:35 +0100 Subject: [PATCH 099/164] samples, bpf: Add missing munmap in xdpsock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We mmap the umem region, but we never munmap it. Add the missing call at the end of the cleanup. 
Fixes: 3945b37a975d ("samples/bpf: use hugepages in xdpsock app") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210303185636.18070-3-maciej.fijalkowski@intel.com --- samples/bpf/xdpsock_user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index db0cb73513a5..1e2a1105d0e6 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1699,5 +1699,7 @@ int main(int argc, char **argv) xdpsock_cleanup(); + munmap(bufs, NUM_FRAMES * opt_xsk_frame_size); + return 0; } From 2b2aedabc44e9660f90ccf7ba1ca2706d75f411f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 3 Mar 2021 19:56:36 +0100 Subject: [PATCH 100/164] libbpf: Clear map_info before each bpf_obj_get_info_by_fd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xsk_lookup_bpf_maps, based on prog_fd, looks whether current prog has a reference to XSKMAP. BPF prog can include insns that work on various BPF maps and this is covered by iterating through map_ids. The bpf_map_info that is passed to bpf_obj_get_info_by_fd for filling needs to be cleared at each iteration, so that it doesn't contain any outdated fields and that is currently missing in the function of interest. To fix that, zero-init map_info via memset before each bpf_obj_get_info_by_fd call. Also, since the area of this code is touched, in general strcmp is considered harmful, so let's convert it to strncmp and provide the size of the array name for current map_info. While at it, do s/continue/break/ once we have found the xsks_map to terminate the search. 
Fixes: 5750902a6e9b ("libbpf: proper XSKMAP cleanup") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210303185636.18070-4-maciej.fijalkowski@intel.com --- tools/lib/bpf/xsk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index ffbb588724d8..526fc35c0b23 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -610,15 +610,16 @@ static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) if (fd < 0) continue; + memset(&map_info, 0, map_len); err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); if (err) { close(fd); continue; } - if (!strcmp(map_info.name, "xsks_map")) { + if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) { ctx->xsks_map_fd = fd; - continue; + break; } close(fd); From 83a2881903f3d5bc08ded4fb04f6e3bedb1fba65 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 1 Mar 2021 16:40:19 +0100 Subject: [PATCH 101/164] bpf: Account for BPF_FETCH in insn_has_def32() insn_has_def32() returns false for 32-bit BPF_FETCH insns. This makes adjust_insn_aux_data() incorrectly set zext_dst, as can be seen in [1]. This happens because insn_no_def() does not know about the BPF_FETCH variants of BPF_STX. Fix in two steps. First, replace insn_no_def() with insn_def_regno(), which returns the register an insn defines. Normally insn_no_def() calls are followed by insn->dst_reg uses; replace those with the insn_def_regno() return value. Second, adjust the BPF_STX special case in is_reg64() to deal with queries made from opt_subreg_zext_lo32_rnd_hi32(), where the state information is no longer available. Add a comment, since the purpose of this special case is not clear at first glance. 
[1] https://lore.kernel.org/bpf/20210223150845.1857620-1-jackmanb@google.com/ Fixes: 5ffa25502b5a ("bpf: Add instructions for atomic_[cmp]xchg") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Brendan Jackman Link: https://lore.kernel.org/bpf/20210301154019.129110-1-iii@linux.ibm.com --- kernel/bpf/verifier.c | 70 ++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3d34ba492d46..bb3eaab934f3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1703,7 +1703,11 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - if (reg->type != SCALAR_VALUE) + /* BPF_STX (including atomic variants) has multiple source + * operands, one of which is a ptr. Check whether the caller is + * asking about it. + */ + if (t == SRC_OP && reg->type != SCALAR_VALUE) return true; return BPF_SIZE(code) == BPF_DW; } @@ -1735,22 +1739,38 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, return true; } -/* Return TRUE if INSN doesn't have explicit value define. */ -static bool insn_no_def(struct bpf_insn *insn) +/* Return the regno defined by the insn, or -1. */ +static int insn_def_regno(const struct bpf_insn *insn) { - u8 class = BPF_CLASS(insn->code); - - return (class == BPF_JMP || class == BPF_JMP32 || - class == BPF_STX || class == BPF_ST); + switch (BPF_CLASS(insn->code)) { + case BPF_JMP: + case BPF_JMP32: + case BPF_ST: + return -1; + case BPF_STX: + if (BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm & BPF_FETCH)) { + if (insn->imm == BPF_CMPXCHG) + return BPF_REG_0; + else + return insn->src_reg; + } else { + return -1; + } + default: + return insn->dst_reg; + } } /* Return TRUE if INSN has defined any 32-bit value explicitly. 
*/ static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) { - if (insn_no_def(insn)) + int dst_reg = insn_def_regno(insn); + + if (dst_reg == -1) return false; - return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); + return !is_reg64(env, insn, dst_reg, NULL, DST_OP); } static void mark_insn_zext(struct bpf_verifier_env *env, @@ -11006,9 +11026,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; - u8 load_reg; + int load_reg; insn = insns[adj_idx]; + load_reg = insn_def_regno(&insn); if (!aux[adj_idx].zext_dst) { u8 code, class; u32 imm_rnd; @@ -11018,14 +11039,14 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, code = insn.code; class = BPF_CLASS(code); - if (insn_no_def(&insn)) + if (load_reg == -1) continue; /* NOTE: arg "reg" (the fourth one) is only used for - * BPF_STX which has been ruled out in above - * check, it is safe to pass NULL here. + * BPF_STX + SRC_OP, so it is safe to pass NULL + * here. */ - if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { + if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) { if (class == BPF_LD && BPF_MODE(code) == BPF_IMM) i++; @@ -11040,7 +11061,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, imm_rnd = get_random_int(); rnd_hi32_patch[0] = insn; rnd_hi32_patch[1].imm = imm_rnd; - rnd_hi32_patch[3].dst_reg = insn.dst_reg; + rnd_hi32_patch[3].dst_reg = load_reg; patch = rnd_hi32_patch; patch_len = 4; goto apply_patch_buffer; @@ -11049,22 +11070,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext()) continue; - /* zext_dst means that we want to zero-extend whatever register - * the insn defines, which is dst_reg most of the time, with - * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. 
- */ - if (BPF_CLASS(insn.code) == BPF_STX && - BPF_MODE(insn.code) == BPF_ATOMIC) { - /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not - * define any registers, therefore zext_dst cannot be - * set. - */ - if (WARN_ON(!(insn.imm & BPF_FETCH))) - return -EINVAL; - load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 - : insn.src_reg; - } else { - load_reg = insn.dst_reg; + if (WARN_ON(load_reg == -1)) { + verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); + return -EFAULT; } zext_patch[0] = insn; From d785e1fec60179f534fbe8d006c890e5ad186e51 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Wed, 14 Oct 2020 16:17:48 +0200 Subject: [PATCH 102/164] ixgbe: fail to create xfrm offload of IPsec tunnel mode SA Based on talks and indirect references ixgbe IPsec offload does not support IPsec tunnel mode offload. It can only support IPsec transport mode offload. Now explicitly fail when creating non transport mode SA with offload to avoid false performance expectations. Fixes: 63a67fe229ea ("ixgbe: add ipsec offload add and remove SA") Signed-off-by: Antony Antony Acked-by: Shannon Nelson Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 5 +++++ drivers/net/ethernet/intel/ixgbevf/ipsec.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index eca73526ac86..54d47265a7ac 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -575,6 +575,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs) return -EINVAL; } + if (xs->props.mode != XFRM_MODE_TRANSPORT) { + netdev_err(dev, "Unsupported mode for ipsec offload\n"); + return -EINVAL; + } + if (ixgbe_ipsec_check_mgmt_ip(xs)) { netdev_err(dev, "IPsec IP addr clash with mgmt filters\n"); return -EINVAL; diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c 
b/drivers/net/ethernet/intel/ixgbevf/ipsec.c index 5170dd9d8705..caaea2c920a6 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c +++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c @@ -272,6 +272,11 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs) return -EINVAL; } + if (xs->props.mode != XFRM_MODE_TRANSPORT) { + netdev_err(dev, "Unsupported mode for ipsec offload\n"); + return -EINVAL; + } + if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) { struct rx_sa rsa; From 7a766381634da19fc837619b0a34590498d9d29a Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sun, 3 Jan 2021 16:08:42 +0800 Subject: [PATCH 103/164] ixgbe: Fix memleak in ixgbe_configure_clsu32 When ixgbe_fdir_write_perfect_filter_82599() fails, input allocated by kzalloc() has not been freed, which leads to memleak. Signed-off-by: Dinghao Liu Reviewed-by: Paul Menzel Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index fae84202d870..9f3f12e2ccf2 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9565,8 +9565,10 @@ static int ixgbe_configure_clsu32(struct ixgbe_adapter *adapter, ixgbe_atr_compute_perfect_hash_82599(&input->filter, mask); err = ixgbe_fdir_write_perfect_filter_82599(hw, &input->filter, input->sw_idx, queue); - if (!err) - ixgbe_update_ethtool_fdir_entry(adapter, input, input->sw_idx); + if (err) + goto err_out_w_lock; + + ixgbe_update_ethtool_fdir_entry(adapter, input, input->sw_idx); spin_unlock(&adapter->fdir_perfect_lock); if ((uhtid != 0x800) && (adapter->jump_tables[uhtid])) From d93ef301644ee82925bce1d57fdfe70475dc0bae Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Wed, 3 Mar 2021 21:55:49 -0800 Subject: [PATCH 104/164] net: sctp: trivial: fix typo in comment Fix typo of 'overflow' for comment in 
sctp_tsnmap_check(). Reported-by: Gustavo A. R. Silva Signed-off-by: Drew Fustini Signed-off-by: David S. Miller --- net/sctp/tsnmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c index a9c6af5795d8..5ba456727f63 100644 --- a/net/sctp/tsnmap.c +++ b/net/sctp/tsnmap.c @@ -75,7 +75,7 @@ int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn) return 1; /* Verify that we can hold this TSN and that it will not - * overlfow our map + * overflow our map */ if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) return -1; From 76c03bf8e2624076b88d93542d78e22d5345c88e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 4 Mar 2021 10:57:53 +0200 Subject: [PATCH 105/164] nexthop: Do not flush blackhole nexthops when loopback goes down As far as user space is concerned, blackhole nexthops do not have a nexthop device and therefore should not be affected by the administrative or carrier state of any netdev. However, when the loopback netdev goes down all the blackhole nexthops are flushed. This happens because internally the kernel associates blackhole nexthops with the loopback netdev. This behavior is both confusing to those not familiar with kernel internals and also diverges from the legacy API where blackhole IPv4 routes are not flushed when the loopback netdev goes down: # ip route add blackhole 198.51.100.0/24 # ip link set dev lo down # ip route show 198.51.100.0/24 blackhole 198.51.100.0/24 Blackhole IPv6 routes are flushed, but at least user space knows that they are associated with the loopback netdev: # ip -6 route show 2001:db8:1::/64 blackhole 2001:db8:1::/64 dev lo metric 1024 pref medium Fix this by only flushing blackhole nexthops when the loopback netdev is unregistered. Fixes: ab84be7e54fc ("net: Initial nexthop code") Signed-off-by: Ido Schimmel Reported-by: Donald Sharp Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/nexthop.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f1c6cbdb9e43..743777bce179 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1399,7 +1399,7 @@ out: /* rtnl */ /* remove all nexthops tied to a device being deleted */ -static void nexthop_flush_dev(struct net_device *dev) +static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); @@ -1411,6 +1411,10 @@ static void nexthop_flush_dev(struct net_device *dev) if (nhi->fib_nhc.nhc_dev != dev) continue; + if (nhi->reject_nh && + (event == NETDEV_DOWN || event == NETDEV_CHANGE)) + continue; + remove_nexthop(net, nhi->nh_parent, NULL); } } @@ -2189,11 +2193,11 @@ static int nh_netdev_event(struct notifier_block *this, switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: - nexthop_flush_dev(dev); + nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) - nexthop_flush_dev(dev); + nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; From 3a1099d3147f391fa11320a759bbcb1bb857fca1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 4 Mar 2021 10:57:54 +0200 Subject: [PATCH 106/164] selftests: fib_nexthops: Test blackhole nexthops when loopback goes down Test that blackhole nexthops are not flushed when the loopback device goes down. 
Output without previous patch: # ./fib_nexthops.sh -t basic Basic functional tests ---------------------- TEST: List with nothing defined [ OK ] TEST: Nexthop get on non-existent id [ OK ] TEST: Nexthop with no device or gateway [ OK ] TEST: Nexthop with down device [ OK ] TEST: Nexthop with device that is linkdown [ OK ] TEST: Nexthop with device only [ OK ] TEST: Nexthop with duplicate id [ OK ] TEST: Blackhole nexthop [ OK ] TEST: Blackhole nexthop with other attributes [ OK ] TEST: Blackhole nexthop with loopback device down [FAIL] TEST: Create group [ OK ] TEST: Create group with blackhole nexthop [FAIL] TEST: Create multipath group where 1 path is a blackhole [ OK ] TEST: Multipath group can not have a member replaced by blackhole [ OK ] TEST: Create group with non-existent nexthop [ OK ] TEST: Create group with same nexthop multiple times [ OK ] TEST: Replace nexthop with nexthop group [ OK ] TEST: Replace nexthop group with nexthop [ OK ] TEST: Nexthop group and device [ OK ] TEST: Test proto flush [ OK ] TEST: Nexthop group and blackhole [ OK ] Tests passed: 19 Tests failed: 2 Output with previous patch: # ./fib_nexthops.sh -t basic Basic functional tests ---------------------- TEST: List with nothing defined [ OK ] TEST: Nexthop get on non-existent id [ OK ] TEST: Nexthop with no device or gateway [ OK ] TEST: Nexthop with down device [ OK ] TEST: Nexthop with device that is linkdown [ OK ] TEST: Nexthop with device only [ OK ] TEST: Nexthop with duplicate id [ OK ] TEST: Blackhole nexthop [ OK ] TEST: Blackhole nexthop with other attributes [ OK ] TEST: Blackhole nexthop with loopback device down [ OK ] TEST: Create group [ OK ] TEST: Create group with blackhole nexthop [ OK ] TEST: Create multipath group where 1 path is a blackhole [ OK ] TEST: Multipath group can not have a member replaced by blackhole [ OK ] TEST: Create group with non-existent nexthop [ OK ] TEST: Create group with same nexthop multiple times [ OK ] TEST: Replace nexthop with nexthop 
group [ OK ] TEST: Replace nexthop group with nexthop [ OK ] TEST: Nexthop group and device [ OK ] TEST: Test proto flush [ OK ] TEST: Nexthop group and blackhole [ OK ] Tests passed: 21 Tests failed: 0 Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 4c7d33618437..d98fb85e201c 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1524,6 +1524,14 @@ basic() run_cmd "$IP nexthop replace id 2 blackhole dev veth1" log_test $? 2 "Blackhole nexthop with other attributes" + # blackhole nexthop should not be affected by the state of the loopback + # device + run_cmd "$IP link set dev lo down" + check_nexthop "id 2" "id 2 blackhole" + log_test $? 0 "Blackhole nexthop with loopback device down" + + run_cmd "$IP link set dev lo up" + # # groups # From f1becbed411c6fa29d7ce3def3a1dcd4f63f2d74 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Mar 2021 12:29:43 +0200 Subject: [PATCH 107/164] net: mscc: ocelot: properly reject destination IP keys in VCAP IS1 An attempt is made to warn the user about the fact that VCAP IS1 cannot offload keys matching on destination IP (at least given the current half key format), but sadly that warning fails miserably in practice, due to the fact that it operates on an uninitialized "match" variable. We must first decode the keys from the flow rule. Fixes: 75944fda1dfe ("net: mscc: ocelot: offload ingress skbedit and vlan actions to VCAP IS1") Reported-by: Colin Ian King Signed-off-by: Vladimir Oltean Reviewed-by: Colin Ian King Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mscc/ocelot_flower.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index c3ac026f6aea..a41b458b1b3e 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -540,13 +540,14 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, return -EOPNOTSUPP; } + flow_rule_match_ipv4_addrs(rule, &match); + if (filter->block_id == VCAP_IS1 && *(u32 *)&match.mask->dst) { NL_SET_ERR_MSG_MOD(extack, "Key type S1_NORMAL cannot match on destination IP"); return -EOPNOTSUPP; } - flow_rule_match_ipv4_addrs(rule, &match); tmp = &filter->key.ipv4.sip.value.addr[0]; memcpy(tmp, &match.key->src, 4); From 053d8ad10d585adf9891fcd049637536e2fe9ea7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Mar 2021 12:56:53 +0200 Subject: [PATCH 108/164] net: dsa: sja1105: fix SGMII PCS being forced to SPEED_UNKNOWN instead of SPEED_10 When using MLO_AN_PHY or MLO_AN_FIXED, the MII_BMCR of the SGMII PCS is read before resetting the switch so it can be reprogrammed afterwards. This works for the speeds of 1Gbps and 100Mbps, but not for 10Mbps, because SPEED_10 is actually 0, so AND-ing anything with 0 is false, therefore that last branch is dead code. Do what others do (genphy_read_status_fixed, phy_mii_ioctl) and just remove the check for SPEED_10, let it fall into the default case. Fixes: ffe10e679cec ("net: dsa: sja1105: Add support for the SGMII port") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 7692338730df..c1982615c631 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1922,7 +1922,7 @@ out_unlock_ptp: speed = SPEED_1000; else if (bmcr & BMCR_SPEED100) speed = SPEED_100; - else if (bmcr & BMCR_SPEED10) + else speed = SPEED_10; sja1105_sgmii_pcs_force_speed(priv, speed); From 6a5166e07c029182ee0e15c1a97b08c3179b2aaf Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Mar 2021 12:56:54 +0200 Subject: [PATCH 109/164] net: dsa: sja1105: fix ucast/bcast flooding always remaining enabled In the blamed patch I managed to introduce a bug while moving code around: the same logic is applied to the ucast_egress_floods and bcast_egress_floods variables both on the "if" and the "else" branches. This is clearly an unintended change compared to how the code used to be prior to that bugfix, so restore it. Fixes: 7f7ccdea8c73 ("net: dsa: sja1105: fix leakage of flooded frames outside bridging domain") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index c1982615c631..51ea104c63bb 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3369,14 +3369,14 @@ static int sja1105_port_ucast_bcast_flood(struct sja1105_private *priv, int to, if (flags.val & BR_FLOOD) priv->ucast_egress_floods |= BIT(to); else - priv->ucast_egress_floods |= BIT(to); + priv->ucast_egress_floods &= ~BIT(to); } if (flags.mask & BR_BCAST_FLOOD) { if (flags.val & BR_BCAST_FLOOD) priv->bcast_egress_floods |= BIT(to); else - priv->bcast_egress_floods |= BIT(to); + priv->bcast_egress_floods &= ~BIT(to); } return sja1105_manage_flood_domains(priv); From 6c59cff38e66584ae3ac6c2f0cbd8d039c710ba7 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Thu, 4 Mar 2021 14:15:13 +0100 Subject: [PATCH 110/164] net: usb: qmi_wwan: allow qmimux add/del with master up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no reason for preventing the creation and removal of qmimux network interfaces when the underlying interface is up. This makes qmi_wwan mux implementation more similar to the rmnet one, simplifying userspace management of the same logical interfaces. Fixes: c6adf77953bc ("net: usb: qmi_wwan: add qmap mux protocol support") Reported-by: Aleksander Morgado Signed-off-by: Daniele Palmas Acked-by: Bjørn Mork Signed-off-by: David S. 
Miller --- drivers/net/usb/qmi_wwan.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 17a050521b86..6700f1970b24 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -429,13 +429,6 @@ static ssize_t add_mux_store(struct device *d, struct device_attribute *attr, c goto err; } - /* we don't want to modify a running netdev */ - if (netif_running(dev->net)) { - netdev_err(dev->net, "Cannot change a running device\n"); - ret = -EBUSY; - goto err; - } - ret = qmimux_register_device(dev->net, mux_id); if (!ret) { info->flags |= QMI_WWAN_FLAG_MUX; @@ -465,13 +458,6 @@ static ssize_t del_mux_store(struct device *d, struct device_attribute *attr, c if (!rtnl_trylock()) return restart_syscall(); - /* we don't want to modify a running netdev */ - if (netif_running(dev->net)) { - netdev_err(dev->net, "Cannot change a running device\n"); - ret = -EBUSY; - goto err; - } - del_dev = qmimux_find_dev(dev, mux_id); if (!del_dev) { netdev_err(dev->net, "mux_id not present\n"); From bfc2560563586372212b0a8aeca7428975fa91fe Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Thu, 4 Mar 2021 14:43:17 +0000 Subject: [PATCH 111/164] net: sched: avoid duplicates in classes dump This is a follow up of commit ea3274695353 ("net: sched: avoid duplicates in qdisc dump") which has fixed the issue only for the qdisc dump. The duplicate printing also occurs when dumping the classes via tc class show dev eth0 Fixes: 59cc1f61f09c ("net: sched: convert qdisc linked list to hashtable") Signed-off-by: Maximilian Heyne Signed-off-by: David S. 
Miller --- net/sched/sch_api.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index e2e4353db8a7..f87d07736a14 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2168,7 +2168,7 @@ static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, struct tcmsg *tcm, struct netlink_callback *cb, - int *t_p, int s_t) + int *t_p, int s_t, bool recur) { struct Qdisc *q; int b; @@ -2179,7 +2179,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) return -1; - if (!qdisc_dev(root)) + if (!qdisc_dev(root) || !recur) return 0; if (tcm->tcm_parent) { @@ -2214,13 +2214,13 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0) + if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, - &t, s_t) < 0) + &t, s_t, false) < 0) goto done; done: From e0be4931f3fee2e04dec4013ea4f27ec2db8556f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 Mar 2021 13:32:08 -0800 Subject: [PATCH 112/164] mptcp: reset last_snd on subflow close Send logic caches last active subflow in the msk, so it needs to be cleared when the cached subflow is closed. Fixes: d5f49190def61c ("mptcp: allow picking different xmit subflows") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/155 Reported-by: Christoph Paasch Acked-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c5d5e68940ea..7362a536cbc0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2126,6 +2126,8 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { + struct mptcp_sock *msk = mptcp_sk(sk); + list_del(&subflow->node); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); @@ -2154,6 +2156,9 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, release_sock(ssk); sock_put(ssk); + + if (ssk == msk->last_snd) + msk->last_snd = NULL; } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, From f07157792c633b528de5fc1dbe2e4ea54f8e09d4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 Mar 2021 13:32:09 -0800 Subject: [PATCH 113/164] mptcp: put subflow sock on connect error mptcp_add_pending_subflow() performs a sock_hold() on the subflow, then adds the subflow to the join list. Without a sock_put the subflow sk won't be freed in case connect() fails. unreferenced object 0xffff88810c03b100 (size 3000): [..] sk_prot_alloc.isra.0+0x2f/0x110 sk_alloc+0x5d/0xc20 inet6_create+0x2b7/0xd30 __sock_create+0x17f/0x410 mptcp_subflow_create_socket+0xff/0x9c0 __mptcp_subflow_connect+0x1da/0xaf0 mptcp_pm_nl_work+0x6e0/0x1120 mptcp_worker+0x508/0x9a0 Fixes: 5b950ff4331ddda ("mptcp: link MPC subflow into msk only after accept") Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/subflow.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e1fbcab257e6..41695e26c374 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1297,6 +1297,7 @@ failed_unlink: spin_lock_bh(&msk->join_list_lock); list_del(&subflow->node); spin_unlock_bh(&msk->join_list_lock); + sock_put(mptcp_subflow_tcp_sock(subflow)); failed: subflow->disposable = 1; From eaeef1ce55ec9161e0c44ff27017777b1644b421 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Mar 2021 13:32:10 -0800 Subject: [PATCH 114/164] mptcp: fix memory accounting on allocation error In case of memory pressure the MPTCP xmit path keeps at most a single skb in the tx cache, eventually freeing additional ones. The associated counter for forward memory is not update accordingly, and that causes the following splat: WARNING: CPU: 0 PID: 12 at net/core/stream.c:208 sk_stream_kill_queues+0x3ca/0x530 net/core/stream.c:208 Modules linked in: CPU: 0 PID: 12 Comm: kworker/0:1 Not tainted 5.11.0-rc2 #59 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: events mptcp_worker RIP: 0010:sk_stream_kill_queues+0x3ca/0x530 net/core/stream.c:208 Code: 03 0f b6 04 02 84 c0 74 08 3c 03 0f 8e 63 01 00 00 8b ab 00 01 00 00 e9 60 ff ff ff e8 2f 24 d3 fe 0f 0b eb 97 e8 26 24 d3 fe <0f> 0b eb a0 e8 1d 24 d3 fe 0f 0b e9 a5 fe ff ff 4c 89 e7 e8 0e d0 RSP: 0018:ffffc900000c7bc8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88810030ac40 RSI: ffffffff8262ca4a RDI: 0000000000000003 RBP: 0000000000000d00 R08: 0000000000000000 R09: ffffffff85095aa7 R10: ffffffff8262c9ea R11: 0000000000000001 R12: ffff888108908100 R13: ffffffff85095aa0 R14: ffffc900000c7c48 R15: 1ffff92000018f85 FS: 0000000000000000(0000) GS:ffff88811b200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa7444baef8 CR3: 
0000000035ee9005 CR4: 0000000000170ef0 Call Trace: __mptcp_destroy_sock+0x4a7/0x6c0 net/mptcp/protocol.c:2547 mptcp_worker+0x7dd/0x1610 net/mptcp/protocol.c:2272 process_one_work+0x896/0x1170 kernel/workqueue.c:2275 worker_thread+0x605/0x1350 kernel/workqueue.c:2421 kthread+0x344/0x410 kernel/kthread.c:292 ret_from_fork+0x22/0x30 arch/x86/entry/entry_64.S:296 At close time, as reported by syzkaller/Christoph. This change addresses the issue by properly updating the fwd allocated memory counter in the error path. Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/136 Fixes: 724cfd2ee8aa ("mptcp: allocate TX skbs in msk context") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7362a536cbc0..aa59101ffe54 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1189,6 +1189,7 @@ static bool mptcp_tx_cache_refill(struct sock *sk, int size, */ while (skbs->qlen > 1) { skb = __skb_dequeue_tail(skbs); + *total_ts -= skb->truesize; __kfree_skb(skb); } return skbs->qlen > 0; From 17aee05dc8822e354f5ad2d68ee39e3ba4b6acf2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 Mar 2021 13:32:11 -0800 Subject: [PATCH 115/164] mptcp: dispose initial struct socket when its subflow is closed Christoph Paasch reported following crash: dst_release underflow WARNING: CPU: 0 PID: 1319 at net/core/dst.c:175 dst_release+0xc1/0xd0 net/core/dst.c:175 CPU: 0 PID: 1319 Comm: syz-executor217 Not tainted 5.11.0-rc6af8e85128b4d0d24083c5cac646e891227052e0c #70 Call Trace: rt_cache_route+0x12e/0x140 net/ipv4/route.c:1503 rt_set_nexthop.constprop.0+0x1fc/0x590 net/ipv4/route.c:1612 __mkroute_output net/ipv4/route.c:2484 [inline] ... The worker leaves msk->subflow alone even when it happened to close the subflow ssk associated with it. 
Fixes: 866f26f2a9c33b ("mptcp: always graft subflow socket to parent") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/157 Reported-by: Christoph Paasch Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index aa59101ffe54..a58da04bed71 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2116,6 +2116,14 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) return backup; } +static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) +{ + if (msk->subflow) { + iput(SOCK_INODE(msk->subflow)); + msk->subflow = NULL; + } +} + /* subflow sockets can be either outgoing (connect) or incoming * (accept). * @@ -2160,6 +2168,9 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (ssk == msk->last_snd) msk->last_snd = NULL; + + if (msk->subflow && ssk == msk->subflow->sk) + mptcp_dispose_initial_subflow(msk); } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, @@ -2529,12 +2540,6 @@ static void __mptcp_destroy_sock(struct sock *sk) might_sleep(); - /* dispose the ancillatory tcp socket, if any */ - if (msk->subflow) { - iput(SOCK_INODE(msk->subflow)); - msk->subflow = NULL; - } - /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@ -2559,6 +2564,7 @@ static void __mptcp_destroy_sock(struct sock *sk) sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); + mptcp_dispose_initial_subflow(msk); sock_put(sk); } From c8fe62f0768cc9378103fc89fb96804645f527c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 Mar 2021 13:32:12 -0800 Subject: [PATCH 116/164] mptcp: reset 'first' and ack_hint on subflow close Just like with last_snd, we have to NULL 'first' on subflow close. 
ack_hint isn't strictly required (it's never dereferenced), but better to clear this explicitly as well instead of making it an exception. msk->first is dereferenced unconditionally at accept time, but at that point the ssk is not on the conn_list yet -- this means worker can't see it when iterating the conn_list. Reported-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a58da04bed71..3dcb564b03ad 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2169,6 +2169,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (ssk == msk->last_snd) msk->last_snd = NULL; + if (ssk == msk->ack_hint) + msk->ack_hint = NULL; + + if (ssk == msk->first) + msk->first = NULL; + if (msk->subflow && ssk == msk->subflow->sk) mptcp_dispose_initial_subflow(msk); } @@ -3297,6 +3303,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* PM/worker can now acquire the first subflow socket * lock without racing with listener queue cleanup, * we can notify it, if needed. + * + * Even if remote has reset the initial subflow by now + * the refcnt is still at least one. */ subflow = mptcp_subflow_ctx(msk->first); list_add(&subflow->node, &msk->conn_list); From 2948d0a1e5aedc789fed27a4473040b6db741426 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Mar 2021 13:32:13 -0800 Subject: [PATCH 117/164] mptcp: factor out __mptcp_retrans helper() Will simplify the following patch, no functional change intended. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S.
Miller --- net/mptcp/protocol.c | 93 ++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3dcb564b03ad..67aaf7154dca 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2261,14 +2261,58 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) mptcp_close_wake_up(sk); } -static void mptcp_worker(struct work_struct *work) +static void __mptcp_retrans(struct sock *sk) { - struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); - struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; + struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; size_t copied = 0; - int state, ret; + struct sock *ssk; + int ret; + + __mptcp_clean_una(sk); + dfrag = mptcp_rtx_head(sk); + if (!dfrag) + return; + + ssk = mptcp_subflow_get_retrans(msk); + if (!ssk) + goto reset_timer; + + lock_sock(ssk); + + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = dfrag->already_sent; + while (info.sent < dfrag->already_sent) { + if (!mptcp_alloc_tx_skb(sk, ssk)) + break; + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + break; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); + copied += ret; + info.sent += ret; + } + if (copied) + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + +reset_timer: + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); +} + +static void mptcp_worker(struct work_struct *work) +{ + struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); + struct sock *sk = &msk->sk.icsk_inet.sk; + int state; lock_sock(sk); state = sk->sk_state; @@ -2303,45 +2347,8 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); - if 
(!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) - goto unlock; - - __mptcp_clean_una(sk); - dfrag = mptcp_rtx_head(sk); - if (!dfrag) - goto unlock; - - ssk = mptcp_subflow_get_retrans(msk); - if (!ssk) - goto reset_unlock; - - lock_sock(ssk); - - /* limit retransmission to the bytes already sent on some subflows */ - info.sent = 0; - info.limit = dfrag->already_sent; - while (info.sent < dfrag->already_sent) { - if (!mptcp_alloc_tx_skb(sk, ssk)) - break; - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); - if (ret <= 0) - break; - - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); - copied += ret; - info.sent += ret; - } - if (copied) - tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, - info.size_goal); - - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - -reset_unlock: - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) + __mptcp_retrans(sk); unlock: release_sock(sk); From c2e6048fa1cf2228063aec299f93ac6eb256b457 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Mar 2021 13:32:14 -0800 Subject: [PATCH 118/164] mptcp: fix race in release_cb If we receive a MPTCP_PUSH_PENDING event from a subflow when mptcp_release_cb() is serving the previous one, the latter will be delayed up to the next release_sock(msk). Address the issue by implementing a test/serve loop for such events. Additionally rename the push helper to __mptcp_push_pending() to be more consistent with the existing code. Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S.
Miller --- net/mptcp/protocol.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 67aaf7154dca..d2a2169e6d9e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1445,7 +1445,7 @@ static void mptcp_push_release(struct sock *sk, struct sock *ssk, release_sock(ssk); } -static void mptcp_push_pending(struct sock *sk, unsigned int flags) +static void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); @@ -1697,14 +1697,14 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) wait_for_memory: mptcp_set_nospace(sk); - mptcp_push_pending(sk, msg->msg_flags); + __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto out; } if (copied) - mptcp_push_pending(sk, msg->msg_flags); + __mptcp_push_pending(sk, msg->msg_flags); out: release_sock(sk); @@ -2959,13 +2959,14 @@ static void mptcp_release_cb(struct sock *sk) { unsigned long flags, nflags; - /* push_pending may touch wmem_reserved, do it before the later - * cleanup - */ - if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) - __mptcp_clean_una(sk); - if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) { - /* mptcp_push_pending() acquires the subflow socket lock + for (;;) { + flags = 0; + if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) + flags |= MPTCP_PUSH_PENDING; + if (!flags) + break; + + /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope * 2) must avoid ABBA deadlock with msk socket spinlock: the RX @@ -2974,13 +2975,21 @@ static void mptcp_release_cb(struct sock *sk) */ spin_unlock_bh(&sk->sk_lock.slock); - mptcp_push_pending(sk, 0); + if (flags & MPTCP_PUSH_PENDING) + __mptcp_push_pending(sk, 0); + + cond_resched(); spin_lock_bh(&sk->sk_lock.slock); } + + if 
(test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) + __mptcp_clean_una(sk); if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) __mptcp_error_report(sk); - /* clear any wmem reservation and errors */ + /* push_pending may touch wmem_reserved, ensure we do the cleanup + * later + */ __mptcp_update_wmem(sk); __mptcp_update_rmem(sk); From 417789df4a03bc820b082bcc503f0d4c5e4704b9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Mar 2021 13:32:15 -0800 Subject: [PATCH 119/164] mptcp: fix missing wakeup __mptcp_clean_una() can free write memory and should wake up user-space processes when needed. When such function is invoked by the MPTCP receive path, the wakeup is not needed, as the TCP stack will later trigger subflow_write_space which will do the wakeup as needed. Other __mptcp_clean_una() call sites need an additional wakeup check. Let's bundle the relevant code in a new helper and use it. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/165 Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks") Fixes: 64b9cea7a0af ("mptcp: fix spurious retransmissions") Tested-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S.
Miller --- net/mptcp/protocol.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d2a2169e6d9e..76958570ae7f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1061,6 +1061,12 @@ out: } } +static void __mptcp_clean_una_wakeup(struct sock *sk) +{ + __mptcp_clean_una(sk); + mptcp_write_space(sk); +} + static void mptcp_enter_memory_pressure(struct sock *sk) { struct mptcp_subflow_context *subflow; @@ -2270,7 +2276,7 @@ static void __mptcp_retrans(struct sock *sk) struct sock *ssk; int ret; - __mptcp_clean_una(sk); + __mptcp_clean_una_wakeup(sk); dfrag = mptcp_rtx_head(sk); if (!dfrag) return; @@ -2983,7 +2989,7 @@ static void mptcp_release_cb(struct sock *sk) } if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) - __mptcp_clean_una(sk); + __mptcp_clean_una_wakeup(sk); if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) __mptcp_error_report(sk); From 9238e900d6ec2e9b9ca3d8a9731acfd587fc577a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 4 Mar 2021 13:32:16 -0800 Subject: [PATCH 120/164] mptcp: free resources when the port number is mismatched When the port number is mismatched with the announced ones, use 'goto dispose_child' to free the resources instead of using 'goto out'. This patch also moves the port number checking code in subflow_syn_recv_sock before mptcp_finish_join, otherwise subflow_drop_ctx will fail in dispose_child. Fixes: 5bc56388c74f ("mptcp: add port number check for MP_JOIN") Reported-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/subflow.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 41695e26c374..3d47d670e665 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -687,11 +687,6 @@ create_child: /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; - if (!mptcp_finish_join(child)) - goto dispose_child; - - SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); - tcp_rsk(req)->drop_req = true; if (subflow_use_different_sport(owner, sk)) { pr_debug("ack inet_sport=%d %d", @@ -699,10 +694,16 @@ create_child: ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); - goto out; + goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } + + if (!mptcp_finish_join(child)) + goto dispose_child; + + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); + tcp_rsk(req)->drop_req = true; } } From 863a42b289c22df63db62b10fc2c2ffc237e2125 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Thu, 4 Mar 2021 10:30:09 -0800 Subject: [PATCH 121/164] netdevsim: init u64 stats for 32bit hardware Init the u64 stats in order to avoid the lockdep prints on the 32bit hardware like INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. 
CPU: 0 PID: 4695 Comm: syz-executor.0 Not tainted 5.11.0-rc5-syzkaller #0 Hardware name: ARM-Versatile Express Backtrace: [<826fc5b8>] (dump_backtrace) from [<826fc82c>] (show_stack+0x18/0x1c arch/arm/kernel/traps.c:252) [<826fc814>] (show_stack) from [<8270d1f8>] (__dump_stack lib/dump_stack.c:79 [inline]) [<826fc814>] (show_stack) from [<8270d1f8>] (dump_stack+0xa8/0xc8 lib/dump_stack.c:120) [<8270d150>] (dump_stack) from [<802bf9c0>] (assign_lock_key kernel/locking/lockdep.c:935 [inline]) [<8270d150>] (dump_stack) from [<802bf9c0>] (register_lock_class+0xabc/0xb68 kernel/locking/lockdep.c:1247) [<802bef04>] (register_lock_class) from [<802baa2c>] (__lock_acquire+0x84/0x32d4 kernel/locking/lockdep.c:4711) [<802ba9a8>] (__lock_acquire) from [<802be840>] (lock_acquire.part.0+0xf0/0x554 kernel/locking/lockdep.c:5442) [<802be750>] (lock_acquire.part.0) from [<802bed10>] (lock_acquire+0x6c/0x74 kernel/locking/lockdep.c:5415) [<802beca4>] (lock_acquire) from [<81560548>] (seqcount_lockdep_reader_access include/linux/seqlock.h:103 [inline]) [<802beca4>] (lock_acquire) from [<81560548>] (__u64_stats_fetch_begin include/linux/u64_stats_sync.h:164 [inline]) [<802beca4>] (lock_acquire) from [<81560548>] (u64_stats_fetch_begin include/linux/u64_stats_sync.h:175 [inline]) [<802beca4>] (lock_acquire) from [<81560548>] (nsim_get_stats64+0xdc/0xf0 drivers/net/netdevsim/netdev.c:70) [<8156046c>] (nsim_get_stats64) from [<81e2efa0>] (dev_get_stats+0x44/0xd0 net/core/dev.c:10405) [<81e2ef5c>] (dev_get_stats) from [<81e53204>] (rtnl_fill_stats+0x38/0x120 net/core/rtnetlink.c:1211) [<81e531cc>] (rtnl_fill_stats) from [<81e59d58>] (rtnl_fill_ifinfo+0x6d4/0x148c net/core/rtnetlink.c:1783) [<81e59684>] (rtnl_fill_ifinfo) from [<81e5ceb4>] (rtmsg_ifinfo_build_skb+0x9c/0x108 net/core/rtnetlink.c:3798) [<81e5ce18>] (rtmsg_ifinfo_build_skb) from [<81e5d0ac>] (rtmsg_ifinfo_event net/core/rtnetlink.c:3830 [inline]) [<81e5ce18>] (rtmsg_ifinfo_build_skb) from [<81e5d0ac>] (rtmsg_ifinfo_event 
net/core/rtnetlink.c:3821 [inline]) [<81e5ce18>] (rtmsg_ifinfo_build_skb) from [<81e5d0ac>] (rtmsg_ifinfo+0x44/0x70 net/core/rtnetlink.c:3839) [<81e5d068>] (rtmsg_ifinfo) from [<81e45c2c>] (register_netdevice+0x664/0x68c net/core/dev.c:10103) [<81e455c8>] (register_netdevice) from [<815608bc>] (nsim_create+0xf8/0x124 drivers/net/netdevsim/netdev.c:317) [<815607c4>] (nsim_create) from [<81561184>] (__nsim_dev_port_add+0x108/0x188 drivers/net/netdevsim/dev.c:941) [<8156107c>] (__nsim_dev_port_add) from [<815620d8>] (nsim_dev_port_add_all drivers/net/netdevsim/dev.c:990 [inline]) [<8156107c>] (__nsim_dev_port_add) from [<815620d8>] (nsim_dev_probe+0x5cc/0x750 drivers/net/netdevsim/dev.c:1119) [<81561b0c>] (nsim_dev_probe) from [<815661dc>] (nsim_bus_probe+0x10/0x14 drivers/net/netdevsim/bus.c:287) [<815661cc>] (nsim_bus_probe) from [<811724c0>] (really_probe+0x100/0x50c drivers/base/dd.c:554) [<811723c0>] (really_probe) from [<811729c4>] (driver_probe_device+0xf8/0x1c8 drivers/base/dd.c:740) [<811728cc>] (driver_probe_device) from [<81172fe4>] (__device_attach_driver+0x8c/0xf0 drivers/base/dd.c:846) [<81172f58>] (__device_attach_driver) from [<8116fee0>] (bus_for_each_drv+0x88/0xd8 drivers/base/bus.c:431) [<8116fe58>] (bus_for_each_drv) from [<81172c6c>] (__device_attach+0xdc/0x1d0 drivers/base/dd.c:914) [<81172b90>] (__device_attach) from [<8117305c>] (device_initial_probe+0x14/0x18 drivers/base/dd.c:961) [<81173048>] (device_initial_probe) from [<81171358>] (bus_probe_device+0x90/0x98 drivers/base/bus.c:491) [<811712c8>] (bus_probe_device) from [<8116e77c>] (device_add+0x320/0x824 drivers/base/core.c:3109) [<8116e45c>] (device_add) from [<8116ec9c>] (device_register+0x1c/0x20 drivers/base/core.c:3182) [<8116ec80>] (device_register) from [<81566710>] (nsim_bus_dev_new drivers/net/netdevsim/bus.c:336 [inline]) [<8116ec80>] (device_register) from [<81566710>] (new_device_store+0x178/0x208 drivers/net/netdevsim/bus.c:215) [<81566598>] (new_device_store) from 
[<8116fcb4>] (bus_attr_store+0x2c/0x38 drivers/base/bus.c:122) [<8116fc88>] (bus_attr_store) from [<805b4b8c>] (sysfs_kf_write+0x48/0x54 fs/sysfs/file.c:139) [<805b4b44>] (sysfs_kf_write) from [<805b3c90>] (kernfs_fop_write_iter+0x128/0x1ec fs/kernfs/file.c:296) [<805b3b68>] (kernfs_fop_write_iter) from [<804d22fc>] (call_write_iter include/linux/fs.h:1901 [inline]) [<805b3b68>] (kernfs_fop_write_iter) from [<804d22fc>] (new_sync_write fs/read_write.c:518 [inline]) [<805b3b68>] (kernfs_fop_write_iter) from [<804d22fc>] (vfs_write+0x3dc/0x57c fs/read_write.c:605) [<804d1f20>] (vfs_write) from [<804d2604>] (ksys_write+0x68/0xec fs/read_write.c:658) [<804d259c>] (ksys_write) from [<804d2698>] (__do_sys_write fs/read_write.c:670 [inline]) [<804d259c>] (ksys_write) from [<804d2698>] (sys_write+0x10/0x14 fs/read_write.c:667) [<804d2688>] (sys_write) from [<80200060>] (ret_fast_syscall+0x0/0x2c arch/arm/mm/proc-v7.S:64) Fixes: 83c9e13aa39a ("netdevsim: add software driver for testing offloads") Reported-by: syzbot+e74a6857f2d0efe3ad81@syzkaller.appspotmail.com Tested-by: Dmitry Vyukov Signed-off-by: Hillf Danton Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/netdevsim/netdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index aec92440eef1..659d3dceb687 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -294,6 +294,7 @@ nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) dev_net_set(dev, nsim_dev_net(nsim_dev)); ns = netdev_priv(dev); ns->netdev = dev; + u64_stats_init(&ns->syncp); ns->nsim_dev = nsim_dev; ns->nsim_dev_port = nsim_dev_port; ns->nsim_bus_dev = nsim_dev->nsim_bus_dev; From 67eb211487f0c993d9f402d1c196ef159fd6a3b5 Mon Sep 17 00:00:00 2001 From: Jiri Wiesner Date: Thu, 4 Mar 2021 17:18:28 +0100 Subject: [PATCH 122/164] ibmvnic: always store valid MAC address The last change to ibmvnic_set_mac(), 8fc3672a8ad3, meant to prevent users from setting an invalid MAC address on an ibmvnic interface that has not been brought up yet. The change also prevented the requested MAC address from being stored by the adapter object for an ibmvnic interface when the state of the ibmvnic interface is VNIC_PROBED - that is after probing has finished but before the ibmvnic interface is brought up. The MAC address stored by the adapter object is used and sent to the hypervisor for checking when an ibmvnic interface is brought up. The ibmvnic driver ignoring the requested MAC address when in VNIC_PROBED state caused LACP bonds (bonds in 802.3ad mode) with more than one slave to malfunction. The bonding code must be able to change the MAC address of its slaves before they are brought up during enslaving. The inability of kernels with 8fc3672a8ad3 to set the MAC addresses of bonding slaves is observable in the output of "ip address show". The MAC addresses of the slaves are the same as the MAC address of the bond on a working system whereas the slaves retain their original MAC addresses on a system with a malfunctioning LACP bond. 
Fixes: 8fc3672a8ad3 ("ibmvnic: fix ibmvnic_set_mac") Signed-off-by: Jiri Wiesner Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 3bad762083c5..b6102ccf9b90 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1906,10 +1906,9 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; - if (adapter->state != VNIC_PROBED) { - ether_addr_copy(adapter->mac_addr, addr->sa_data); + ether_addr_copy(adapter->mac_addr, addr->sa_data); + if (adapter->state != VNIC_PROBED) rc = __ibmvnic_set_mac(netdev, addr->sa_data); - } return rc; } From ad5d07f4a9cd671233ae20983848874731102c08 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 4 Mar 2021 16:29:51 -0500 Subject: [PATCH 123/164] cipso,calipso: resolve a number of problems with the DOI refcounts The current CIPSO and CALIPSO refcounting scheme for the DOI definitions is a bit flawed in that we: 1. Don't correctly match gets/puts in netlbl_cipsov4_list(). 2. Decrement the refcount on each attempt to remove the DOI from the DOI list, only removing it from the list once the refcount drops to zero. This patch fixes these problems by adding the missing "puts" to netlbl_cipsov4_list() and introduces a more conventional, i.e. not-buggy, refcounting mechanism to the DOI definitions. Upon the addition of a DOI to the DOI list, it is initialized with a refcount of one, removing a DOI from the list removes it from the list and drops the refcount by one; "gets" and "puts" behave as expected with respect to refcounts, increasing and decreasing the DOI's refcount by one. 
Fixes: b1edeb102397 ("netlabel: Replace protocol/NetLabel linking with refrerence counts") Fixes: d7cce01504a0 ("netlabel: Add support for removing a CALIPSO DOI.") Reported-by: syzbot+9ec037722d2603a9f52e@syzkaller.appspotmail.com Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 11 +---------- net/ipv6/calipso.c | 14 +++++--------- net/netlabel/netlabel_cipso_v4.c | 3 +++ 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 471d33a0d095..be09c7669a79 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -519,16 +519,10 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) ret_val = -ENOENT; goto doi_remove_return; } - if (!refcount_dec_and_test(&doi_def->refcount)) { - spin_unlock(&cipso_v4_doi_list_lock); - ret_val = -EBUSY; - goto doi_remove_return; - } list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); - cipso_v4_cache_invalidate(); - call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); + cipso_v4_doi_putdef(doi_def); ret_val = 0; doi_remove_return: @@ -585,9 +579,6 @@ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) if (!refcount_dec_and_test(&doi_def->refcount)) return; - spin_lock(&cipso_v4_doi_list_lock); - list_del_rcu(&doi_def->list); - spin_unlock(&cipso_v4_doi_list_lock); cipso_v4_cache_invalidate(); call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 51184a70ac7e..1578ed9e97d8 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -83,6 +83,9 @@ struct calipso_map_cache_entry { static struct calipso_map_cache_bkt *calipso_cache; +static void calipso_cache_invalidate(void); +static void calipso_doi_putdef(struct calipso_doi *doi_def); + /* Label Mapping Cache Functions */ @@ -444,15 +447,10 @@ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) ret_val = -ENOENT; goto doi_remove_return; } - if 
(!refcount_dec_and_test(&doi_def->refcount)) { - spin_unlock(&calipso_doi_list_lock); - ret_val = -EBUSY; - goto doi_remove_return; - } list_del_rcu(&doi_def->list); spin_unlock(&calipso_doi_list_lock); - call_rcu(&doi_def->rcu, calipso_doi_free_rcu); + calipso_doi_putdef(doi_def); ret_val = 0; doi_remove_return: @@ -508,10 +506,8 @@ static void calipso_doi_putdef(struct calipso_doi *doi_def) if (!refcount_dec_and_test(&doi_def->refcount)) return; - spin_lock(&calipso_doi_list_lock); - list_del_rcu(&doi_def->list); - spin_unlock(&calipso_doi_list_lock); + calipso_cache_invalidate(); call_rcu(&doi_def->rcu, calipso_doi_free_rcu); } diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 726dda95934c..4f50a64315cf 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -575,6 +575,7 @@ list_start: break; } + cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); genlmsg_end(ans_skb, data); @@ -583,12 +584,14 @@ list_start: list_retry: /* XXX - this limit is a guesstimate */ if (nlsze_mult < 4) { + cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); kfree_skb(ans_skb); nlsze_mult *= 2; goto list_start; } list_failure_lock: + cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); list_failure: kfree_skb(ans_skb); From 39491867ace594b4912c35f576864d204beed2b3 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 4 Mar 2021 18:56:46 -0800 Subject: [PATCH 124/164] bpf: Explicitly zero-extend R0 after 32-bit cmpxchg As pointed out by Ilya and explained in the new comment, there's a discrepancy between x86 and BPF CMPXCHG semantics: BPF always loads the value from memory into r0, while x86 only does so when r0 and the value in memory are different. The same issue affects s390. At first this might sound like pure semantics, but it makes a real difference when the comparison is 32-bit, since the load will zero-extend r0/rax. The fix is to explicitly zero-extend rax after doing such a CMPXCHG. 
Since this problem affects multiple archs, this is done in the verifier by patching in a BPF_ZEXT_REG instruction after every 32-bit cmpxchg. Any archs that don't need such manual zero-extension can do a look-ahead with insn_is_zext to skip the unnecessary mov. Note this still goes on top of Ilya's patch: https://lore.kernel.org/bpf/20210301154019.129110-1-iii@linux.ibm.com/T/#u Differences v5->v6[1]: - Moved is_cmpxchg_insn and ensured it can be safely re-used. Also renamed it and removed 'inline' to match the style of the is_*_function helpers. - Fixed up comments in verifier test (thanks for the careful review, Martin!) Differences v4->v5[1]: - Moved the logic entirely into opt_subreg_zext_lo32_rnd_hi32, thanks to Martin for suggesting this. Differences v3->v4[1]: - Moved the optimization against pointless zext into the correct place: opt_subreg_zext_lo32_rnd_hi32 is called _after_ fixup_bpf_calls. Differences v2->v3[1]: - Moved patching into fixup_bpf_calls (patch incoming to rename this function) - Added extra commentary on bpf_jit_needs_zext - Added check to avoid adding a pointless zext(r0) if there's already one there. Difference v1->v2[1]: Now solved centrally in the verifier instead of specifically for the x86 JIT. Thanks to Ilya and Daniel for the suggestions! 
[1] v5: https://lore.kernel.org/bpf/CA+i-1C3ytZz6FjcPmUg5s4L51pMQDxWcZNvM86w4RHZ_o2khwg@mail.gmail.com/T/#t v4: https://lore.kernel.org/bpf/CA+i-1C3ytZz6FjcPmUg5s4L51pMQDxWcZNvM86w4RHZ_o2khwg@mail.gmail.com/T/#t v3: https://lore.kernel.org/bpf/08669818-c99d-0d30-e1db-53160c063611@iogearbox.net/T/#t v2: https://lore.kernel.org/bpf/08669818-c99d-0d30-e1db-53160c063611@iogearbox.net/T/#t v1: https://lore.kernel.org/bpf/d7ebaefb-bfd6-a441-3ff2-2fdfe699b1d2@iogearbox.net/T/#t Reported-by: Ilya Leoshkevich Fixes: 5ffa25502b5a ("bpf: Add instructions for atomic_[cmp]xchg") Signed-off-by: Brendan Jackman Acked-by: Martin KaFai Lau Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 +++ kernel/bpf/verifier.c | 19 +++++++++++++- .../selftests/bpf/verifier/atomic_cmpxchg.c | 25 +++++++++++++++++++ .../selftests/bpf/verifier/atomic_or.c | 25 +++++++++++++++++++ 4 files changed, 72 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa1e64196d8d..3a283bf97f2f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2344,6 +2344,10 @@ bool __weak bpf_helper_changes_pkt_data(void *func) /* Return TRUE if the JIT backend wants verifier to enable sub-register usage * analysis code and wants explicit zero extension inserted by verifier. * Otherwise, return FALSE. + * + * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if + * you don't override this. JITs that don't want these extra insns can detect + * them using insn_is_zext. 
*/ bool __weak bpf_jit_needs_zext(void) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb3eaab934f3..c56e3fcb5f1a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,13 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } +static bool is_cmpxchg_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -11067,7 +11074,17 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, goto apply_patch_buffer; } - if (!bpf_jit_needs_zext()) + /* Add in an zero-extend instruction if a) the JIT has requested + * it or b) it's a CMPXCHG. + * + * The latter is because: BPF_CMPXCHG always loads a value into + * R0, therefore always zero-extends. However some archs' + * equivalent instruction only does this load when the + * comparison is successful. This detail of CMPXCHG is + * orthogonal to the general zero-extension behaviour of the + * CPU, so it's treated independently of bpf_jit_needs_zext. 
+ */ + if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) continue; if (WARN_ON(load_reg == -1)) { diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c index 2efd8bcf57a1..6e52dfc64415 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c +++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c @@ -94,3 +94,28 @@ .result = REJECT, .errstr = "invalid read from stack", }, +{ + "BPF_W cmpxchg should zero top 32 bits", + .insns = { + /* r0 = U64_MAX; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + /* u64 val = r0; */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + /* r0 = (u32)atomic_cmpxchg((u32 *)&val, r0, 1); */ + BPF_MOV32_IMM(BPF_REG_1, 1), + BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -8), + /* r1 = 0x00000000FFFFFFFFull; */ + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1), + /* if (r0 != r1) exit(1); */ + BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_1, 2), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_or.c b/tools/testing/selftests/bpf/verifier/atomic_or.c index 70f982e1f9f0..9d0716ac5080 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_or.c +++ b/tools/testing/selftests/bpf/verifier/atomic_or.c @@ -75,3 +75,28 @@ }, .result = ACCEPT, }, +{ + "BPF_W atomic_fetch_or should zero top 32 bits", + .insns = { + /* r1 = U64_MAX; */ + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1), + /* u64 val = r1; */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* r1 = (u32)atomic_fetch_or((u32 *)&val, 2); */ + BPF_MOV32_IMM(BPF_REG_1, 2), + BPF_ATOMIC_OP(BPF_W, BPF_OR | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* r2 = 0x00000000FFFFFFFF; */ + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 
32), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 1), + /* if (r2 != r1) exit(1); */ + BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_1, 2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, From 00ff801bb8ce6711e919af4530b6ffa14a22390a Mon Sep 17 00:00:00 2001 From: "Kevin(Yudong) Yang" Date: Wed, 3 Mar 2021 09:43:54 -0500 Subject: [PATCH 125/164] net/mlx4_en: update moderation when config reset This patch fixes a bug that the moderation config will not be applied when calling mlx4_en_reset_config. For example, when turning on rx timestamping, mlx4_en_reset_config() will be called, causing the NIC to forget previous moderation config. This fix is in phase with a previous fix: commit 79c54b6bbf06 ("net/mlx4_en: Fix TX moderation info loss after set_ringparam is called") Tested: Before this patch, on a host with NIC using mlx4, run netserver and stream TCP to the host at full utilization. $ sar -I SUM 1 INTR intr/s 14:03:56 sum 48758.00 After rx hwtstamp is enabled: $ sar -I SUM 1 14:10:38 sum 317771.00 We see the moderation is not working properly and issued 7x more interrupts. After the patch, and turned on rx hwtstamp, the rate of interrupts is as expected: $ sar -I SUM 1 14:52:11 sum 49332.00 Fixes: 79c54b6bbf06 ("net/mlx4_en: Fix TX moderation info loss after set_ringparam is called") Signed-off-by: Kevin(Yudong) Yang Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell CC: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 ++ drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 23849f2b9c25..1434df66fcf2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -47,7 +47,7 @@ #define EN_ETHTOOL_SHORT_MASK cpu_to_be16(0xffff) #define EN_ETHTOOL_WORD_MASK cpu_to_be32(0xffffffff) -static int mlx4_en_moderation_update(struct mlx4_en_priv *priv) +int mlx4_en_moderation_update(struct mlx4_en_priv *priv) { int i, t; int err = 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 51b9700fce83..5d0c9c62382d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -3554,6 +3554,8 @@ int mlx4_en_reset_config(struct net_device *dev, en_err(priv, "Failed starting port\n"); } + if (!err) + err = mlx4_en_moderation_update(priv); out: mutex_unlock(&mdev->state_lock); kfree(tmp); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index e8ed23190de0..f3d1a20201ef 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -775,6 +775,7 @@ void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev); #define DEV_FEATURE_CHANGED(dev, new_features, feature) \ ((dev->features & feature) ^ (new_features & feature)) +int mlx4_en_moderation_update(struct mlx4_en_priv *priv); int mlx4_en_reset_config(struct net_device *dev, struct hwtstamp_config ts_config, netdev_features_t new_features); From 38c26ff3048af50eee3fcd591921357ee5bfd9ee Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Thu, 4 Mar 2021 18:06:48 -0800 Subject: [PATCH 126/164] net: tehuti: fix 
error return code in bdx_probe() When bdx_read_mac() fails, no error return code of bdx_probe() is assigned. To fix this bug, err is assigned with -EFAULT as error return code. Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/ethernet/tehuti/tehuti.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index b8f4f419173f..d054c6e83b1c 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -2044,6 +2044,7 @@ bdx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /*bdx_hw_reset(priv); */ if (bdx_read_mac(priv)) { pr_err("load MAC address failed\n"); + err = -EFAULT; goto err_out_iomap; } SET_NETDEV_DEV(ndev, &pdev->dev); From 6650d31f21b8a0043613ae0a4a2e42e49dc20b2d Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Thu, 4 Mar 2021 19:10:10 -0800 Subject: [PATCH 127/164] net: intel: iavf: fix error return code of iavf_init_get_resources() When iavf_process_config() fails, no error return code of iavf_init_get_resources() is assigned. To fix this bug, err is assigned with the return value of iavf_process_config(), and then err is checked. Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. 
Miller --- drivers/net/ethernet/intel/iavf/iavf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 0a867d64d467..dc5b3c06d1e0 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1776,7 +1776,8 @@ static int iavf_init_get_resources(struct iavf_adapter *adapter) goto err_alloc; } - if (iavf_process_config(adapter)) + err = iavf_process_config(adapter); + if (err) goto err_alloc; adapter->current_op = VIRTCHNL_OP_UNKNOWN; From 9a7b3950c7e15968e23d83be215e95ccc7c92a53 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Fri, 5 Mar 2021 13:49:30 +0800 Subject: [PATCH 128/164] net: stmmac: Fix VLAN filter delete timeout issue in Intel mGBE SGMII For Intel mGbE controller, MAC VLAN filter delete operation will time-out if serdes power-down sequence happened first during driver remove() with below message. [82294.764958] intel-eth-pci 0000:00:1e.4 eth2: stmmac_dvr_remove: removing driver [82294.778677] intel-eth-pci 0000:00:1e.4 eth2: Timeout accessing MAC_VLAN_Tag_Filter [82294.779997] intel-eth-pci 0000:00:1e.4 eth2: failed to kill vid 0081/0 [82294.947053] intel-eth-pci 0000:00:1d.2 eth1: stmmac_dvr_remove: removing driver [82295.002091] intel-eth-pci 0000:00:1d.1 eth0: stmmac_dvr_remove: removing driver Therefore, we delay the serdes power-down to be after unregister_netdev() which triggers the VLAN filter delete. Fixes: b9663b7ca6ff ("net: stmmac: Enable SERDES power up/down sequence") Signed-off-by: Ong Boon Leong Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 0eba44e9c1f8..208cae344ffa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5249,13 +5249,16 @@ int stmmac_dvr_remove(struct device *dev) netdev_info(priv->dev, "%s: removing driver", __func__); stmmac_stop_all_dma(priv); - - if (priv->plat->serdes_powerdown) - priv->plat->serdes_powerdown(ndev, priv->plat->bsp_priv); - stmmac_mac_set(priv, priv->ioaddr, false); netif_carrier_off(ndev); unregister_netdev(ndev); + + /* Serdes power down needs to happen after VLAN filter + * is deleted that is triggered by unregister_netdev(). + */ + if (priv->plat->serdes_powerdown) + priv->plat->serdes_powerdown(ndev, priv->plat->bsp_priv); + #ifdef CONFIG_DEBUG_FS stmmac_exit_fs(ndev); #endif From 8eb37ab7cc045ec6305a6a1a9c32374695a1a977 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Fri, 5 Mar 2021 14:03:42 +0800 Subject: [PATCH 129/164] stmmac: intel: Fixes clock registration error seen for multiple interfaces Issue seen when enumerating multiple Intel mGbE interfaces in EHL. 
[ 6.898141] intel-eth-pci 0000:00:1d.2: enabling device (0000 -> 0002) [ 6.900971] intel-eth-pci 0000:00:1d.2: Fail to register stmmac-clk [ 6.906434] intel-eth-pci 0000:00:1d.2: User ID: 0x51, Synopsys ID: 0x52 We fix it by making the clock name to be unique following the format of stmmac-pci_name(pci_dev) so that we can differentiate the clock for these Intel mGbE interfaces in EHL platform as follow: /sys/kernel/debug/clk/stmmac-0000:00:1d.1 /sys/kernel/debug/clk/stmmac-0000:00:1d.2 /sys/kernel/debug/clk/stmmac-0000:00:1e.4 Fixes: 58da0cfa6cf1 ("net: stmmac: create dwmac-intel.c to contain all Intel platform") Signed-off-by: Wong Vee Khee Signed-off-by: Voon Weifeng Co-developed-by: Ong Boon Leong Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index f2896872a86c..0b64f7710d17 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -233,6 +233,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) static int intel_mgbe_common_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { + char clk_name[20]; int ret; int i; @@ -301,8 +302,10 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->eee_usecs_rate = plat->clk_ptp_rate; /* Set system clock */ + sprintf(clk_name, "%s-%s", "stmmac", pci_name(pdev)); + plat->stmmac_clk = clk_register_fixed_rate(&pdev->dev, - "stmmac-clk", NULL, 0, + clk_name, NULL, 0, plat->clk_ptp_rate); if (IS_ERR(plat->stmmac_clk)) { From e233febda6ebab750e30662a7cc9b9efad127685 Mon Sep 17 00:00:00 2001 From: Sergey Nazarov Date: Fri, 5 Mar 2021 11:05:54 +0300 Subject: [PATCH 130/164] CIPSO: Fix unaligned memory access in cipso_v4_gentag_hdr We need to use put_unaligned when writing 32-bit DOI value in 
cipso_v4_gentag_hdr to avoid unaligned memory access. v2: unneeded type cast removed as Ondrej Mosnacek suggested. Signed-off-by: Sergey Nazarov Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index be09c7669a79..bfaf327e9d12 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1153,7 +1153,7 @@ static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def, { buf[0] = IPOPT_CIPSO; buf[1] = CIPSO_V4_HDR_LEN + len; - *(__be32 *)&buf[2] = htonl(doi_def->doi); + put_unaligned_be32(doi_def->doi, &buf[2]); } /** From 69cdb7947adb816fc9325b4ec02a6dddd5070b82 Mon Sep 17 00:00:00 2001 From: Junlin Yang Date: Fri, 5 Mar 2021 16:48:39 +0800 Subject: [PATCH 131/164] ibmvnic: remove excessive irqsave ibmvnic_remove locks multiple spinlocks while disabling interrupts: spin_lock_irqsave(&adapter->state_lock, flags); spin_lock_irqsave(&adapter->rwi_lock, flags); As reported by coccinelle, the second _irqsave() overwrites the value saved in 'flags' by the first _irqsave(), therefore when the second _irqrestore() comes, the value in 'flags' is not valid; the value saved by the first _irqsave() has been lost. This likely leads to IRQs remaining disabled. So remove the second _irqsave(): spin_lock_irqsave(&adapter->state_lock, flags); spin_lock(&adapter->rwi_lock); Generated by: ./scripts/coccinelle/locks/flags.cocci ./drivers/net/ethernet/ibm/ibmvnic.c:5413:1-18: ERROR: nested lock+irqsave that reuses flags from line 5404. Fixes: 4a41c421f367 ("ibmvnic: serialize access to work queue on remove") Signed-off-by: Junlin Yang Signed-off-by: David S.
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index b6102ccf9b90..161fa95e8768 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5408,9 +5408,9 @@ static int ibmvnic_remove(struct vio_dev *dev) * after setting state, so __ibmvnic_reset() which is called * from the flush_work() below, can make progress. */ - spin_lock_irqsave(&adapter->rwi_lock, flags); + spin_lock(&adapter->rwi_lock); adapter->state = VNIC_REMOVING; - spin_unlock_irqrestore(&adapter->rwi_lock, flags); + spin_unlock(&adapter->rwi_lock); spin_unlock_irqrestore(&adapter->state_lock, flags); From 0a7e0c3b5702a6a76cf7e5b8cc10a73e51dc221e Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Fri, 5 Mar 2021 09:33:06 +0000 Subject: [PATCH 132/164] selftest/net/ipsec.c: Remove unneeded semicolon fix semicolon.cocci warning: tools/testing/selftests/net/ipsec.c:1788:2-3: Unneeded semicolon Signed-off-by: Xu Wang Signed-off-by: David S. Miller --- tools/testing/selftests/net/ipsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index 17ced7d6ce25..f23438d512c5 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -1785,7 +1785,7 @@ static void grand_child_serv(unsigned int nr, int cmd_fd, void *buf, break; default: printk("got unknown msg type %d", msg->type); - }; + } } static int grand_child_f(unsigned int nr, int cmd_fd, void *buf) From abbf9a0ef8848dca58c5b97750c1c59bbee45637 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 5 Mar 2021 17:34:41 +0800 Subject: [PATCH 133/164] r8169: fix r8168fp_adjust_ocp_cmd function The (0xBAF70000 & 0x00FFF000) << 6 should be (0xf70 << 18). 
Fixes: 561535b0f239 ("r8169: fix OCP access on RTL8117") Signed-off-by: Hayes Wang Acked-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index f704da3f214c..7aad0ba53372 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -767,7 +767,7 @@ static void r8168fp_adjust_ocp_cmd(struct rtl8169_private *tp, u32 *cmd, int typ if (type == ERIAR_OOB && (tp->mac_version == RTL_GIGA_MAC_VER_52 || tp->mac_version == RTL_GIGA_MAC_VER_53)) - *cmd |= 0x7f0 << 18; + *cmd |= 0xf70 << 18; } DECLARE_RTL_COND(rtl_eriar_cond) From 85554bcd123e307282631defe6bf6fac5031cf60 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 5 Mar 2021 19:26:22 +0300 Subject: [PATCH 134/164] net/hamradio/6pack: remove redundant check in sp_encaps() "len > sp->mtu" checked twice in a row in sp_encaps(). Remove the second check. Signed-off-by: Denis Efremov Signed-off-by: David S. Miller --- drivers/net/hamradio/6pack.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 71d6629e65c9..9f5b5614a150 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -171,11 +171,6 @@ static void sp_encaps(struct sixpack *sp, unsigned char *icp, int len) goto out_drop; } - if (len > sp->mtu) { /* sp->mtu = AX25_MTU = max. PACLEN = 256 */ - msg = "oversized transmit packet!"; - goto out_drop; - } - if (p[0] > 5) { msg = "invalid KISS command"; goto out_drop; From 155b23e6e53475ca3b8c2a946299b4d4dd6a5a1e Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 5 Mar 2021 20:02:12 +0300 Subject: [PATCH 135/164] sun/niu: fix wrong RXMAC_BC_FRM_CNT_COUNT count RXMAC_BC_FRM_CNT_COUNT added to mp->rx_bcasts twice in a row in niu_xmac_interrupt(). Remove the second addition. 
Signed-off-by: Denis Efremov Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/niu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 68695d4afacd..707ccdd03b19 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -3931,8 +3931,6 @@ static void niu_xmac_interrupt(struct niu *np) mp->rx_mcasts += RXMAC_MC_FRM_CNT_COUNT; if (val & XRXMAC_STATUS_RXBCAST_CNT_EXP) mp->rx_bcasts += RXMAC_BC_FRM_CNT_COUNT; - if (val & XRXMAC_STATUS_RXBCAST_CNT_EXP) - mp->rx_bcasts += RXMAC_BC_FRM_CNT_COUNT; if (val & XRXMAC_STATUS_RXHIST1_CNT_EXP) mp->rx_hist_cnt1 += RXMAC_HIST_CNT1_COUNT; if (val & XRXMAC_STATUS_RXHIST2_CNT_EXP) From d8861bab48b6c1fc3cdbcab8ff9d1eaea43afe7f Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Thu, 4 Mar 2021 20:52:52 +0100 Subject: [PATCH 136/164] gianfar: fix jumbo packets+napi+rx overrun crash When using jumbo packets and overrunning rx queue with napi enabled, the following sequence is observed in gfar_add_rx_frag: | lstatus | | skb | t | lstatus, size, flags | first | len, data_len, *ptr | ---+--------------------------------------+-------+-----------------------+ 13 | 18002348, 9032, INTERRUPT LAST | 0 | 9600, 8000, f554c12e | 12 | 10000640, 1600, INTERRUPT | 0 | 8000, 6400, f554c12e | 11 | 10000640, 1600, INTERRUPT | 0 | 6400, 4800, f554c12e | 10 | 10000640, 1600, INTERRUPT | 0 | 4800, 3200, f554c12e | 09 | 10000640, 1600, INTERRUPT | 0 | 3200, 1600, f554c12e | 08 | 14000640, 1600, INTERRUPT FIRST | 0 | 1600, 0, f554c12e | 07 | 14000640, 1600, INTERRUPT FIRST | 1 | 0, 0, f554c12e | 06 | 1c000080, 128, INTERRUPT LAST FIRST | 1 | 0, 0, abf3bd6e | 05 | 18002348, 9032, INTERRUPT LAST | 0 | 8000, 6400, c5a57780 | 04 | 10000640, 1600, INTERRUPT | 0 | 6400, 4800, c5a57780 | 03 | 10000640, 1600, INTERRUPT | 0 | 4800, 3200, c5a57780 | 02 | 10000640, 1600, INTERRUPT | 0 | 3200, 1600, c5a57780 | 01 | 10000640, 1600, INTERRUPT | 0 | 1600, 0, c5a57780 
| 00 | 14000640, 1600, INTERRUPT FIRST | 1 | 0, 0, c5a57780 | So at t=7 a new packet is started but not finished, probably due to rx overrun - but rx overrun is not indicated in the flags. Instead a new packet starts at t=8. This results in skb->len exceeding the size for the LAST fragment at t=13 and thus a negative fragment size being added to the skb. This then crashes: kernel BUG at include/linux/skbuff.h:2277! Oops: Exception in kernel mode, sig: 5 [#1] ... NIP [c04689f4] skb_pull+0x2c/0x48 LR [c03f62ac] gfar_clean_rx_ring+0x2e4/0x844 Call Trace: [ec4bfd38] [c06a84c4] _raw_spin_unlock_irqrestore+0x60/0x7c (unreliable) [ec4bfda8] [c03f6a44] gfar_poll_rx_sq+0x48/0xe4 [ec4bfdc8] [c048d504] __napi_poll+0x54/0x26c [ec4bfdf8] [c048d908] net_rx_action+0x138/0x2c0 [ec4bfe68] [c06a8f34] __do_softirq+0x3a4/0x4fc [ec4bfed8] [c0040150] run_ksoftirqd+0x58/0x70 [ec4bfee8] [c0066ecc] smpboot_thread_fn+0x184/0x1cc [ec4bff08] [c0062718] kthread+0x140/0x144 [ec4bff38] [c0012350] ret_from_kernel_thread+0x14/0x1c This patch fixes this by checking for computed LAST fragment size, so a negative sized fragment is never added. In order to prevent the newer rx frame from getting corrupted, the FIRST flag is checked to discard the incomplete older frame. Signed-off-by: Michael Braun Signed-off-by: David S.
Miller --- drivers/net/ethernet/freescale/gianfar.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 541de32ea662..1cf8ef717453 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -2390,6 +2390,10 @@ static bool gfar_add_rx_frag(struct gfar_rx_buff *rxb, u32 lstatus, if (lstatus & BD_LFLAG(RXBD_LAST)) size -= skb->len; + WARN(size < 0, "gianfar: rx fragment size underflow"); + if (size < 0) + return false; + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, rxb->page_offset + RXBUF_ALIGNMENT, size, GFAR_RXB_TRUESIZE); @@ -2552,6 +2556,17 @@ static int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, if (lstatus & BD_LFLAG(RXBD_EMPTY)) break; + /* lost RXBD_LAST descriptor due to overrun */ + if (skb && + (lstatus & BD_LFLAG(RXBD_FIRST))) { + /* discard faulty buffer */ + dev_kfree_skb(skb); + skb = NULL; + rx_queue->stats.rx_dropped++; + + /* can continue normally */ + } + /* order rx buffer descriptor reads */ rmb(); From 3e21a10fdea3c2e4e4d1b72cb9d720256461af40 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Fri, 5 Mar 2021 16:24:45 -0600 Subject: [PATCH 137/164] lan743x: trim all 4 bytes of the FCS; not just 2 Trim all 4 bytes of the received FCS; not just 2 of them. Leaving 2 bytes of the FCS on the frame breaks DSA tailing tag drivers. Fixes: a8db76d40e4d ("lan743x: boost performance on cpu archs w/o dma cache snooping") Signed-off-by: George McCollister Signed-off-by: David S. 
Miller --- drivers/net/ethernet/microchip/lan743x_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index dbdfabff3b00..1c3e204d727c 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -2040,7 +2040,7 @@ lan743x_rx_trim_skb(struct sk_buff *skb, int frame_length) dev_kfree_skb_irq(skb); return NULL; } - frame_length = max_t(int, 0, frame_length - RX_HEAD_PADDING - 2); + frame_length = max_t(int, 0, frame_length - RX_HEAD_PADDING - 4); if (skb->len > frame_length) { skb->tail -= skb->len - frame_length; skb->len = frame_length; From a4dcfbc4ee2218abd567d81d795082d8d4afcdf6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 5 Mar 2021 14:17:29 -0800 Subject: [PATCH 138/164] ethernet: alx: fix order of calls on resume netif_device_attach() will unpause the queues so we can't call it before __alx_open(). This went undetected until commit b0999223f224 ("alx: add ability to allocate and free alx_napi structures") but now if stack tries to xmit immediately on resume before __alx_open() we'll crash on the NAPI being null: BUG: kernel NULL pointer dereference, address: 0000000000000198 CPU: 0 PID: 12 Comm: ksoftirqd/0 Tainted: G OE 5.10.0-3-amd64 #1 Debian 5.10.13-1 Hardware name: Gigabyte Technology Co., Ltd. 
To be filled by O.E.M./H77-D3H, BIOS F15 11/14/2013 RIP: 0010:alx_start_xmit+0x34/0x650 [alx] Code: 41 56 41 55 41 54 55 53 48 83 ec 20 0f b7 57 7c 8b 8e b0 0b 00 00 39 ca 72 06 89 d0 31 d2 f7 f1 89 d2 48 8b 84 df RSP: 0018:ffffb09240083d28 EFLAGS: 00010297 RAX: 0000000000000000 RBX: ffffa04d80ae7800 RCX: 0000000000000004 RDX: 0000000000000000 RSI: ffffa04d80afa000 RDI: ffffa04e92e92a00 RBP: 0000000000000042 R08: 0000000000000100 R09: ffffa04ea3146700 R10: 0000000000000014 R11: 0000000000000000 R12: ffffa04e92e92100 R13: 0000000000000001 R14: ffffa04e92e92a00 R15: ffffa04e92e92a00 FS: 0000000000000000(0000) GS:ffffa0508f600000(0000) knlGS:0000000000000000 i915 0000:00:02.0: vblank wait timed out on crtc 0 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000198 CR3: 000000004460a001 CR4: 00000000001706f0 Call Trace: dev_hard_start_xmit+0xc7/0x1e0 sch_direct_xmit+0x10f/0x310 Cc: # 4.9+ Fixes: bc2bebe8de8e ("alx: remove WoL support") Reported-by: Zbynek Michl Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=983595 Signed-off-by: Jakub Kicinski Tested-by: Zbynek Michl Signed-off-by: David S. 
Miller --- drivers/net/ethernet/atheros/alx/main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c index 9b7f1af5f574..9e02f8864593 100644 --- a/drivers/net/ethernet/atheros/alx/main.c +++ b/drivers/net/ethernet/atheros/alx/main.c @@ -1894,13 +1894,16 @@ static int alx_resume(struct device *dev) if (!netif_running(alx->dev)) return 0; - netif_device_attach(alx->dev); rtnl_lock(); err = __alx_open(alx, true); rtnl_unlock(); + if (err) + return err; - return err; + netif_device_attach(alx->dev); + + return 0; } static SIMPLE_DEV_PM_OPS(alx_pm_ops, alx_suspend, alx_resume); From bf9279cd63dcc144b2a3c4c76d8b6b4c30b05c22 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 5 Mar 2021 10:14:48 +0100 Subject: [PATCH 139/164] net: dsa: bcm_sf2: simplify optional reset handling As of commit bb475230b8e5 ("reset: make optional functions really optional"), the reset framework API calls use NULL pointers to describe optional, non-present reset controls. This allows to unconditionally return errors from devm_reset_control_get_optional_exclusive. Signed-off-by: Philipp Zabel Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 5ee8103b8e9c..f277df922fcd 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -406,7 +406,7 @@ static int bcm_sf2_sw_rst(struct bcm_sf2_priv *priv) /* The watchdog reset does not work on 7278, we need to hit the * "external" reset line through the reset controller. 
*/ - if (priv->type == BCM7278_DEVICE_ID && !IS_ERR(priv->rcdev)) { + if (priv->type == BCM7278_DEVICE_ID) { ret = reset_control_assert(priv->rcdev); if (ret) return ret; @@ -1265,7 +1265,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) priv->rcdev = devm_reset_control_get_optional_exclusive(&pdev->dev, "switch"); - if (PTR_ERR(priv->rcdev) == -EPROBE_DEFER) + if (IS_ERR(priv->rcdev)) return PTR_ERR(priv->rcdev); /* Auto-detection using standard registers will not work, so @@ -1426,7 +1426,7 @@ static int bcm_sf2_sw_remove(struct platform_device *pdev) bcm_sf2_mdio_unregister(priv); clk_disable_unprepare(priv->clk_mdiv); clk_disable_unprepare(priv->clk); - if (priv->type == BCM7278_DEVICE_ID && !IS_ERR(priv->rcdev)) + if (priv->type == BCM7278_DEVICE_ID) reset_control_assert(priv->rcdev); return 0; From a4813dc7baa4898f66c84ef68274bbbd1a0ae224 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sun, 7 Mar 2021 02:50:28 +0530 Subject: [PATCH 140/164] net: ethernet: chelsio: inline_crypto: Mundane typos fixed throughout the file chcr_ktls.c Mundane typos fixes throughout the file. s/establised/established/ s/availbale/available/ s/vaues/values/ s/Incase/In case/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: David S. Miller --- .../ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 46a809f2aeca..169e10c91378 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -672,7 +672,7 @@ static int chcr_ktls_cpl_act_open_rpl(struct adapter *adap, if (tx_info->pending_close) { spin_unlock(&tx_info->lock); if (!status) { - /* it's a late success, tcb status is establised, + /* it's a late success, tcb status is established, * mark it close. 
*/ chcr_ktls_mark_tcb_close(tx_info); @@ -930,7 +930,7 @@ chcr_ktls_get_tx_flits(u32 nr_frags, unsigned int key_ctx_len) } /* - * chcr_ktls_check_tcp_options: To check if there is any TCP option availbale + * chcr_ktls_check_tcp_options: To check if there is any TCP option available * other than timestamp. * @skb - skb contains partial record.. * return: 1 / 0 @@ -1115,7 +1115,7 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, } if (unlikely(credits < ETHTXQ_STOP_THRES)) { - /* Credits are below the threshold vaues, stop the queue after + /* Credits are below the threshold values, stop the queue after * injecting the Work Request for this packet. */ chcr_eth_txq_stop(q); @@ -2006,7 +2006,7 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) /* TCP segments can be in received either complete or partial. * chcr_end_part_handler will handle cases if complete record or end - * part of the record is received. Incase of partial end part of record, + * part of the record is received. In case of partial end part of record, * we will send the complete record again. */ From 492bbe7f8a43ff20bb9bfc6b98220dcfb7e5992f Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Sat, 6 Mar 2021 14:12:31 -0800 Subject: [PATCH 141/164] net: usb: cdc_ncm: emit dev_err on error paths Several error paths in bind/probe code will only emit output using dev_dbg. But if we are going to fail the bind/probe, emit related output with "err" priority. Signed-off-by: Grant Grundler Signed-off-by: David S. 
Miller --- drivers/net/usb/cdc_ncm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 4087c9e33781..8acf30115428 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -851,17 +851,17 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ /* check if we got everything */ if (!ctx->data) { - dev_dbg(&intf->dev, "CDC Union missing and no IAD found\n"); + dev_err(&intf->dev, "CDC Union missing and no IAD found\n"); goto error; } if (cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) { if (!ctx->mbim_desc) { - dev_dbg(&intf->dev, "MBIM functional descriptor missing\n"); + dev_err(&intf->dev, "MBIM functional descriptor missing\n"); goto error; } } else { if (!ctx->ether_desc || !ctx->func_desc) { - dev_dbg(&intf->dev, "NCM or ECM functional descriptors missing\n"); + dev_err(&intf->dev, "NCM or ECM functional descriptors missing\n"); goto error; } } @@ -870,7 +870,7 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ if (ctx->data != ctx->control) { temp = usb_driver_claim_interface(driver, ctx->data, dev); if (temp) { - dev_dbg(&intf->dev, "failed to claim data intf\n"); + dev_err(&intf->dev, "failed to claim data intf\n"); goto error; } } @@ -926,7 +926,7 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ if (ctx->ether_desc) { temp = usbnet_get_ethernet_addr(dev, ctx->ether_desc->iMACAddress); if (temp) { - dev_dbg(&intf->dev, "failed to get mac address\n"); + dev_err(&intf->dev, "failed to get mac address\n"); goto error2; } dev_info(&intf->dev, "MAC-Address: %pM\n", dev->net->dev_addr); From 4d8c79b7e9ff05030aad68421f7584b129933ba6 Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Sat, 6 Mar 2021 14:12:32 -0800 Subject: [PATCH 142/164] net: usb: log errors to dmesg/syslog Errors in protocol should be logged when the driver aborts operations. 
If the driver can carry on and "humor" the device, then emitting the message as debug output level is fine. Signed-off-by: Grant Grundler Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index b4c8080e6f87..f4f37ecfed58 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -887,7 +887,7 @@ int usbnet_open (struct net_device *net) // insist peer be connected if (info->check_connect && (retval = info->check_connect (dev)) < 0) { - netif_dbg(dev, ifup, dev->net, "can't open; %d\n", retval); + netif_err(dev, ifup, dev->net, "can't open; %d\n", retval); goto done; } From 143c253f42bad20357e7e4432087aca747c43384 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 7 Mar 2021 00:40:12 -0800 Subject: [PATCH 143/164] net: hisilicon: hns: fix error return code of hns_nic_clear_all_rx_fetch() When hns_assemble_skb() returns NULL to skb, no error return code of hns_nic_clear_all_rx_fetch() is assigned. To fix this bug, ret is assigned with -ENOMEM in this case. Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index 5d7824d2b4d4..c66a7a51198e 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -1663,8 +1663,10 @@ static int hns_nic_clear_all_rx_fetch(struct net_device *ndev) for (j = 0; j < fetch_num; j++) { /* alloc one skb and init */ skb = hns_assemble_skb(ndev); - if (!skb) + if (!skb) { + ret = -ENOMEM; goto out; + } rd = &tx_ring_data(priv, skb->queue_mapping); hns_nic_net_xmit_hw(ndev, skb, rd); From 62765d39553cfd1ad340124fe1e280450e8c89e2 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 7 Mar 2021 01:12:56 -0800 Subject: [PATCH 144/164] net: wan: fix error return code of uhdlc_init() When priv->rx_skbuff or priv->tx_skbuff is NULL, no error return code of uhdlc_init() is assigned. To fix this bug, ret is assigned with -ENOMEM in these cases. Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. 
Miller --- drivers/net/wan/fsl_ucc_hdlc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index dca97cd7c4e7..7eac6a3e1cde 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -204,14 +204,18 @@ static int uhdlc_init(struct ucc_hdlc_private *priv) priv->rx_skbuff = kcalloc(priv->rx_ring_size, sizeof(*priv->rx_skbuff), GFP_KERNEL); - if (!priv->rx_skbuff) + if (!priv->rx_skbuff) { + ret = -ENOMEM; goto free_ucc_pram; + } priv->tx_skbuff = kcalloc(priv->tx_ring_size, sizeof(*priv->tx_skbuff), GFP_KERNEL); - if (!priv->tx_skbuff) + if (!priv->tx_skbuff) { + ret = -ENOMEM; goto free_rx_skbuff; + } priv->skb_curtx = 0; priv->skb_dirtytx = 0; From 03cbb87054c17b50a6ead63ed3ab02e094a785b1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 7 Mar 2021 12:21:56 +0200 Subject: [PATCH 145/164] net: dsa: fix switchdev objects on bridge master mistakenly being applied on ports Tobias reports that after the blamed patch, VLAN objects being added to a bridge device are being added to all slave ports instead (swp2, swp3). ip link add br0 type bridge vlan_filtering 1 ip link set swp2 master br0 ip link set swp3 master br0 bridge vlan add dev br0 vid 100 self This is because the fix was too broad: we made dsa_port_offloads_netdev say "yes, I offload the br0 bridge" for all slave ports, but we didn't add the checks whether the switchdev object was in fact meant for the physical port or for the bridge itself. So we are reacting on events in a way in which we shouldn't. The reason why the fix was too broad is because the question itself, "does this DSA port offload this netdev", was too broad in the first place. 
The solution is to disambiguate the question and separate it into two different functions, one to be called for each switchdev attribute / object that has an orig_dev == net_bridge (dsa_port_offloads_bridge), and the other for orig_dev == net_bridge_port (*_offloads_bridge_port). In the case of VLAN objects on the bridge interface, this solves the problem because we know that VLAN objects are per bridge port and not per bridge. And when orig_dev is equal to the net_bridge, we offload it as a bridge, but not as a bridge port; that's how we are able to skip reacting on those events. Note that this is compatible with future plans to have explicit offloading of VLAN objects on the bridge interface as a bridge port (in DSA, this signifies that we should add that VLAN towards the CPU port). Fixes: 99b8202b179f ("net: dsa: fix SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING getting ignored") Reported-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Reviewed-by: Tobias Waldekranz Tested-by: Tobias Waldekranz Signed-off-by: David S. 
Miller --- net/dsa/dsa_priv.h | 25 +++++++++++--------- net/dsa/slave.c | 59 +++++++++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 2eeaa42f2e08..9d4b0e9b1aa1 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -230,8 +230,8 @@ int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; -static inline bool dsa_port_offloads_netdev(struct dsa_port *dp, - struct net_device *dev) +static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, + struct net_device *dev) { /* Switchdev offloading can be configured on: */ @@ -241,12 +241,6 @@ static inline bool dsa_port_offloads_netdev(struct dsa_port *dp, */ return true; - if (dp->bridge_dev == dev) - /* DSA ports connected to a bridge, and event was emitted - * for the bridge. - */ - return true; - if (dp->lag_dev == dev) /* DSA ports connected to a bridge via a LAG */ return true; @@ -254,14 +248,23 @@ static inline bool dsa_port_offloads_netdev(struct dsa_port *dp, return false; } +static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + /* DSA ports connected to a bridge, and event was emitted + * for the bridge. 
+ */ + return dp->bridge_dev == bridge_dev; +} + /* Returns true if any port of this tree offloads the given net_device */ -static inline bool dsa_tree_offloads_netdev(struct dsa_switch_tree *dst, - struct net_device *dev) +static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, + struct net_device *dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) - if (dsa_port_offloads_netdev(dp, dev)) + if (dsa_port_offloads_bridge_port(dp, dev)) return true; return false; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 491e3761b5f4..992fcab4b552 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -278,28 +278,43 @@ static int dsa_slave_port_attr_set(struct net_device *dev, struct dsa_port *dp = dsa_slave_to_port(dev); int ret; - if (!dsa_port_offloads_netdev(dp, attr->orig_dev)) - return -EOPNOTSUPP; - switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: + if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) + return -EOPNOTSUPP; + ret = dsa_port_set_state(dp, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: + if (!dsa_port_offloads_bridge(dp, attr->orig_dev)) + return -EOPNOTSUPP; + ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: + if (!dsa_port_offloads_bridge(dp, attr->orig_dev)) + return -EOPNOTSUPP; + ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: + if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) + return -EOPNOTSUPP; + ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: + if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) + return -EOPNOTSUPP; + ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: + if (!dsa_port_offloads_bridge(dp, attr->orig_dev)) + return -EOPNOTSUPP; + ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, extack); 
break; default: @@ -341,9 +356,6 @@ static int dsa_slave_vlan_add(struct net_device *dev, struct switchdev_obj_port_vlan vlan; int err; - if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) - return -EOPNOTSUPP; - if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; @@ -391,27 +403,36 @@ static int dsa_slave_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: - if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; + err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: + if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) + return -EOPNOTSUPP; + /* DSA can directly translate this to a normal MDB add, * but on the CPU port. */ err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: + if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_slave_vlan_add(dev, obj, extack); break; case SWITCHDEV_OBJ_ID_MRP: - if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) return -EOPNOTSUPP; + err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: - if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) return -EOPNOTSUPP; + err = dsa_port_mrp_add_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; @@ -431,9 +452,6 @@ static int dsa_slave_vlan_del(struct net_device *dev, struct switchdev_obj_port_vlan *vlan; int err; - if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) - return -EOPNOTSUPP; - if (dsa_port_skip_vlan_configuration(dp)) return 0; @@ -459,27 +477,36 @@ static int dsa_slave_port_obj_del(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: - if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + if 
(!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; + err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: + if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) + return -EOPNOTSUPP; + /* DSA can directly translate this to a normal MDB add, * but on the CPU port. */ err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: + if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_slave_vlan_del(dev, obj); break; case SWITCHDEV_OBJ_ID_MRP: - if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) return -EOPNOTSUPP; + err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: - if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) return -EOPNOTSUPP; + err = dsa_port_mrp_del_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; @@ -2298,7 +2325,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, * other ports bridged with the LAG should be able to * autonomously forward towards it. */ - if (dsa_tree_offloads_netdev(dp->ds->dst, dev)) + if (dsa_tree_offloads_bridge_port(dp->ds->dst, dev)) return NOTIFY_DONE; } From ac88c531a5b38877eba2365a3f28f0c8b513dc33 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 7 Mar 2021 13:17:47 +0000 Subject: [PATCH 146/164] net: davicom: Fix regulator not turned off on failed probe When the probe fails or requests to be deferred, we must disable the regulator that was previously enabled. Fixes: 7994fe55a4a2 ("dm9000: Add regulator and reset support to dm9000") Signed-off-by: Paul Cercueil Signed-off-by: David S. 
Miller --- drivers/net/ethernet/davicom/dm9000.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index 3fdc70dab5c1..ae744826bb9e 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1449,7 +1449,7 @@ dm9000_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "failed to request reset gpio %d: %d\n", reset_gpios, ret); - return -ENODEV; + goto out_regulator_disable; } /* According to manual PWRST# Low Period Min 1ms */ @@ -1461,8 +1461,10 @@ dm9000_probe(struct platform_device *pdev) if (!pdata) { pdata = dm9000_parse_dt(&pdev->dev); - if (IS_ERR(pdata)) - return PTR_ERR(pdata); + if (IS_ERR(pdata)) { + ret = PTR_ERR(pdata); + goto out_regulator_disable; + } } /* Init network device */ @@ -1703,6 +1705,10 @@ out: dm9000_release_board(pdev, db); free_netdev(ndev); +out_regulator_disable: + if (!IS_ERR(power)) + regulator_disable(power); + return ret; } From cf9e60aa69ae6c40d3e3e4c94dd6c8de31674e9b Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 7 Mar 2021 13:17:48 +0000 Subject: [PATCH 147/164] net: davicom: Fix regulator not turned off on driver removal We must disable the regulator that was enabled in the probe function. Fixes: 7994fe55a4a2 ("dm9000: Add regulator and reset support to dm9000") Signed-off-by: Paul Cercueil Signed-off-by: David S. 
Miller --- drivers/net/ethernet/davicom/dm9000.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index ae744826bb9e..a95e95ce9438 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -133,6 +133,8 @@ struct board_info { u32 wake_state; int ip_summed; + + struct regulator *power_supply; }; /* debug code */ @@ -1481,6 +1483,8 @@ dm9000_probe(struct platform_device *pdev) db->dev = &pdev->dev; db->ndev = ndev; + if (!IS_ERR(power)) + db->power_supply = power; spin_lock_init(&db->lock); mutex_init(&db->addr_lock); @@ -1766,10 +1770,13 @@ static int dm9000_drv_remove(struct platform_device *pdev) { struct net_device *ndev = platform_get_drvdata(pdev); + struct board_info *dm = to_dm9000_board(ndev); unregister_netdev(ndev); - dm9000_release_board(pdev, netdev_priv(ndev)); + dm9000_release_board(pdev, dm); free_netdev(ndev); /* free device structure */ + if (dm->power_supply) + regulator_disable(dm->power_supply); dev_dbg(&pdev->dev, "released and freed device\n"); return 0; From 2e2696223676d56db1a93acfca722c1b96cd552d Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 7 Mar 2021 13:17:49 +0000 Subject: [PATCH 148/164] net: davicom: Use platform_get_irq_optional() The second IRQ line really is optional, so use platform_get_irq_optional() to obtain it. Signed-off-by: Paul Cercueil Signed-off-by: David S. 
Miller --- drivers/net/ethernet/davicom/dm9000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index a95e95ce9438..252adfa5d837 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1507,7 +1507,7 @@ dm9000_probe(struct platform_device *pdev) goto out; } - db->irq_wake = platform_get_irq(pdev, 1); + db->irq_wake = platform_get_irq_optional(pdev, 1); if (db->irq_wake >= 0) { dev_dbg(db->dev, "wakeup irq %d\n", db->irq_wake); From 1b2395dfff5bb40228a187f21f577cd90673d344 Mon Sep 17 00:00:00 2001 From: Alex Marginean Date: Sun, 7 Mar 2021 15:23:38 +0200 Subject: [PATCH 149/164] net: enetc: set MAC RX FIFO to recommended value On LS1028A, the MAC RX FIFO defaults to the value 2, which is too high and may lead to RX lock-up under traffic at a rate higher than 6 Gbps. Set it to 1 instead, as recommended by the hardware design team and by later versions of the ENETC block guide. Signed-off-by: Alex Marginean Reviewed-by: Claudiu Manoil Reviewed-by: Jason Liu Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/enetc/enetc_hw.h | 2 ++ drivers/net/ethernet/freescale/enetc/enetc_pf.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index de0d20b0f489..00938f7960a4 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -234,6 +234,8 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_PM0_MAXFRM 0x8014 #define ENETC_SET_TX_MTU(val) ((val) << 16) #define ENETC_SET_MAXFRM(val) ((val) & 0xffff) +#define ENETC_PM0_RX_FIFO 0x801c +#define ENETC_PM0_RX_FIFO_VAL 1 #define ENETC_PM_IMDIO_BASE 0x8030 diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index ca02f033bea2..224fc37a6757 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -490,6 +490,12 @@ static void enetc_configure_port_mac(struct enetc_hw *hw) enetc_port_wr(hw, ENETC_PM1_CMD_CFG, ENETC_PM0_CMD_PHY_TX_EN | ENETC_PM0_CMD_TXP | ENETC_PM0_PROMISC); + + /* On LS1028A, the MAC RX FIFO defaults to 2, which is too high + * and may lead to RX lock-up under traffic. Set it to 1 instead, + * as recommended by the hardware team. + */ + enetc_port_wr(hw, ENETC_PM0_RX_FIFO, ENETC_PM0_RX_FIFO_VAL); } static void enetc_mac_config(struct enetc_hw *hw, phy_interface_t phy_mode) From 29d98f54a4fe1b6a9089bec8715a1b89ff9ad59c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 7 Mar 2021 15:23:39 +0200 Subject: [PATCH 150/164] net: enetc: allow hardware timestamping on TX queues with tc-etf enabled The txtime is passed to the driver in skb->skb_mstamp_ns, which is actually in a union with skb->tstamp (the place where software timestamps are kept). 
Since commit b50a5c70ffa4 ("net: allow simultaneous SW and HW transmit timestamping"), __sock_recv_timestamp has some logic for making sure that the two calls to skb_tstamp_tx: skb_tx_timestamp(skb) # Software timestamp in the driver -> skb_tstamp_tx(skb, NULL) and skb_tstamp_tx(skb, &shhwtstamps) # Hardware timestamp in the driver will both do the right thing and in a race-free manner, meaning that skb_tx_timestamp will deliver a cmsg with the software timestamp only, and skb_tstamp_tx with a non-NULL hwtstamps argument will deliver a cmsg with the hardware timestamp only. Why are races even possible? Well, because although the software timestamp skb->tstamp is private per skb, the hardware timestamp skb_hwtstamps(skb) lives in skb_shinfo(skb), an area which is shared between skbs and their clones. And skb_tstamp_tx works by cloning the packets when timestamping them, therefore attempting to perform hardware timestamping on an skb's clone will also change the hardware timestamp of the original skb. And the original skb might have been yet again cloned for software timestamping, at an earlier stage. So the logic in __sock_recv_timestamp can't be as simple as saying "does this skb have a hardware timestamp? if yes I'll send the hardware timestamp to the socket, otherwise I'll send the software timestamp", precisely because the hardware timestamp is shared. Instead, it's quite the other way around: __sock_recv_timestamp says "does this skb have a software timestamp? if yes, I'll send the software timestamp, otherwise the hardware one". This works because the software timestamp is not shared with clones. But that means we have a problem when we attempt hardware timestamping with skbs that don't have the skb->tstamp == 0. __sock_recv_timestamp will say "oh, yeah, this must be some sort of odd clone" and will not deliver the hardware timestamp to the socket. 
And this is exactly what is happening when we have txtime enabled on the socket: as mentioned, that is put in a union with skb->tstamp, so it is quite easy to mistake it. Do what other drivers do (intel igb/igc) and write zero to skb->tstamp before taking the hardware timestamp. It's of no use to us now (we're already on the TX confirmation path). Fixes: 0d08c9ec7d6e ("enetc: add support time specific departure base on the qos etf") Cc: Vinicius Costa Gomes Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 30d7d4e83900..09471329f3a3 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -344,6 +344,12 @@ static void enetc_tstamp_tx(struct sk_buff *skb, u64 tstamp) if (skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) { memset(&shhwtstamps, 0, sizeof(shhwtstamps)); shhwtstamps.hwtstamp = ns_to_ktime(tstamp); + /* Ensure skb_mstamp_ns, which might have been populated with + * the txtime, is not mistaken for a software timestamp, + * because this will prevent the dispatch of our hardware + * timestamp to the socket. + */ + skb->tstamp = ktime_set(0, 0); skb_tstamp_tx(skb, &shhwtstamps); } } From 2055a99da8a253a357bdfd359b3338ef3375a26c Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 7 Mar 2021 19:11:02 -0800 Subject: [PATCH 151/164] net: bonding: fix error return code of bond_neigh_init() When slave is NULL or slave_ops->ndo_neigh_setup is NULL, no error return code of bond_neigh_init() is assigned. To fix this bug, ret is assigned with -EINVAL in these cases. Fixes: 9e99bfefdbce ("bonding: fix bond_neigh_init()") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 74cbbb22470b..456315bef3a8 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3978,11 +3978,15 @@ static int bond_neigh_init(struct neighbour *n) rcu_read_lock(); slave = bond_first_slave_rcu(bond); - if (!slave) + if (!slave) { + ret = -EINVAL; goto out; + } slave_ops = slave->dev->netdev_ops; - if (!slave_ops->ndo_neigh_setup) + if (!slave_ops->ndo_neigh_setup) { + ret = -EINVAL; goto out; + } /* TODO: find another way [1] to implement this. * Passing a zeroed structure is fragile, From 27ab92d9996e4e003a726d22c56d780a1655d6b4 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 8 Mar 2021 10:00:04 +0100 Subject: [PATCH 152/164] mptcp: fix length of ADD_ADDR with port sub-option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit in current Linux, MPTCP peers advertising endpoints with port numbers use a sub-option length that wrongly accounts for the trailing TCP NOP. Also, receivers will only process incoming ADD_ADDR with port having such wrong sub-option length. Fix this, making ADD_ADDR compliant to RFC8684 §3.4.1. 
this can be verified running tcpdump on the kselftests artifacts: unpatched kernel: [root@bottarga mptcp]# tcpdump -tnnr unpatched.pcap | grep add-addr reading from file unpatched.pcap, link-type LINUX_SLL (Linux cooked v1), snapshot length 65535 IP 10.0.1.1.10000 > 10.0.1.2.53078: Flags [.], ack 101, win 509, options [nop,nop,TS val 214459678 ecr 521312851,mptcp add-addr v1 id 1 a00:201:2774:2d88:7436:85c3:17fd:101], length 0 IP 10.0.1.2.53078 > 10.0.1.1.10000: Flags [.], ack 101, win 502, options [nop,nop,TS val 521312852 ecr 214459678,mptcp add-addr[bad opt]] patched kernel: [root@bottarga mptcp]# tcpdump -tnnr patched.pcap | grep add-addr reading from file patched.pcap, link-type LINUX_SLL (Linux cooked v1), snapshot length 65535 IP 10.0.1.1.10000 > 10.0.1.2.38178: Flags [.], ack 101, win 509, options [nop,nop,TS val 3728873902 ecr 2732713192,mptcp add-addr v1 id 1 10.0.2.1:10100 hmac 0xbccdfcbe59292a1f,nop,nop], length 0 IP 10.0.1.2.38178 > 10.0.1.1.10000: Flags [.], ack 101, win 502, options [nop,nop,TS val 2732713195 ecr 3728873902,mptcp add-addr v1-echo id 1 10.0.2.1:10100,nop,nop], length 0 Fixes: 22fb85ffaefb ("mptcp: add port support for ADD_ADDR suboption writing") CC: stable@vger.kernel.org # 5.11+ Reviewed-by: Mat Martineau Acked-and-tested-by: Geliang Tang Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/mptcp/protocol.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 91827d949766..e21a5bc36cf0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -52,14 +52,15 @@ #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_ADD_ADDR 16 -#define TCPOLEN_MPTCP_ADD_ADDR_PORT 20 +#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 #define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 -#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 12 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 #define TCPOLEN_MPTCP_ADD_ADDR6 28 -#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 32 +#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 -#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 -#define TCPOLEN_MPTCP_PORT_LEN 4 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 +#define TCPOLEN_MPTCP_PORT_LEN 2 +#define TCPOLEN_MPTCP_PORT_ALIGN 2 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 @@ -701,8 +702,9 @@ static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; if (!echo) len += MPTCPOPT_THMAC_LEN; + /* account for 2 trailing 'nop' options */ if (port) - len += TCPOLEN_MPTCP_PORT_LEN; + len += TCPOLEN_MPTCP_PORT_LEN + TCPOLEN_MPTCP_PORT_ALIGN; return len; } From 179d0ba0c454057a65929c46af0d6ad986754781 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Mon, 8 Mar 2021 01:13:55 -0800 Subject: [PATCH 153/164] net: qrtr: fix error return code of qrtr_sendmsg() When sock_alloc_send_skb() returns NULL to skb, no error return code of qrtr_sendmsg() is assigned. To fix this bug, rc is assigned with -ENOMEM in this case. Fixes: 194ccc88297a ("net: qrtr: Support decoding incoming v2 packets") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. 
Miller --- net/qrtr/qrtr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 82d2eb8c21d1..edb6ac17ceca 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -958,8 +958,10 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) plen = (len + 3) & ~3; skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_MAX_SIZE, msg->msg_flags & MSG_DONTWAIT, &rc); - if (!skb) + if (!skb) { + rc = -ENOMEM; goto out_node; + } skb_reserve(skb, QRTR_HDR_MAX_SIZE); From 1019d7923d9d4cc878a1a85d4fc2d6619cfe1a6a Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 7 Mar 2021 22:25:28 -0500 Subject: [PATCH 154/164] atm: fix a typo in the struct description phy_data means private PHY data not date Signed-off-by: Tong Zhang Signed-off-by: David S. Miller --- include/linux/atmdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 60cd25c0461b..9b02961d65ee 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -151,7 +151,7 @@ struct atm_dev { const char *type; /* device type name */ int number; /* device index */ void *dev_data; /* per-device data */ - void *phy_data; /* private PHY date */ + void *phy_data; /* private PHY data */ unsigned long flags; /* device flags (ATM_DF_*) */ struct list_head local; /* local ATM addresses */ struct list_head lecs; /* LECS ATM addresses learned via ILMI */ From 3153724fc084d8ef640c611f269ddfb576d1dcb1 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 7 Mar 2021 22:25:29 -0500 Subject: [PATCH 155/164] atm: uPD98402: fix incorrect allocation dev->dev_data is set in zatm.c; calling zatm_start() will overwrite this dev->dev_data in uPD98402_start(), and a subsequent PRIV(dev)->lock (i.e. dev->phy_data->lock) will result in a null-ptr-dereference. I believe this is a typo and what it actually wants to do is to allocate phy_data instead of dev_data. Signed-off-by: Tong Zhang Signed-off-by: David S. 
Miller --- drivers/atm/uPD98402.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/atm/uPD98402.c b/drivers/atm/uPD98402.c index 7850758b5bb8..239852d85558 100644 --- a/drivers/atm/uPD98402.c +++ b/drivers/atm/uPD98402.c @@ -211,7 +211,7 @@ static void uPD98402_int(struct atm_dev *dev) static int uPD98402_start(struct atm_dev *dev) { DPRINTK("phy_start\n"); - if (!(dev->dev_data = kmalloc(sizeof(struct uPD98402_priv),GFP_KERNEL))) + if (!(dev->phy_data = kmalloc(sizeof(struct uPD98402_priv),GFP_KERNEL))) return -ENOMEM; spin_lock_init(&PRIV(dev)->lock); memset(&PRIV(dev)->sonet_stats,0,sizeof(struct k_sonet_stats)); From 4416e98594dc04590ebc498fc4e530009535c511 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 7 Mar 2021 22:25:30 -0500 Subject: [PATCH 156/164] atm: idt77252: fix null-ptr-dereference This one is similar to the phy_data allocation fix in uPD98402: the driver allocates the idt77105_priv and stores it in dev_data, but later dereferences dev->phy_data (via PRIV(dev)), which will cause a null-ptr-dereference. Fix this issue by changing dev_data to phy_data so that PRIV(dev) can work correctly. Signed-off-by: Tong Zhang Signed-off-by: David S. 
Miller --- drivers/atm/idt77105.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c index 3c081b6171a8..bfca7b8a6f31 100644 --- a/drivers/atm/idt77105.c +++ b/drivers/atm/idt77105.c @@ -262,7 +262,7 @@ static int idt77105_start(struct atm_dev *dev) { unsigned long flags; - if (!(dev->dev_data = kmalloc(sizeof(struct idt77105_priv),GFP_KERNEL))) + if (!(dev->phy_data = kmalloc(sizeof(struct idt77105_priv),GFP_KERNEL))) return -ENOMEM; PRIV(dev)->dev = dev; spin_lock_irqsave(&idt77105_priv_lock, flags); @@ -337,7 +337,7 @@ static int idt77105_stop(struct atm_dev *dev) else idt77105_all = walk->next; dev->phy = NULL; - dev->dev_data = NULL; + dev->phy_data = NULL; kfree(walk); break; } From f7d9d4854519fdf4d45c70a4d953438cd88e7e58 Mon Sep 17 00:00:00 2001 From: Xie He Date: Sun, 7 Mar 2021 03:33:07 -0800 Subject: [PATCH 157/164] net: lapbether: Remove netif_start_queue / netif_stop_queue For the devices in this driver, the default qdisc is "noqueue", because their "tx_queue_len" is 0. In function "__dev_queue_xmit" in "net/core/dev.c", devices with the "noqueue" qdisc are specially handled. Packets are transmitted without being queued after a "dev->flags & IFF_UP" check. However, it's possible that even if this check succeeds, "ops->ndo_stop" may still have already been called. This is because in "__dev_close_many", "ops->ndo_stop" is called before clearing the "IFF_UP" flag. If we call "netif_stop_queue" in "ops->ndo_stop", then it's possible in "__dev_queue_xmit", it sees the "IFF_UP" flag is present, and then it checks "netif_xmit_stopped" and finds that the queue is already stopped. In this case, it will complain that: "Virtual device ... asks to queue packet!" To prevent "__dev_queue_xmit" from generating this complaint, we should not call "netif_stop_queue" in "ops->ndo_stop". 
We also don't need to call "netif_start_queue" in "ops->ndo_open", because after a netdev is allocated and registered, the "__QUEUE_STATE_DRV_XOFF" flag is initially not set, so there is no need to call "netif_start_queue" to clear it. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xie He Acked-by: Martin Schiller Signed-off-by: David S. Miller --- drivers/net/wan/lapbether.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index 605fe555e157..c3372498f4f1 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -292,7 +292,6 @@ static int lapbeth_open(struct net_device *dev) return -ENODEV; } - netif_start_queue(dev); return 0; } @@ -300,8 +299,6 @@ static int lapbeth_close(struct net_device *dev) { int err; - netif_stop_queue(dev); - if ((err = lapb_unregister(dev)) != LAPB_OK) pr_err("lapb_unregister error: %d\n", err); From 286a8624d7f9c6505cd568d947772eb59646514b Mon Sep 17 00:00:00 2001 From: George McCollister Date: Mon, 8 Mar 2021 17:38:22 -0600 Subject: [PATCH 158/164] net: dsa: xrs700x: check if partner is same as port in hsr join Don't assign dp to partner if it's the same port that xrs700x_hsr_join was called with. The partner port is supposed to be the other port in the HSR/PRP redundant pair not the same port. This fixes an issue observed in testing where forwarding between redundant HSR ports on this switch didn't work depending on the order the ports were added to the hsr device. Fixes: bd62e6f5e6a9 ("net: dsa: xrs700x: add HSR offloading support") Signed-off-by: George McCollister Reviewed-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/xrs700x/xrs700x.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c index f025f968f96d..fde6e99274b6 100644 --- a/drivers/net/dsa/xrs700x/xrs700x.c +++ b/drivers/net/dsa/xrs700x/xrs700x.c @@ -528,7 +528,10 @@ static int xrs700x_hsr_join(struct dsa_switch *ds, int port, return -EOPNOTSUPP; dsa_hsr_foreach_port(dp, ds, hsr) { - partner = dp; + if (dp->index != port) { + partner = dp; + break; + } } /* We can't enable redundancy on the switch until both @@ -582,7 +585,10 @@ static int xrs700x_hsr_leave(struct dsa_switch *ds, int port, unsigned int val; dsa_hsr_foreach_port(dp, ds, hsr) { - partner = dp; + if (dp->index != port) { + partner = dp; + break; + } } if (!partner) From 924a9bc362a5223cd448ca08c3dde21235adc310 Mon Sep 17 00:00:00 2001 From: Balazs Nemeth Date: Tue, 9 Mar 2021 12:31:00 +0100 Subject: [PATCH 159/164] net: check if protocol extracted by virtio_net_hdr_set_proto is correct For gso packets, virtio_net_hdr_set_proto sets the protocol (if it isn't set) based on the type in the virtio net hdr, but the skb could contain anything since it could come from packet_snd through a raw socket. If there is a mismatch between what virtio_net_hdr_set_proto sets and the actual protocol, then the skb could be handled incorrectly later on. An example where this poses an issue is with the subsequent call to skb_flow_dissect_flow_keys_basic which relies on skb->protocol being set correctly. A specially crafted packet could fool skb_flow_dissect_flow_keys_basic preventing EINVAL to be returned. Avoid blindly trusting the information provided by the virtio net header by checking that the protocol in the packet actually matches the protocol set by virtio_net_hdr_set_proto. Note that since the protocol is only checked if skb->dev implements header_ops->parse_protocol, packets from devices without the implementation are not checked at this stage. 
Fixes: 9274124f023b ("net: stricter validation of untrusted gso packets") Signed-off-by: Balazs Nemeth Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index e8a924eeea3d..6b5fcfa1e555 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -79,8 +79,13 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (gso_type && skb->network_header) { struct flow_keys_basic keys; - if (!skb->protocol) + if (!skb->protocol) { + __be16 protocol = dev_parse_header_protocol(skb); + virtio_net_hdr_set_proto(skb, hdr); + if (protocol && protocol != skb->protocol) + return -EINVAL; + } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, From d348ede32e99d3a04863e9f9b28d224456118c27 Mon Sep 17 00:00:00 2001 From: Balazs Nemeth Date: Tue, 9 Mar 2021 12:31:01 +0100 Subject: [PATCH 160/164] net: avoid infinite loop in mpls_gso_segment when mpls_hlen == 0 A packet with skb_inner_network_header(skb) == skb_network_header(skb) and ETH_P_MPLS_UC will prevent mpls_gso_segment from pulling any headers from the packet. Subsequently, the call to skb_mac_gso_segment will again call mpls_gso_segment with the same packet leading to an infinite loop. In addition, ensure that the header length is a multiple of four, which should hold irrespective of the number of stacked labels. Signed-off-by: Balazs Nemeth Acked-by: Willem de Bruijn Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/mpls/mpls_gso.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index b1690149b6fa..1482259de9b5 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, netdev_features_t features) @@ -27,6 +28,8 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, skb_reset_network_header(skb); mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb); + if (unlikely(!mpls_hlen || mpls_hlen % MPLS_HLEN)) + goto out; if (unlikely(!pskb_may_pull(skb, mpls_hlen))) goto out; From e7a36d27f6b9f389e41d8189a8a08919c6835732 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 9 Mar 2021 17:52:18 +0100 Subject: [PATCH 161/164] s390/qeth: fix memory leak after failed TX Buffer allocation When qeth_alloc_qdio_queues() fails to allocate one of the buffers that back an Output Queue, the 'out_freeoutqbufs' path will free all previously allocated buffers for this queue. But it fails to free the half-finished queue struct itself. Move the buffer allocation into qeth_alloc_output_queue(), and deal with such errors internally. Fixes: 0da9581ddb0f ("qeth: exploit asynchronous delivery of storage blocks") Signed-off-by: Julian Wiedmann Reviewed-by: Alexandra Winter Signed-off-by: David S.
Miller --- drivers/s390/net/qeth_core_main.c | 35 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index b71b8902d1c4..f7bc0ca6909b 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -2634,15 +2634,28 @@ static void qeth_free_output_queue(struct qeth_qdio_out_q *q) static struct qeth_qdio_out_q *qeth_alloc_output_queue(void) { struct qeth_qdio_out_q *q = kzalloc(sizeof(*q), GFP_KERNEL); + unsigned int i; if (!q) return NULL; - if (qdio_alloc_buffers(q->qdio_bufs, QDIO_MAX_BUFFERS_PER_Q)) { - kfree(q); - return NULL; + if (qdio_alloc_buffers(q->qdio_bufs, QDIO_MAX_BUFFERS_PER_Q)) + goto err_qdio_bufs; + + for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++) { + if (qeth_init_qdio_out_buf(q, i)) + goto err_out_bufs; } + return q; + +err_out_bufs: + while (i > 0) + kmem_cache_free(qeth_qdio_outbuf_cache, q->bufs[--i]); + qdio_free_buffers(q->qdio_bufs, QDIO_MAX_BUFFERS_PER_Q); +err_qdio_bufs: + kfree(q); + return NULL; } static void qeth_tx_completion_timer(struct timer_list *timer) @@ -2655,7 +2668,7 @@ static void qeth_tx_completion_timer(struct timer_list *timer) static int qeth_alloc_qdio_queues(struct qeth_card *card) { - int i, j; + unsigned int i; QETH_CARD_TEXT(card, 2, "allcqdbf"); @@ -2689,13 +2702,6 @@ static int qeth_alloc_qdio_queues(struct qeth_card *card) queue->coalesce_usecs = QETH_TX_COALESCE_USECS; queue->max_coalesced_frames = QETH_TX_MAX_COALESCED_FRAMES; queue->priority = QETH_QIB_PQUE_PRIO_DEFAULT; - - /* give outbound qeth_qdio_buffers their qdio_buffers */ - for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j) { - WARN_ON(queue->bufs[j]); - if (qeth_init_qdio_out_buf(queue, j)) - goto out_freeoutqbufs; - } } /* completion */ @@ -2704,13 +2710,6 @@ static int qeth_alloc_qdio_queues(struct qeth_card *card) return 0; -out_freeoutqbufs: - while (j > 0) { - --j; - kmem_cache_free(qeth_qdio_outbuf_cache, - 
card->qdio.out_qs[i]->bufs[j]); - card->qdio.out_qs[i]->bufs[j] = NULL; - } out_freeoutq: while (i > 0) { qeth_free_output_queue(card->qdio.out_qs[--i]); From c20383ad1656b0f6354dd50e4acd894f9d94090d Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 9 Mar 2021 17:52:19 +0100 Subject: [PATCH 162/164] s390/qeth: improve completion of pending TX buffers The current design attaches a pending TX buffer to a custom singly-linked list, which is anchored at the buffer's slot on the TX ring. The buffer is then checked for final completion whenever this slot is processed during a subsequent TX NAPI poll cycle. But if there's insufficient traffic on the ring, we might never make enough progress to get back to this ring slot and discover the pending buffer's final TX completion - in particular if this missing TX completion blocks the application from sending further traffic. So convert the custom singly-linked list code to a per-queue list_head, and scan this list on every TX NAPI cycle. Fixes: 0da9581ddb0f ("qeth: exploit asynchronous delivery of storage blocks") Signed-off-by: Julian Wiedmann Signed-off-by: David S.
Miller --- drivers/s390/net/qeth_core.h | 3 +- drivers/s390/net/qeth_core_main.c | 69 +++++++++++++------------------ 2 files changed, 30 insertions(+), 42 deletions(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index a1da83b0b0ef..91acff493612 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -436,7 +436,7 @@ struct qeth_qdio_out_buffer { int is_header[QDIO_MAX_ELEMENTS_PER_BUFFER]; struct qeth_qdio_out_q *q; - struct qeth_qdio_out_buffer *next_pending; + struct list_head list_entry; }; struct qeth_card; @@ -500,6 +500,7 @@ struct qeth_qdio_out_q { struct qdio_buffer *qdio_bufs[QDIO_MAX_BUFFERS_PER_Q]; struct qeth_qdio_out_buffer *bufs[QDIO_MAX_BUFFERS_PER_Q]; struct qdio_outbuf_state *bufstates; /* convenience pointer */ + struct list_head pending_bufs; struct qeth_out_q_stats stats; spinlock_t lock; unsigned int priority; diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index f7bc0ca6909b..3763cd6d14f8 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -73,8 +73,6 @@ static void qeth_free_qdio_queues(struct qeth_card *card); static void qeth_notify_skbs(struct qeth_qdio_out_q *queue, struct qeth_qdio_out_buffer *buf, enum iucv_tx_notify notification); -static void qeth_tx_complete_buf(struct qeth_qdio_out_buffer *buf, bool error, - int budget); static void qeth_close_dev_handler(struct work_struct *work) { @@ -465,41 +463,6 @@ static enum iucv_tx_notify qeth_compute_cq_notification(int sbalf15, return n; } -static void qeth_cleanup_handled_pending(struct qeth_qdio_out_q *q, int bidx, - int forced_cleanup) -{ - if (q->card->options.cq != QETH_CQ_ENABLED) - return; - - if (q->bufs[bidx]->next_pending != NULL) { - struct qeth_qdio_out_buffer *head = q->bufs[bidx]; - struct qeth_qdio_out_buffer *c = q->bufs[bidx]->next_pending; - - while (c) { - if (forced_cleanup || - atomic_read(&c->state) == QETH_QDIO_BUF_EMPTY) { - struct 
qeth_qdio_out_buffer *f = c; - - QETH_CARD_TEXT(f->q->card, 5, "fp"); - QETH_CARD_TEXT_(f->q->card, 5, "%lx", (long) f); - /* release here to avoid interleaving between - outbound tasklet and inbound tasklet - regarding notifications and lifecycle */ - qeth_tx_complete_buf(c, forced_cleanup, 0); - - c = f->next_pending; - WARN_ON_ONCE(head->next_pending != f); - head->next_pending = c; - kmem_cache_free(qeth_qdio_outbuf_cache, f); - } else { - head = c; - c = c->next_pending; - } - - } - } -} - static void qeth_qdio_handle_aob(struct qeth_card *card, unsigned long phys_aob_addr) { @@ -537,7 +500,7 @@ static void qeth_qdio_handle_aob(struct qeth_card *card, qeth_notify_skbs(buffer->q, buffer, notification); /* Free dangling allocations. The attached skbs are handled by - * qeth_cleanup_handled_pending(). + * qeth_tx_complete_pending_bufs(). */ for (i = 0; i < aob->sb_count && i < QETH_MAX_BUFFER_ELEMENTS(card); @@ -1488,14 +1451,35 @@ static void qeth_clear_output_buffer(struct qeth_qdio_out_q *queue, atomic_set(&buf->state, QETH_QDIO_BUF_EMPTY); } +static void qeth_tx_complete_pending_bufs(struct qeth_card *card, + struct qeth_qdio_out_q *queue, + bool drain) +{ + struct qeth_qdio_out_buffer *buf, *tmp; + + list_for_each_entry_safe(buf, tmp, &queue->pending_bufs, list_entry) { + if (drain || atomic_read(&buf->state) == QETH_QDIO_BUF_EMPTY) { + QETH_CARD_TEXT(card, 5, "fp"); + QETH_CARD_TEXT_(card, 5, "%lx", (long) buf); + + qeth_tx_complete_buf(buf, drain, 0); + + list_del(&buf->list_entry); + kmem_cache_free(qeth_qdio_outbuf_cache, buf); + } + } +} + static void qeth_drain_output_queue(struct qeth_qdio_out_q *q, bool free) { int j; + qeth_tx_complete_pending_bufs(q->card, q, true); + for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j) { if (!q->bufs[j]) continue; - qeth_cleanup_handled_pending(q, j, 1); + qeth_clear_output_buffer(q, q->bufs[j], true, 0); if (free) { kmem_cache_free(qeth_qdio_outbuf_cache, q->bufs[j]); @@ -2615,7 +2599,6 @@ static int 
qeth_init_qdio_out_buf(struct qeth_qdio_out_q *q, int bidx) skb_queue_head_init(&newbuf->skb_list); lockdep_set_class(&newbuf->skb_list.lock, &qdio_out_skb_queue_key); newbuf->q = q; - newbuf->next_pending = q->bufs[bidx]; atomic_set(&newbuf->state, QETH_QDIO_BUF_EMPTY); q->bufs[bidx] = newbuf; return 0; @@ -2697,6 +2680,7 @@ static int qeth_alloc_qdio_queues(struct qeth_card *card) card->qdio.out_qs[i] = queue; queue->card = card; queue->queue_no = i; + INIT_LIST_HEAD(&queue->pending_bufs); spin_lock_init(&queue->lock); timer_setup(&queue->timer, qeth_tx_completion_timer, 0); queue->coalesce_usecs = QETH_TX_COALESCE_USECS; @@ -6106,6 +6090,8 @@ static void qeth_iqd_tx_complete(struct qeth_qdio_out_q *queue, qeth_schedule_recovery(card); } + list_add(&buffer->list_entry, + &queue->pending_bufs); /* Skip clearing the buffer: */ return; case QETH_QDIO_BUF_QAOB_OK: @@ -6161,6 +6147,8 @@ static int qeth_tx_poll(struct napi_struct *napi, int budget) unsigned int bytes = 0; int completed; + qeth_tx_complete_pending_bufs(card, queue, false); + if (qeth_out_queue_is_empty(queue)) { napi_complete(napi); return 0; @@ -6193,7 +6181,6 @@ static int qeth_tx_poll(struct napi_struct *napi, int budget) qeth_handle_send_error(card, buffer, error); qeth_iqd_tx_complete(queue, bidx, error, budget); - qeth_cleanup_handled_pending(queue, bidx, false); } netdev_tx_completed_queue(txq, packets, bytes); From 3e83d467a08e25b27c44c885f511624a71c84f7c Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 9 Mar 2021 17:52:20 +0100 Subject: [PATCH 163/164] s390/qeth: schedule TX NAPI on QAOB completion When a QAOB notifies us that a pending TX buffer has been delivered, the actual TX completion processing by qeth_tx_complete_pending_bufs() is done within the context of a TX NAPI instance. We shouldn't rely on this instance being scheduled by some other TX event, but just do it ourselves. qeth_qdio_handle_aob() is called from qeth_poll(), ie. our main NAPI instance. 
To avoid touching the TX queue's NAPI instance before/after it is (un-)registered, reorder the code in qeth_open() and qeth_stop() accordingly. Fixes: 0da9581ddb0f ("qeth: exploit asynchronous delivery of storage blocks") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 3763cd6d14f8..d0a56afec028 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -470,6 +470,7 @@ static void qeth_qdio_handle_aob(struct qeth_card *card, struct qaob *aob; struct qeth_qdio_out_buffer *buffer; enum iucv_tx_notify notification; + struct qeth_qdio_out_q *queue; unsigned int i; aob = (struct qaob *) phys_to_virt(phys_aob_addr); @@ -512,7 +513,9 @@ static void qeth_qdio_handle_aob(struct qeth_card *card, buffer->is_header[i] = 0; } + queue = buffer->q; atomic_set(&buffer->state, QETH_QDIO_BUF_EMPTY); + napi_schedule(&queue->napi); break; default: WARN_ON_ONCE(1); @@ -7235,9 +7238,7 @@ int qeth_open(struct net_device *dev) card->data.state = CH_STATE_UP; netif_tx_start_all_queues(dev); - napi_enable(&card->napi); local_bh_disable(); - napi_schedule(&card->napi); if (IS_IQD(card)) { struct qeth_qdio_out_q *queue; unsigned int i; @@ -7249,8 +7250,12 @@ int qeth_open(struct net_device *dev) napi_schedule(&queue->napi); } } + + napi_enable(&card->napi); + napi_schedule(&card->napi); /* kick-start the NAPI softirq: */ local_bh_enable(); + return 0; } EXPORT_SYMBOL_GPL(qeth_open); @@ -7260,6 +7265,11 @@ int qeth_stop(struct net_device *dev) struct qeth_card *card = dev->ml_priv; QETH_CARD_TEXT(card, 4, "qethstop"); + + napi_disable(&card->napi); + cancel_delayed_work_sync(&card->buffer_reclaim_work); + qdio_stop_irq(CARD_DDEV(card)); + if (IS_IQD(card)) { struct qeth_qdio_out_q *queue; unsigned int i; @@ -7280,10 +7290,6 @@ int qeth_stop(struct 
net_device *dev) netif_tx_disable(dev); } - napi_disable(&card->napi); - cancel_delayed_work_sync(&card->buffer_reclaim_work); - qdio_stop_irq(CARD_DDEV(card)); - return 0; } EXPORT_SYMBOL_GPL(qeth_stop); From 7eefda7f353ef86ad82a2dc8329e8a3538c08ab6 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 9 Mar 2021 17:52:21 +0100 Subject: [PATCH 164/164] s390/qeth: fix notification for pending buffers during teardown The cited commit reworked the state machine for pending TX buffers. In qeth_iqd_tx_complete() it turned PENDING into a transient state, and now uses NEED_QAOB for buffers that get parked while waiting for their QAOB completion. But it failed to adjust the check in qeth_tx_complete_buf(). So if qeth_tx_complete_pending_bufs() is called during teardown to drain the parked TX buffers, we no longer raise a notification for af_iucv. Instead of updating the checked state, just move this code into qeth_tx_complete_pending_bufs() itself. This also gets rid of the special case in the common TX completion path. Fixes: 8908f36d20d8 ("s390/qeth: fix af_iucv notification race") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index d0a56afec028..a814698387bc 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1390,9 +1390,6 @@ static void qeth_tx_complete_buf(struct qeth_qdio_out_buffer *buf, bool error, struct qeth_qdio_out_q *queue = buf->q; struct sk_buff *skb; - if (atomic_read(&buf->state) == QETH_QDIO_BUF_PENDING) - qeth_notify_skbs(queue, buf, TX_NOTIFY_GENERALERROR); - /* Empty buffer?
*/ if (buf->next_element_to_fill == 0) return; @@ -1465,6 +1462,9 @@ static void qeth_tx_complete_pending_bufs(struct qeth_card *card, QETH_CARD_TEXT(card, 5, "fp"); QETH_CARD_TEXT_(card, 5, "%lx", (long) buf); + if (drain) + qeth_notify_skbs(queue, buf, + TX_NOTIFY_GENERALERROR); qeth_tx_complete_buf(buf, drain, 0); list_del(&buf->list_entry);