From edbea922025169c0e5cdca5ebf7bf5374cc5566c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 3 Mar 2021 16:29:03 +0100 Subject: [PATCH 001/171] veth: Store queue_mapping independently of XDP prog presence Currently, veth_xmit() would call the skb_record_rx_queue() only when there is XDP program loaded on peer interface in native mode. If peer has XDP prog in generic mode, then netif_receive_generic_xdp() has a call to netif_get_rxqueue(skb), so for multi-queue veth it will not be possible to grab a correct rxq. To fix that, store queue_mapping independently of XDP prog presence on peer interface. Fixes: 638264dc9022 ("veth: Support per queue XDP ring") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Toshiaki Makita Link: https://lore.kernel.org/bpf/20210303152903.11172-1-maciej.fijalkowski@intel.com --- drivers/net/veth.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index aa1a66ad2ce5..34e49c75db42 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -302,8 +302,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; rcv_xdp = rcu_access_pointer(rq->xdp_prog); - if (rcv_xdp) - skb_record_rx_queue(skb, rxq); + skb_record_rx_queue(skb, rxq); } skb_tx_timestamp(skb); From 350a5c4dd2452ea999cc5e1d4a8dbf12de2f97ef Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 7 Mar 2021 14:52:48 -0800 Subject: [PATCH 002/171] bpf: Dont allow vmlinux BTF to be used in map_create and prog_load. The syzbot got FD of vmlinux BTF and passed it into map_create which caused crash in btf_type_id_size() when it tried to access resolved_ids. The vmlinux BTF doesn't have 'resolved_ids' and 'resolved_sizes' initialized to save memory. To avoid such issues disallow using vmlinux BTF in prog_load and map_create commands. Fixes: 5329722057d4 ("bpf: Assign ID to vmlinux BTF and return extra info for BTF in GET_OBJ_INFO") Reported-by: syzbot+8bab8ed346746e7540e8@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210307225248.79031-1-alexei.starovoitov@gmail.com --- kernel/bpf/syscall.c | 5 +++++ kernel/bpf/verifier.c | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c859bc46d06c..250503482cda 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -854,6 +854,11 @@ static int map_create(union bpf_attr *attr) err = PTR_ERR(btf); goto free_map; } + if (btf_is_kernel(btf)) { + btf_put(btf); + err = -EACCES; + goto free_map; + } map->btf = btf; if (attr->btf_value_type_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c56e3fcb5f1a..4192a9e56654 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9056,6 +9056,10 @@ static int check_btf_info(struct bpf_verifier_env *env, btf = btf_get_by_fd(attr->prog_btf_fd); if (IS_ERR(btf)) return PTR_ERR(btf); + if (btf_is_kernel(btf)) { + btf_put(btf); + return -EACCES; + } env->prog->aux->btf = btf; err = check_btf_func(env, attr, uattr); From 769c18b254ca191b45047e1fcb3b2ce56fada0b6 Mon Sep 17 00:00:00 2001 From: Tal Lossos Date: Sun, 7 Mar 2021 14:09:48 +0200 Subject: [PATCH 003/171] bpf: Change inode_storage's lookup_elem return value from NULL to -EBADF bpf_fd_inode_storage_lookup_elem() returned NULL when getting a bad FD, which caused -ENOENT in bpf_map_copy_value. -EBADF error is better than -ENOENT for a bad FD behaviour. The patch was partially contributed by CyberArk Software, Inc. Fixes: 8ea636848aca ("bpf: Implement bpf_local_storage for inodes") Signed-off-by: Tal Lossos Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210307120948.61414-1-tallossos@gmail.com --- kernel/bpf/bpf_inode_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6639640523c0..b58b2efb9b43 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -109,7 +109,7 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) fd = *(int *)key; f = fget_raw(fd); if (!f) - return NULL; + return ERR_PTR(-EBADF); sdata = inode_storage_lookup(f->f_inode, map, true); fput(f); From e7fb6465d4c8e767e39cbee72464e0060ab3d20c Mon Sep 17 00:00:00 2001 From: Georgi Valkov Date: Mon, 8 Mar 2021 10:30:38 -0800 Subject: [PATCH 004/171] libbpf: Fix INSTALL flag order It was reported ([0]) that having optional -m flag between source and destination arguments in install command breaks bpftools cross-build on MacOS. Move -m to the front to fix this issue. [0] https://github.com/openwrt/openwrt/pull/3959 Fixes: 7110d80d53f4 ("libbpf: Makefile set specified permission mode") Signed-off-by: Georgi Valkov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210308183038.613432-1-andrii@kernel.org --- tools/lib/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 887a494ad5fc..e9eb6a6e80d2 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -215,7 +215,7 @@ define do_install if [ ! -d '$(DESTDIR_SQ)$2' ]; then \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ fi; \ - $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2' + $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2' endef install_lib: all_cmd From e5e35e754c28724d5c619f2ec805fd221f8d59ce Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 8 Mar 2021 15:59:18 +0100 Subject: [PATCH 005/171] bpf: BPF-helper for MTU checking add length input The FIB lookup example[1] show how the IP-header field tot_len (iph->tot_len) is used as input to perform the MTU check. This patch extend the BPF-helper bpf_check_mtu() with the same ability to provide the length as user parameter input, via mtu_len parameter. This still needs to be done before the bpf_check_mtu() helper API becomes frozen. [1] samples/bpf/xdp_fwd_kern.c Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/161521555850.3515614.6533850861569774444.stgit@firesoul --- include/uapi/linux/bpf.h | 16 +++++++++++----- net/core/filter.c | 12 ++++++++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c893310492..4ba4ef0ff63a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3850,7 +3850,7 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based + * Check packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. * @@ -3867,6 +3867,14 @@ union bpf_attr { * against the current net device. This is practical if this isn't * used prior to redirect. * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * * The Linux kernel route table can configure MTUs on a more * specific per route level, which is not provided by this helper. * For route level MTU checks use the **bpf_fib_lookup**\ () @@ -3891,11 +3899,9 @@ union bpf_attr { * * On return *mtu_len* pointer contains the MTU value of the net * device. Remember the net device configured MTU is the L3 size, - * which is returned here and XDP and TX length operate at L2. + * which is returned here and XDP and TC length operate at L2. * Helper take this into account for you, but remember when using - * MTU value in your BPF-code. On input *mtu_len* must be a valid - * pointer and be initialized (to zero), else verifier will reject - * BPF program. + * MTU value in your BPF-code. * * Return * * 0 on success, and populate MTU value in *mtu_len* pointer. diff --git a/net/core/filter.c b/net/core/filter.c index adfdad234674..9323d34d34cc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5658,7 +5658,7 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) return -EINVAL; - if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff)) + if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); @@ -5668,7 +5668,11 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; - skb_len = skb->len + len_diff; /* minus result pass check */ + + /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ + skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len; + + skb_len += len_diff; /* minus result pass check */ if (skb_len <= dev_len) { ret = BPF_MTU_CHK_RET_SUCCESS; goto out; @@ -5713,6 +5717,10 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, /* Add L2-header as dev MTU is L3 size */ dev_len = mtu + dev->hard_header_len; + /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ + if (*mtu_len) + xdp_len = *mtu_len + dev->hard_header_len; + xdp_len += len_diff; /* minus result pass check */ if (xdp_len > dev_len) ret = BPF_MTU_CHK_RET_FRAG_NEEDED; From e5e010a3063ad801cb3f85793cbada9c2a654e40 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 8 Mar 2021 15:59:23 +0100 Subject: [PATCH 006/171] selftests/bpf: Tests using bpf_check_mtu BPF-helper input mtu_len param Add tests that use mtu_len as input parameter in BPF-helper bpf_check_mtu(). The BPF-helper is avail from both XDP and TC context. Add two tests per context, one that tests below MTU and one that exceeds the MTU. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/161521556358.3515614.5915221479709358964.stgit@firesoul --- .../selftests/bpf/prog_tests/check_mtu.c | 4 + .../selftests/bpf/progs/test_check_mtu.c | 92 +++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c index 36af1c138faf..b62a39315336 100644 --- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c +++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c @@ -128,6 +128,8 @@ static void test_check_mtu_xdp(__u32 mtu, __u32 ifindex) test_check_mtu_run_xdp(skel, skel->progs.xdp_use_helper, mtu); test_check_mtu_run_xdp(skel, skel->progs.xdp_exceed_mtu, mtu); test_check_mtu_run_xdp(skel, skel->progs.xdp_minus_delta, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len_exceed, mtu); cleanup: test_check_mtu__destroy(skel); @@ -187,6 +189,8 @@ static void test_check_mtu_tc(__u32 mtu, __u32 ifindex) test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu_da, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu); cleanup: test_check_mtu__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c b/tools/testing/selftests/bpf/progs/test_check_mtu.c index b7787b43f9db..c4a9bae96e75 100644 --- a/tools/testing/selftests/bpf/progs/test_check_mtu.c +++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c @@ -105,6 +105,54 @@ int xdp_minus_delta(struct xdp_md *ctx) return retval; } +SEC("xdp") +int xdp_input_len(struct xdp_md *ctx) +{ + int retval = XDP_PASS; /* Expected retval on successful test */ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + __u32 ifindex = GLOBAL_USER_IFINDEX; + __u32 data_len = data_end - data; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input len is L3, like MTU and iph->tot_len. + * Remember XDP data_len is L2. + */ + __u32 mtu_len = data_len - ETH_HLEN; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0)) + retval = XDP_ABORTED; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("xdp") +int xdp_input_len_exceed(struct xdp_md *ctx) +{ + int retval = XDP_ABORTED; /* Fail */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int err; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size like MTU. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + mtu_len += 1; /* Exceed with 1 */ + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0); + if (err == BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = XDP_PASS ; /* Success in exceeding MTU check */ + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + SEC("classifier") int tc_use_helper(struct __sk_buff *ctx) { @@ -196,3 +244,47 @@ int tc_minus_delta(struct __sk_buff *ctx) global_bpf_mtu_xdp = mtu_len; return retval; } + +SEC("classifier") +int tc_input_len(struct __sk_buff *ctx) +{ + int retval = BPF_OK; /* Expected retval on successful test */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0)) + retval = BPF_DROP; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("classifier") +int tc_input_len_exceed(struct __sk_buff *ctx) +{ + int retval = BPF_DROP; /* Fail */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int err; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size like MTU. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + mtu_len += 1; /* Exceed with 1 */ + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0); + if (err == BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = BPF_OK; /* Success in exceeding MTU check */ + + global_bpf_mtu_xdp = mtu_len; + return retval; +} From 05a68ce5fa51a83c360381630f823545c5757aa2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Mar 2021 10:50:28 -0800 Subject: [PATCH 007/171] bpf: Don't do bpf_cgroup_storage_set() for kuprobe/tp programs For kuprobe and tracepoint bpf programs, kernel calls trace_call_bpf() which calls BPF_PROG_RUN_ARRAY_CHECK() to run the program array. Currently, BPF_PROG_RUN_ARRAY_CHECK() also calls bpf_cgroup_storage_set() to set percpu cgroup local storage with NULL value. This is due to Commit 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") which modified __BPF_PROG_RUN_ARRAY() to call bpf_cgroup_storage_set() and this macro is also used by BPF_PROG_RUN_ARRAY_CHECK(). kuprobe and tracepoint programs are not allowed to call bpf_get_local_storage() helper hence does not access percpu cgroup local storage. Let us change BPF_PROG_RUN_ARRAY_CHECK() not to modify percpu cgroup local storage. The issue is observed when I tried to debug [1] where percpu data is overwritten due to preempt_disable -> migration_disable change. This patch does not completely fix the above issue, which will be addressed separately, e.g., multiple cgroup prog runs may preempt each other. But it does fix any potential issue caused by tracing program overwriting percpu cgroup storage: - in a busy system, a tracing program is to run between bpf_cgroup_storage_set() and the cgroup prog run. - a kprobe program is triggered by a helper in cgroup prog before bpf_get_local_storage() is called. [1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@mail.gmail.com/T Fixes: 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20210309185028.3763817-1-yhs@fb.com --- include/linux/bpf.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cccaef1088ea..d7e0f479a5b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1093,7 +1093,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, _ret; \ }) -#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ +#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ @@ -1106,7 +1106,8 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, goto _out; \ _item = &_array->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ + if (set_cg_storage) \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ _ret &= func(_prog, ctx); \ _item++; \ } \ @@ -1153,10 +1154,10 @@ _out: \ }) #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, false) + __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true) #define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, true) + __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false) #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); From de920fc64cbaa031f947e9be964bda05fd090380 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 8 Mar 2021 17:56:47 -0800 Subject: [PATCH 008/171] bpf, x86: Use kvmalloc_array instead kmalloc_array in bpf_jit_comp x86 bpf_jit_comp.c used kmalloc_array to store jited addresses for each bpf insn. With a large bpf program, we have see the following allocation failures in our production server: page allocation failure: order:5, mode:0x40cc0(GFP_KERNEL|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0" Call Trace: dump_stack+0x50/0x70 warn_alloc.cold.120+0x72/0xd2 ? __alloc_pages_direct_compact+0x157/0x160 __alloc_pages_slowpath+0xcdb/0xd00 ? get_page_from_freelist+0xe44/0x1600 ? vunmap_page_range+0x1ba/0x340 __alloc_pages_nodemask+0x2c9/0x320 kmalloc_order+0x18/0x80 kmalloc_order_trace+0x1d/0xa0 bpf_int_jit_compile+0x1e2/0x484 ? kmalloc_order_trace+0x1d/0xa0 bpf_prog_select_runtime+0xc3/0x150 bpf_prog_load+0x480/0x720 ? __mod_memcg_lruvec_state+0x21/0x100 __do_sys_bpf+0xc31/0x2040 ? close_pdeo+0x86/0xe0 do_syscall_64+0x42/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f2f300f7fa9 Code: Bad RIP value. Dumped assembly: ffffffff810b6d70 : ; { ffffffff810b6d70: e8 eb a5 b4 00 callq 0xffffffff81c01360 <__fentry__> ffffffff810b6d75: 41 57 pushq %r15 ... ffffffff810b6f39: e9 72 fe ff ff jmp 0xffffffff810b6db0 ; addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); ffffffff810b6f3e: 8b 45 0c movl 12(%rbp), %eax ; return __kmalloc(bytes, flags); ffffffff810b6f41: be c0 0c 00 00 movl $3264, %esi ; addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); ffffffff810b6f46: 8d 78 01 leal 1(%rax), %edi ; if (unlikely(check_mul_overflow(n, size, &bytes))) ffffffff810b6f49: 48 c1 e7 02 shlq $2, %rdi ; return __kmalloc(bytes, flags); ffffffff810b6f4d: e8 8e 0c 1d 00 callq 0xffffffff81287be0 <__kmalloc> ; if (!addrs) { ffffffff810b6f52: 48 85 c0 testq %rax, %rax Change kmalloc_array() to kvmalloc_array() to avoid potential allocation error for big bpf programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210309015647.3657852-1-yhs@fb.com --- arch/x86/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6926d0ca6c71..747bba0a584a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2225,7 +2225,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2317,7 +2317,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } From 5115daa675ccf70497fe56e8916cf738d8212c10 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 12 Jan 2021 13:29:14 +0200 Subject: [PATCH 009/171] net/mlx5e: Enforce minimum value check for ICOSQ size The ICOSQ size should not go below MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE. Enforce this where it's missing. Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index ec2fcb2a2977..5c8ffa8da6f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2368,8 +2368,9 @@ static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5e_params *params, { switch (params->rq_wq_type) { case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: - return order_base_2(MLX5E_UMR_WQEBBS) + - mlx5e_get_rq_log_wq_sz(rqp->rqc); + return max_t(u8, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE, + order_base_2(MLX5E_UMR_WQEBBS) + + mlx5e_get_rq_log_wq_sz(rqp->rqc)); default: /* MLX5_WQ_TYPE_CYCLIC */ return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE; } From d5dd03b26ba49c4ffe67ee1937add82293c19794 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 12 Jan 2021 13:21:17 +0200 Subject: [PATCH 010/171] net/mlx5e: RX, Mind the MPWQE gaps when calculating offsets Since cited patch, MLX5E_REQUIRED_WQE_MTTS is not a power of two. Hence, usage of MLX5E_LOG_ALIGNED_MPWQE_PPW should be replaced, as it lost some accuracy. Use the designated macro to calculate the number of required MTTs. This makes sure the solution in cited patch works properly. While here, un-inline mlx5e_get_mpwqe_offset(), and remove the unused RQ parameter. Fixes: c3c9402373fe ("net/mlx5e: Add resiliency in Striding RQ mode for packets larger than MTU") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 7 ++++--- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 7435fe6829b6..304b296fe8b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -92,14 +92,15 @@ struct page_pool; MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0) #define MLX5_MPWRQ_PAGES_PER_WQE BIT(MLX5_MPWRQ_WQE_PAGE_ORDER) -#define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2) +#define MLX5_ALIGN_MTTS(mtts) (ALIGN(mtts, 8)) +#define MLX5_ALIGNED_MTTS_OCTW(mtts) ((mtts) / 2) +#define MLX5_MTT_OCTW(mtts) (MLX5_ALIGNED_MTTS_OCTW(MLX5_ALIGN_MTTS(mtts))) /* Add another page to MLX5E_REQUIRED_WQE_MTTS as a buffer between * WQEs, This page will absorb write overflow by the hardware, when * receiving packets larger than MTU. These oversize packets are * dropped by the driver at a later stage. */ -#define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE + 1, 8)) -#define MLX5E_LOG_ALIGNED_MPWQE_PPW (ilog2(MLX5E_REQUIRED_WQE_MTTS)) +#define MLX5E_REQUIRED_WQE_MTTS (MLX5_ALIGN_MTTS(MLX5_MPWRQ_PAGES_PER_WQE + 1)) #define MLX5E_REQUIRED_MTTS(wqes) (wqes * MLX5E_REQUIRED_WQE_MTTS) #define MLX5E_MAX_RQ_NUM_MTTS \ ((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5c8ffa8da6f0..831f5495ff39 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -334,9 +334,9 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq rq->wqe_overflow.addr); } -static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix) +static u64 mlx5e_get_mpwqe_offset(u16 wqe_ix) { - return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT; + return MLX5E_REQUIRED_MTTS(wqe_ix) << PAGE_SHIFT; } static void mlx5e_init_frags_partition(struct mlx5e_rq *rq) @@ -577,7 +577,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i); u32 byte_count = rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz; - u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i); + u64 dma_offset = mlx5e_get_mpwqe_offset(i); wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom); wqe->data[0].byte_count = cpu_to_be32(byte_count); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 1b6ad94ebb10..249d8905e644 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -500,7 +500,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) struct mlx5e_icosq *sq = rq->icosq; struct mlx5_wq_cyc *wq = &sq->wq; struct mlx5e_umr_wqe *umr_wqe; - u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1); u16 pi; int err; int i; @@ -531,7 +530,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) umr_wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR); - umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset); + umr_wqe->uctrl.xlt_offset = + cpu_to_be16(MLX5_ALIGNED_MTTS_OCTW(MLX5E_REQUIRED_MTTS(ix))); sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { .wqe_type = MLX5E_ICOSQ_WQE_UMR_RX, From 354521eebd02db45168b9c8c3795078f90c327b7 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 28 Feb 2021 18:40:18 +0200 Subject: [PATCH 011/171] net/mlx5e: Accumulate port PTP TX stats with other channels stats In addition to .get_ethtool_stats, add port PTP TX stats to .ndo_get_stats64. Fixes: 145e5637d941 ("net/mlx5e: Add TX PTP port object support") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 831f5495ff39..9e2a30dc5e4f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3811,6 +3811,15 @@ void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s) for (j = 0; j < priv->max_opened_tc; j++) { struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j]; + s->tx_packets += sq_stats->packets; + s->tx_bytes += sq_stats->bytes; + s->tx_dropped += sq_stats->dropped; + } + } + if (priv->port_ptp_opened) { + for (i = 0; i < priv->max_opened_tc; i++) { + struct mlx5e_sq_stats *sq_stats = &priv->port_ptp_stats.sq[i]; + s->tx_packets += sq_stats->packets; s->tx_bytes += sq_stats->bytes; s->tx_dropped += sq_stats->dropped; From 1c2cdf0b603a3b0c763288ad92e9f3f1555925cf Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 12 Jan 2021 13:34:29 +0200 Subject: [PATCH 012/171] net/mlx5e: Set PTP channel pointer explicitly to NULL When closing the PTP channel, set its pointer explicitly to NULL. PTP channel is opened on demand, the code verify the pointer validity before access. Nullify it when closing the PTP channel to avoid unexpected behavior. Fixes: 145e5637d941 ("net/mlx5e: Add TX PTP port object support") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 9e2a30dc5e4f..66d23cd275c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2503,8 +2503,10 @@ void mlx5e_close_channels(struct mlx5e_channels *chs) { int i; - if (chs->port_ptp) + if (chs->port_ptp) { mlx5e_port_ptp_close(chs->port_ptp); + chs->port_ptp = NULL; + } for (i = 0; i < chs->num; i++) mlx5e_close_channel(chs->c[i]); From e5eb01344e9b09bb9d255b9727449186f7168df8 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 11 Feb 2021 15:51:11 +0200 Subject: [PATCH 013/171] net/mlx5e: When changing XDP program without reset, take refs for XSK RQs Each RQ (including XSK RQs) takes a reference to the XDP program. When an XDP program is attached or detached, the channels and queues are recreated, however, there is a special flow for changing an active XDP program to another one. In that flow, channels and queues stay alive, but the refcounts of the old and new XDP programs are adjusted. This flow didn't increment refcount by the number of active XSK RQs, and this commit fixes it. Fixes: db05815b36cb ("net/mlx5e: Add XSK zero-copy support") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 66d23cd275c1..6b761fb8d269 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4695,8 +4695,10 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) struct mlx5e_channel *c = priv->channels.c[i]; mlx5e_rq_replace_xdp_prog(&c->rq, prog); - if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) { + bpf_prog_inc(prog); mlx5e_rq_replace_xdp_prog(&c->xskrq, prog); + } } unlock: From 74640f09735f935437bd8df9fe61a66f03eabb34 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 29 Jan 2021 14:04:34 +0200 Subject: [PATCH 014/171] net/mlx5e: Revert parameters on errors when changing PTP state without reset Port timestamping for PTP can be enabled/disabled while the channels are closed. In that case mlx5e_safe_switch_channels is skipped, and the preactivate hook is called directly. However, if that hook returns an error, the channel parameters must be reverted back to their old values. This commit adds missing handling on this case. Fixes: 145e5637d941 ("net/mlx5e: Add TX PTP port object support") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index abdf721bb264..0e059d5c57ac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -2014,8 +2014,13 @@ static int set_pflag_tx_port_ts(struct net_device *netdev, bool enable) */ if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + struct mlx5e_params old_params; + + old_params = priv->channels.params; priv->channels.params = new_channels.params; err = mlx5e_num_channels_changed(priv); + if (err) + priv->channels.params = old_params; goto out; } From 385d40b042e60aa0b677d7b400a0fefb44bcbaf4 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Tue, 16 Feb 2021 13:39:18 +0200 Subject: [PATCH 015/171] net/mlx5e: Don't match on Geneve options in case option masks are all zero The cited change added offload support for Geneve options without verifying the validity of the options masks, this caused offload of rules with match on Geneve options with class,type and data masks which are zero to fail. Fix by ignoring the match on Geneve options in case option masks are all zero. Fixes: 9272e3df3023 ("net/mlx5e: Geneve, Add support for encap/decap flows offload") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Reviewed-by: Oz Shlomo Reviewed-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c index e472ed0eacfb..7ed3f9f79f11 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c @@ -227,6 +227,10 @@ static int mlx5e_tc_tun_parse_geneve_options(struct mlx5e_priv *priv, option_key = (struct geneve_opt *)&enc_opts.key->data[0]; option_mask = (struct geneve_opt *)&enc_opts.mask->data[0]; + if (option_mask->opt_class == 0 && option_mask->type == 0 && + !memchr_inv(option_mask->opt_data, 0, option_mask->length * 4)) + return 0; + if (option_key->length > max_tlv_option_data_len) { NL_SET_ERR_MSG_MOD(extack, "Matching on GENEVE options: unsupported option len"); From 55affa97d6758b6aeab0bc68f4884c4b5a6828af Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 1 Mar 2021 17:54:21 +0200 Subject: [PATCH 016/171] net/mlx5: Fix turn-off PPS command Fix a bug of uninitialized pin index when trying to turn off PPS out. Fixes: de19cd6cc977 ("net/mlx5: Move some PPS logic into helper functions") Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index b0e129d0f6d8..1e7f26b240de 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -495,15 +495,15 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, return -EINVAL; field_select = MLX5_MTPPS_FS_ENABLE; + pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index); + if (pin < 0) + return -EBUSY; + if (on) { bool rt_mode = mlx5_real_time_mode(mdev); u32 nsec; s64 sec; - pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index); - if (pin < 0) - return -EBUSY; - pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; ts.tv_sec = rq->perout.period.sec; From 1e74152ed065ef491c30ccbbe119992e3e5200be Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 2 Mar 2021 11:31:08 +0200 Subject: [PATCH 017/171] net/mlx5e: Check correct ip_version in decapsulation route resolution flow_attr->ip_version has the matching that should be done inner/outer. When working with chains, decapsulation is done on chain0 and next chain match on outer header which is the original inner which could be ipv4. So in tunnel route resolution we cannot use that to know which ip version we are at so save tun_ip_version when parsing the tunnel match and use that. Fixes: a508728a4c8b ("net/mlx5e: VF tunnel RX traffic offloading") Signed-off-by: Roi Dayan Reviewed-by: Dmytro Linkin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index f8075a604605..172e0474f2e6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -685,14 +685,14 @@ int mlx5e_tc_tun_route_lookup(struct mlx5e_priv *priv, u16 vport_num; int err = 0; - if (flow_attr->ip_version == 4) { + if (flow_attr->tun_ip_version == 4) { /* Addresses are swapped for decap */ attr.fl.fl4.saddr = esw_attr->rx_tun_attr->dst_ip.v4; attr.fl.fl4.daddr = esw_attr->rx_tun_attr->src_ip.v4; err = mlx5e_route_lookup_ipv4_get(priv, priv->netdev, &attr); } #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) - else if (flow_attr->ip_version == 6) { + else if (flow_attr->tun_ip_version == 6) { /* Addresses are swapped for decap */ attr.fl.fl6.saddr = esw_attr->rx_tun_attr->dst_ip.v6; attr.fl.fl6.daddr = esw_attr->rx_tun_attr->src_ip.v6; @@ -718,10 +718,10 @@ int mlx5e_tc_tun_route_lookup(struct mlx5e_priv *priv, esw_attr->rx_tun_attr->decap_vport = vport_num; out: - if (flow_attr->ip_version == 4) + if (flow_attr->tun_ip_version == 4) mlx5e_route_lookup_ipv4_put(&attr); #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) - else if (flow_attr->ip_version == 6) + else if (flow_attr->tun_ip_version == 6) mlx5e_route_lookup_ipv6_put(&attr); #endif return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 6a116335bb21..7f7b0f6dcdf9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -89,6 +89,7 @@ int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow, * required to establish routing. */ flow_flag_set(flow, TUN_RX); + flow->attr->tun_ip_version = ip_version; return 0; } @@ -1091,7 +1092,7 @@ int mlx5e_attach_decap_route(struct mlx5e_priv *priv, if (err || !esw_attr->rx_tun_attr->decap_vport) goto out; - key.ip_version = attr->ip_version; + key.ip_version = attr->tun_ip_version; if (key.ip_version == 4) key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4; else diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index 89003ae7775a..25c091795bcd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -79,6 +79,7 @@ struct mlx5_flow_attr { u8 inner_match_level; u8 outer_match_level; u8 ip_version; + u8 tun_ip_version; u32 flags; union { struct mlx5_esw_flow_attr esw_attr[0]; From f574531a0b77261478408e9c8f70d96dc701a35a Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Mon, 1 Mar 2021 13:07:08 +0200 Subject: [PATCH 018/171] net/mlx5: Disable VF tunnel TX offload if ignore_flow_level isn't supported VF tunnel TX traffic offload is adding flow which forward to flow tables with lower level, which isn't support on all FW versions and may cause firmware to fail with syndrome. Fixed by enabling VF tunnel TX offload only if flow table capability ignore_flow_level is enabled. Fixes: 10742efc20a4 ("net/mlx5e: VF tunnel TX traffic offloading") Signed-off-by: Maor Dickman Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 94cb0217b4f3..8694b83968b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -551,7 +551,8 @@ esw_setup_dests(struct mlx5_flow_destination *dest, if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) && MLX5_CAP_GEN(esw_attr->in_mdev, reg_c_preserve) && - mlx5_eswitch_vport_match_metadata_enabled(esw)) + mlx5_eswitch_vport_match_metadata_enabled(esw) && + MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level)) attr->flags |= MLX5_ESW_ATTR_FLAG_SRC_REWRITE; if (attr->dest_ft) { From 469549e4778a1e5ac4a7c6659c4b1a75a648bfdf Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 10 Feb 2021 10:33:13 +0200 Subject: [PATCH 019/171] net/mlx5e: Fix error flow in change profile Move priv memset from init to cleanup to avoid double priv cleanup that can happen on profile change if also roolback fails. Add missing cleanup flow in mlx5e_netdev_attach_profile(). Fixes: c4d7eb57687f ("net/mxl5e: Add change profile method") Signed-off-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 41 +++++++++++-------- .../ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 +- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 6b761fb8d269..33b418796e43 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5488,8 +5488,6 @@ int mlx5e_priv_init(struct mlx5e_priv *priv, struct net_device *netdev, struct mlx5_core_dev *mdev) { - memset(priv, 0, sizeof(*priv)); - /* priv init */ priv->mdev = mdev; priv->netdev = netdev; @@ -5522,12 +5520,18 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) { int i; + /* bail if change profile failed and also rollback failed */ + if (!priv->mdev) + return; + destroy_workqueue(priv->wq); free_cpumask_var(priv->scratchpad.cpumask); for (i = 0; i < priv->htb.max_qos_sqs; i++) kfree(priv->htb.qos_sq_stats[i]); kvfree(priv->htb.qos_sq_stats); + + memset(priv, 0, sizeof(*priv)); } struct net_device * @@ -5644,11 +5648,10 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv) } static int -mlx5e_netdev_attach_profile(struct mlx5e_priv *priv, +mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, const struct mlx5e_profile *new_profile, void *new_ppriv) { - struct net_device *netdev = priv->netdev; - struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_priv *priv = netdev_priv(netdev); int err; err = mlx5e_priv_init(priv, netdev, mdev); @@ -5661,10 +5664,16 @@ mlx5e_netdev_attach_profile(struct mlx5e_priv *priv, priv->ppriv = new_ppriv; err = new_profile->init(priv->mdev, priv->netdev); if (err) - return err; + goto priv_cleanup; err = mlx5e_attach_netdev(priv); if (err) - new_profile->cleanup(priv); + goto profile_cleanup; + return err; + +profile_cleanup: + new_profile->cleanup(priv); +priv_cleanup: + mlx5e_priv_cleanup(priv); return err; } @@ -5673,13 +5682,14 @@ int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, { unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile); const struct mlx5e_profile *orig_profile = priv->profile; + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; void *orig_ppriv = priv->ppriv; int err, rollback_err; /* sanity */ if (new_max_nch != priv->max_nch) { - netdev_warn(priv->netdev, - "%s: Replacing profile with different max channels\n", + netdev_warn(netdev, "%s: Replacing profile with different max channels\n", __func__); return -EINVAL; } @@ -5689,22 +5699,19 @@ int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, priv->profile->cleanup(priv); mlx5e_priv_cleanup(priv); - err = mlx5e_netdev_attach_profile(priv, new_profile, new_ppriv); + err = mlx5e_netdev_attach_profile(netdev, mdev, new_profile, new_ppriv); if (err) { /* roll back to original profile */ - netdev_warn(priv->netdev, "%s: new profile init failed, %d\n", - __func__, err); + netdev_warn(netdev, "%s: new profile init failed, %d\n", __func__, err); goto rollback; } return 0; rollback: - rollback_err = mlx5e_netdev_attach_profile(priv, orig_profile, orig_ppriv); - if (rollback_err) { - netdev_err(priv->netdev, - "%s: failed to rollback to orig profile, %d\n", + rollback_err = mlx5e_netdev_attach_profile(netdev, mdev, orig_profile, orig_ppriv); + if (rollback_err) + netdev_err(netdev, "%s: failed to rollback to orig profile, %d\n", __func__, rollback_err); - } return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 1eeca45cfcdf..756fa0401ab7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -694,6 +694,7 @@ static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev) static void mlx5_rdma_netdev_free(struct net_device *netdev) { struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; struct mlx5i_priv *ipriv = priv->ppriv; const struct mlx5e_profile *profile = priv->profile; @@ -702,7 +703,7 @@ static void mlx5_rdma_netdev_free(struct net_device *netdev) if (!ipriv->sub_interface) { mlx5i_pkey_qpn_ht_cleanup(netdev); - mlx5e_destroy_mdev_resources(priv->mdev); + mlx5e_destroy_mdev_resources(mdev); } } From 4806f1e2fee84c053cb68cd5be5817170bf0aab6 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 3 Mar 2021 14:36:16 +0200 Subject: [PATCH 020/171] net/mlx5: Set QP timestamp mode to default QPs which don't care from timestamp mode, should set the ts_format to default, otherwise the QP creation could be failed if the timestamp mode is not supported. Fixes: 2fe8d4b87802 ("RDMA/mlx5: Fail QP creation if the device can not support the CQE TS") Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 1 + include/linux/mlx5/qp.h | 7 +++++++ 4 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index 80da50e12915..bd66ab2af5b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -575,6 +575,7 @@ static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn, MLX5_SET(qpc, qpc, log_sq_size, ilog2(conn->qp.sq.size)); MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn); MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev)); MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma); if (MLX5_CAP_GEN(mdev, cqe_version) == 1) MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 756fa0401ab7..6f7cef47e04c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -233,6 +233,7 @@ int mlx5i_create_underlay_qp(struct mlx5e_priv *priv) } qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(priv->mdev)); MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 83c4c877d558..8a6a56f9dc4e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -169,6 +169,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt)); MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt)); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev)); MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma); if (MLX5_CAP_GEN(mdev, cqe_version) == 1) MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index d75ef8aa8fac..b7deb790f257 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -547,4 +547,11 @@ static inline const char *mlx5_qp_state_str(int state) } } +static inline int mlx5_get_qp_default_ts(struct mlx5_core_dev *dev) +{ + return !MLX5_CAP_ROCE(dev, qp_ts_format) ? + MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING : + MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT; +} + #endif /* MLX5_QP_H */ From 8256c69b2d9c35e94d0e424184c0d27b59bdee12 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 3 Mar 2021 14:39:20 +0200 Subject: [PATCH 021/171] RDMA/mlx5: Fix timestamp default mode 1. Don't set the ts_format bit to default when it reserved - device is running in the old mode (free running). 2. XRC doesn't have a CQ therefore the ts format in the QP context should be default / free running. 3. Set ts_format to WQ. Fixes: 2fe8d4b87802 ("RDMA/mlx5: Fail QP creation if the device can not support the CQE TS") Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/qp.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ec4b3f6a8222..f5a52a6fae43 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1078,7 +1078,7 @@ static int _create_kernel_qp(struct mlx5_ib_dev *dev, qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); MLX5_SET(qpc, qpc, uar_page, uar_index); - MLX5_SET(qpc, qpc, ts_format, MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev)); MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); /* Set "fast registration enabled" for all kernel QPs */ @@ -1188,7 +1188,8 @@ static int get_rq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq) } return MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING; } - return MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT; + return fr_supported ? MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING : + MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT; } static int get_sq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq) @@ -1206,7 +1207,8 @@ static int get_sq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq) } return MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING; } - return MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT; + return fr_supported ? MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING : + MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT; } static int get_qp_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq, @@ -1217,7 +1219,8 @@ static int get_qp_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq, MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING || MLX5_CAP_ROCE(dev->mdev, qp_ts_format) == MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME; - int ts_format = MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT; + int ts_format = fr_supported ? MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING : + MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT; if (recv_cq && recv_cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) @@ -1930,6 +1933,7 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->flags & IB_QP_CREATE_MANAGED_RECV) MLX5_SET(qpc, qpc, cd_slave_receive, 1); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev)); MLX5_SET(qpc, qpc, rq_type, MLX5_SRQ_RQ); MLX5_SET(qpc, qpc, no_sq, 1); MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); @@ -4873,6 +4877,7 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, struct mlx5_ib_dev *dev; int has_net_offloads; __be64 *rq_pas0; + int ts_format; void *in; void *rqc; void *wq; @@ -4881,6 +4886,10 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, dev = to_mdev(pd->device); + ts_format = get_rq_ts_format(dev, to_mcq(init_attr->cq)); + if (ts_format < 0) + return ts_format; + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas; in = kvzalloc(inlen, GFP_KERNEL); if (!in) @@ -4890,6 +4899,7 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, ts_format, ts_format); MLX5_SET(rqc, rqc, user_index, rwq->user_index); MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); From 8b90d897823b28a51811931f3bdc79f8df79407e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 17 Feb 2021 08:23:29 +0200 Subject: [PATCH 022/171] net/mlx5e: E-switch, Fix rate calculation division do_div() returns reminder, while cited patch wanted to use quotient. Fix it by using quotient. Fixes: 0e22bfb7c046 ("net/mlx5e: E-switch, Fix rate calculation for overflow") Signed-off-by: Parav Pandit Signed-off-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 0da69b98f38f..0cacf46dc950 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4445,7 +4445,8 @@ static int apply_police_params(struct mlx5e_priv *priv, u64 rate, */ if (rate) { rate = (rate * BITS_PER_BYTE) + 500000; - rate_mbps = max_t(u64, do_div(rate, 1000000), 1); + do_div(rate, 1000000); + rate_mbps = max_t(u32, rate, 1); } err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps); From 6a3717544ce9ee8a2058fbc75c67060515435937 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 26 Feb 2021 10:15:16 +0200 Subject: [PATCH 023/171] net/mlx5: SF, Correct vhca context size Fix vhca context size as defined by device interface specification. Fixes: f3196bb0f14c ("net/mlx5: Introduce vhca state event notifier") Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h index 1daf5a122ba3..4fc870140d71 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h @@ -20,7 +20,7 @@ struct mlx5_ifc_vhca_state_context_bits { u8 sw_function_id[0x20]; - u8 reserved_at_40[0x80]; + u8 reserved_at_40[0x40]; }; struct mlx5_ifc_query_vhca_state_out_bits { From 6fa37d66ef2dc850ff18b2a057a84cd7ca8499bb Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 2 Feb 2021 12:21:04 +0200 Subject: [PATCH 024/171] net/mlx5: SF: Fix memory leak of work item Cited patch in the fixes tag missed to free the allocated work. Fix it by freeing the work after work execution. Fixes: f3196bb0f14c ("net/mlx5: Introduce vhca state event notifier") Signed-off-by: Shay Drory Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c index af2f2dd9db25..f1c2068d4f2a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c @@ -94,6 +94,7 @@ static void mlx5_vhca_state_work_handler(struct work_struct *_work) struct mlx5_core_dev *dev = notifier->dev; mlx5_vhca_event_notify(dev, &work->event); + kfree(work); } static int From dc694f11a7593b7fd5aabe15a0e6c8fd2de24ebf Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Feb 2021 15:41:35 +0200 Subject: [PATCH 025/171] net/mlx5: SF: Fix error flow of SFs allocation flow When SF id is unavailable, code jumps to wrong label that accesses sw id array outside of its range. Hence, when SF id is not allocated, avoid accessing such array. Fixes: 8f0105418668 ("net/mlx5: SF, Add port add delete functionality") Signed-off-by: Shay Drory Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c index 58b6be0b03d7..0914909806cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c @@ -64,7 +64,7 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum) } if (sw_id == -ENOSPC) { err = -ENOSPC; - goto err; + goto exist_err; } hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sw_id); From 84076c4c800d1be77199a139d65b8b136a61422e Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sat, 6 Feb 2021 22:11:52 +0200 Subject: [PATCH 026/171] net/mlx5: DR, Fix potential shift wrapping of 32-bit value in STEv1 getter Fix 32-bit variable shift wrapping in dr_ste_v1_get_miss_addr. Fixes: a6098129c781 ("net/mlx5: DR, Add STEv1 setters and getters") Reported-by: Dan Carpenter Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c index 4088d6e51508..9143ec326ebf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c @@ -264,8 +264,8 @@ static void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr) static u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p) { u64 index = - (MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) | - MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32) << 26); + ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) | + ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32)) << 26); return index << 6; } From c4c877b2732466b4c63217baad05c96f775912c7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Mar 2021 01:38:09 +0100 Subject: [PATCH 027/171] net: Consolidate common blackhole dst ops Move generic blackhole dst ops to the core and use them from both ipv4_dst_blackhole_ops and ip6_dst_blackhole_ops where possible. No functional change otherwise. We need these also in other locations and having to define them over and over again is not great. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/dst.h | 11 +++++++++++ net/core/dst.c | 38 ++++++++++++++++++++++++++++++++++++++ net/ipv4/route.c | 45 ++++++++------------------------------------- net/ipv6/route.c | 36 +++++++++--------------------------- 4 files changed, 66 insertions(+), 64 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 26f134ad3a25..75b1e734e9c2 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -550,4 +550,15 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } +struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); +void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, bool confirm_neigh); +void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old); +struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr); +unsigned int dst_blackhole_mtu(const struct dst_entry *dst); + #endif /* _NET_DST_H */ diff --git a/net/core/dst.c b/net/core/dst.c index 0c01bd8d9d81..5f6315601776 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -237,6 +237,44 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); +struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; +} + +u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + +struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + return NULL; +} + +void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) +{ +} +EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu); + +void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} +EXPORT_SYMBOL_GPL(dst_blackhole_redirect); + +unsigned int dst_blackhole_mtu(const struct dst_entry *dst) +{ + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? : dst->dev->mtu; +} +EXPORT_SYMBOL_GPL(dst_blackhole_mtu); + static struct dst_ops md_dst_ops = { .family = AF_UNSPEC, }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 02d81d79deeb..bba150fdd265 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2687,44 +2687,15 @@ out: return rth; } -static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) -{ - return NULL; -} - -static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) -{ - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - - return mtu ? : dst->dev->mtu; -} - -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu, - bool confirm_neigh) -{ -} - -static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, - unsigned long old) -{ - return NULL; -} - static struct dst_ops ipv4_dst_blackhole_ops = { - .family = AF_INET, - .check = ipv4_blackhole_dst_check, - .mtu = ipv4_blackhole_mtu, - .default_advmss = ipv4_default_advmss, - .update_pmtu = ipv4_rt_blackhole_update_pmtu, - .redirect = ipv4_rt_blackhole_redirect, - .cow_metrics = ipv4_rt_blackhole_cow_metrics, - .neigh_lookup = ipv4_neigh_lookup, + .family = AF_INET, + .default_advmss = ipv4_default_advmss, + .neigh_lookup = ipv4_neigh_lookup, + .check = dst_blackhole_check, + .cow_metrics = dst_blackhole_cow_metrics, + .update_pmtu = dst_blackhole_update_pmtu, + .redirect = dst_blackhole_redirect, + .mtu = dst_blackhole_mtu, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1536f4948e86..1056b0229ffd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -260,34 +260,16 @@ static struct dst_ops ip6_dst_ops_template = { .confirm_neigh = ip6_confirm_neigh, }; -static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) -{ - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - - return mtu ? : dst->dev->mtu; -} - -static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu, - bool confirm_neigh) -{ -} - -static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - static struct dst_ops ip6_dst_blackhole_ops = { - .family = AF_INET6, - .destroy = ip6_dst_destroy, - .check = ip6_dst_check, - .mtu = ip6_blackhole_mtu, - .default_advmss = ip6_default_advmss, - .update_pmtu = ip6_rt_blackhole_update_pmtu, - .redirect = ip6_rt_blackhole_redirect, - .cow_metrics = dst_cow_metrics_generic, - .neigh_lookup = ip6_dst_neigh_lookup, + .family = AF_INET6, + .default_advmss = ip6_default_advmss, + .neigh_lookup = ip6_dst_neigh_lookup, + .check = ip6_dst_check, + .destroy = ip6_dst_destroy, + .cow_metrics = dst_cow_metrics_generic, + .update_pmtu = dst_blackhole_update_pmtu, + .redirect = dst_blackhole_redirect, + .mtu = dst_blackhole_mtu, }; static const u32 ip6_template_metrics[RTAX_MAX] = { From a188bb5638d41aa99090ebf2f85d3505ab13fba5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Mar 2021 01:38:10 +0100 Subject: [PATCH 028/171] net, bpf: Fix ip6ip6 crash with collect_md populated skbs I ran into a crash where setting up a ip6ip6 tunnel device which was /not/ set to collect_md mode was receiving collect_md populated skbs for xmit. The BPF prog was populating the skb via bpf_skb_set_tunnel_key() which is assigning special metadata dst entry and then redirecting the skb to the device, taking ip6_tnl_start_xmit() -> ipxip6_tnl_xmit() -> ip6_tnl_xmit() and in the latter it performs a neigh lookup based on skb_dst(skb) where we trigger a NULL pointer dereference on dst->ops->neigh_lookup() since the md_dst_ops do not populate neigh_lookup callback with a fake handler. Transform the md_dst_ops into generic dst_blackhole_ops that can also be reused elsewhere when needed, and use them for the metadata dst entries as callback ops. Also, remove the dst_md_discard{,_out}() ops and rely on dst_discard{,_out}() from dst_init() which free the skb the same way modulo the splat. Given we will be able to recover just fine from there, avoid any potential splats iff this gets ever triggered in future (or worse, panic on warns when set). Fixes: f38a9eb1f77b ("dst: Metadata destinations") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/dst.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index 5f6315601776..fb3bcba87744 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -275,37 +275,24 @@ unsigned int dst_blackhole_mtu(const struct dst_entry *dst) } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); -static struct dst_ops md_dst_ops = { - .family = AF_UNSPEC, +static struct dst_ops dst_blackhole_ops = { + .family = AF_UNSPEC, + .neigh_lookup = dst_blackhole_neigh_lookup, + .check = dst_blackhole_check, + .cow_metrics = dst_blackhole_cow_metrics, + .update_pmtu = dst_blackhole_update_pmtu, + .redirect = dst_blackhole_redirect, + .mtu = dst_blackhole_mtu, }; -static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - WARN_ONCE(1, "Attempting to call output on metadata dst\n"); - kfree_skb(skb); - return 0; -} - -static int dst_md_discard(struct sk_buff *skb) -{ - WARN_ONCE(1, "Attempting to call input on metadata dst\n"); - kfree_skb(skb); - return 0; -} - static void __metadata_dst_init(struct metadata_dst *md_dst, enum metadata_type type, u8 optslen) - { struct dst_entry *dst; dst = &md_dst->dst; - dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, + dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCOUNT); - - dst->input = dst_md_discard; - dst->output = dst_md_discard_out; - memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->type = type; } From 28259bac7f1dde06d8ba324e222bbec9d4e92f2b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 9 Mar 2021 18:20:35 -0800 Subject: [PATCH 029/171] ipv6: fix suspecious RCU usage warning Syzbot reported the suspecious RCU usage in nexthop_fib6_nh() when called from ipv6_route_seq_show(). The reason is ipv6_route_seq_start() calls rcu_read_lock_bh(), while nexthop_fib6_nh() calls rcu_dereference_rtnl(). The fix proposed is to add a variant of nexthop_fib6_nh() to use rcu_dereference_bh_rtnl() for ipv6_route_seq_show(). The reported trace is as follows: ./include/net/nexthop.h:416 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by syz-executor.0/17895: at: seq_read+0x71/0x12a0 fs/seq_file.c:169 at: seq_file_net include/linux/seq_file_net.h:19 [inline] at: ipv6_route_seq_start+0xaf/0x300 net/ipv6/ip6_fib.c:2616 stack backtrace: CPU: 1 PID: 17895 Comm: syz-executor.0 Not tainted 4.15.0-syzkaller #0 Call Trace: [] __dump_stack lib/dump_stack.c:17 [inline] [] dump_stack+0xd8/0x147 lib/dump_stack.c:53 [] lockdep_rcu_suspicious+0x153/0x15d kernel/locking/lockdep.c:5745 [] nexthop_fib6_nh include/net/nexthop.h:416 [inline] [] ipv6_route_native_seq_show net/ipv6/ip6_fib.c:2488 [inline] [] ipv6_route_seq_show+0x436/0x7a0 net/ipv6/ip6_fib.c:2673 [] seq_read+0xccf/0x12a0 fs/seq_file.c:276 [] proc_reg_read+0x10c/0x1d0 fs/proc/inode.c:231 [] do_loop_readv_writev fs/read_write.c:714 [inline] [] do_loop_readv_writev fs/read_write.c:701 [inline] [] do_iter_read+0x49e/0x660 fs/read_write.c:935 [] vfs_readv+0xfb/0x170 fs/read_write.c:997 [] kernel_readv fs/splice.c:361 [inline] [] default_file_splice_read+0x487/0x9c0 fs/splice.c:416 [] do_splice_to+0x129/0x190 fs/splice.c:879 [] splice_direct_to_actor+0x256/0x890 fs/splice.c:951 [] do_splice_direct+0x1dd/0x2b0 fs/splice.c:1060 [] do_sendfile+0x597/0xce0 fs/read_write.c:1459 [] SYSC_sendfile64 fs/read_write.c:1520 [inline] [] SyS_sendfile64+0x155/0x170 fs/read_write.c:1506 [] do_syscall_64+0x1ff/0x310 arch/x86/entry/common.c:305 [] entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: f88d8ea67fbdb ("ipv6: Plumb support for nexthop object in a fib6_info") Reported-by: syzbot Signed-off-by: Wei Wang Cc: David Ahern Cc: Ido Schimmel Cc: Petr Machata Cc: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 24 ++++++++++++++++++++++++ net/ipv6/ip6_fib.c | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 7bc057aee40b..a10a319d7eb2 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -410,6 +410,7 @@ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel) int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack); +/* Caller should either hold rcu_read_lock(), or RTNL. */ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { struct nh_info *nhi; @@ -430,6 +431,29 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) return NULL; } +/* Variant of nexthop_fib6_nh(). + * Caller should either hold rcu_read_lock_bh(), or RTNL. + */ +static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh) +{ + struct nh_info *nhi; + + if (nh->is_group) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp); + nh = nexthop_mpath_select(nh_grp, 0); + if (!nh) + return NULL; + } + + nhi = rcu_dereference_bh_rtnl(nh->nh_info); + if (nhi->family == AF_INET6) + return &nhi->fib6_nh; + + return NULL; +} + static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i) { struct fib6_nh *fib6_nh; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ef9d022e693f..679699e953f1 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2486,7 +2486,7 @@ static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) const struct net_device *dev; if (rt->nh) - fib6_nh = nexthop_fib6_nh(rt->nh); + fib6_nh = nexthop_fib6_nh_bh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); From 9398e9c0b1d44eeb700e9e766c02bcc765c82570 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 10 Mar 2021 12:28:01 +0200 Subject: [PATCH 030/171] drop_monitor: Perform cleanup upon probe registration failure In the rare case that drop_monitor fails to register its probe on the 'napi_poll' tracepoint, it will not deactivate its hysteresis timer as part of the error path. If the hysteresis timer was armed by the shortly lived 'kfree_skb' probe and user space retries to initiate tracing, a warning will be emitted for trying to initialize an active object [1]. Fix this by properly undoing all the operations that were done prior to probe registration, in both software and hardware code paths. Note that syzkaller managed to fail probe registration by injecting a slab allocation failure [2]. [1] ODEBUG: init active (active state 0) object type: timer_list hint: sched_send_work+0x0/0x60 include/linux/list.h:135 WARNING: CPU: 1 PID: 8649 at lib/debugobjects.c:505 debug_print_object+0x16e/0x250 lib/debugobjects.c:505 Modules linked in: CPU: 1 PID: 8649 Comm: syz-executor.0 Not tainted 5.11.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:debug_print_object+0x16e/0x250 lib/debugobjects.c:505 [...] Call Trace: __debug_object_init+0x524/0xd10 lib/debugobjects.c:588 debug_timer_init kernel/time/timer.c:722 [inline] debug_init kernel/time/timer.c:770 [inline] init_timer_key+0x2d/0x340 kernel/time/timer.c:814 net_dm_trace_on_set net/core/drop_monitor.c:1111 [inline] set_all_monitor_traces net/core/drop_monitor.c:1188 [inline] net_dm_monitor_start net/core/drop_monitor.c:1295 [inline] net_dm_cmd_trace+0x720/0x1220 net/core/drop_monitor.c:1339 genl_family_rcv_msg_doit+0x228/0x320 net/netlink/genetlink.c:739 genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] genl_rcv_msg+0x328/0x580 net/netlink/genetlink.c:800 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2502 genl_rcv+0x24/0x40 net/netlink/genetlink.c:811 netlink_unicast_kernel net/netlink/af_netlink.c:1312 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1338 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1927 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2348 ___sys_sendmsg+0xf3/0x170 net/socket.c:2402 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2435 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae [2] FAULT_INJECTION: forcing a failure. name failslab, interval 1, probability 0, space 0, times 1 CPU: 1 PID: 8645 Comm: syz-executor.0 Not tainted 5.11.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: dump_stack+0xfa/0x151 should_fail.cold+0x5/0xa should_failslab+0x5/0x10 __kmalloc+0x72/0x3f0 tracepoint_add_func+0x378/0x990 tracepoint_probe_register+0x9c/0xe0 net_dm_cmd_trace+0x7fc/0x1220 genl_family_rcv_msg_doit+0x228/0x320 genl_rcv_msg+0x328/0x580 netlink_rcv_skb+0x153/0x420 genl_rcv+0x24/0x40 netlink_unicast+0x533/0x7d0 netlink_sendmsg+0x856/0xd90 sock_sendmsg+0xcf/0x120 ____sys_sendmsg+0x6e8/0x810 ___sys_sendmsg+0xf3/0x170 __sys_sendmsg+0xe5/0x1b0 do_syscall_64+0x2d/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 70c69274f354 ("drop_monitor: Initialize timer and work item upon tracing enable") Fixes: 8ee2267ad33e ("drop_monitor: Convert to using devlink tracepoint") Reported-by: syzbot+779559d6503f3a56213d@syzkaller.appspotmail.com Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 571f191c06d9..db65ce62b625 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1053,6 +1053,20 @@ static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) return 0; err_module_put: + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&hw_data->send_timer); + cancel_work_sync(&hw_data->dm_alert_work); + while ((skb = __skb_dequeue(&hw_data->drop_queue))) { + struct devlink_trap_metadata *hw_metadata; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + net_dm_hw_metadata_free(hw_metadata); + consume_skb(skb); + } + } module_put(THIS_MODULE); return rc; } @@ -1134,6 +1148,15 @@ static int net_dm_trace_on_set(struct netlink_ext_ack *extack) err_unregister_trace: unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); err_module_put: + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&data->send_timer); + cancel_work_sync(&data->dm_alert_work); + while ((skb = __skb_dequeue(&data->drop_queue))) + consume_skb(skb); + } module_put(THIS_MODULE); return rc; } From dd4fa1dae9f4847cc1fd78ca468ad69e16e5db3e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Mar 2021 01:56:36 -0800 Subject: [PATCH 031/171] macvlan: macvlan_count_rx() needs to be aware of preemption macvlan_count_rx() can be called from process context, it is thus necessary to disable preemption before calling u64_stats_update_begin() syzbot was able to spot this on 32bit arch: WARNING: CPU: 1 PID: 4632 at include/linux/seqlock.h:271 __seqprop_assert include/linux/seqlock.h:271 [inline] WARNING: CPU: 1 PID: 4632 at include/linux/seqlock.h:271 __seqprop_assert.constprop.0+0xf0/0x11c include/linux/seqlock.h:269 Modules linked in: Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 4632 Comm: kworker/1:3 Not tainted 5.12.0-rc2-syzkaller #0 Hardware name: ARM-Versatile Express Workqueue: events macvlan_process_broadcast Backtrace: [<82740468>] (dump_backtrace) from [<827406dc>] (show_stack+0x18/0x1c arch/arm/kernel/traps.c:252) r7:00000080 r6:60000093 r5:00000000 r4:8422a3c4 [<827406c4>] (show_stack) from [<82751b58>] (__dump_stack lib/dump_stack.c:79 [inline]) [<827406c4>] (show_stack) from [<82751b58>] (dump_stack+0xb8/0xe8 lib/dump_stack.c:120) [<82751aa0>] (dump_stack) from [<82741270>] (panic+0x130/0x378 kernel/panic.c:231) r7:830209b4 r6:84069ea4 r5:00000000 r4:844350d0 [<82741140>] (panic) from [<80244924>] (__warn+0xb0/0x164 kernel/panic.c:605) r3:8404ec8c r2:00000000 r1:00000000 r0:830209b4 r7:0000010f [<80244874>] (__warn) from [<82741520>] (warn_slowpath_fmt+0x68/0xd4 kernel/panic.c:628) r7:81363f70 r6:0000010f r5:83018e50 r4:00000000 [<827414bc>] (warn_slowpath_fmt) from [<81363f70>] (__seqprop_assert include/linux/seqlock.h:271 [inline]) [<827414bc>] (warn_slowpath_fmt) from [<81363f70>] (__seqprop_assert.constprop.0+0xf0/0x11c include/linux/seqlock.h:269) r8:5a109000 r7:0000000f r6:a568dac0 r5:89802300 r4:00000001 [<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (u64_stats_update_begin include/linux/u64_stats_sync.h:128 [inline]) [<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (macvlan_count_rx include/linux/if_macvlan.h:47 [inline]) [<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (macvlan_broadcast+0x154/0x26c drivers/net/macvlan.c:291) r5:89802300 r4:8a927740 [<8136499c>] (macvlan_broadcast) from [<81365020>] (macvlan_process_broadcast+0x258/0x2d0 drivers/net/macvlan.c:317) r10:81364f78 r9:8a86d000 r8:8a9c7e7c r7:8413aa5c r6:00000000 r5:00000000 r4:89802840 [<81364dc8>] (macvlan_process_broadcast) from [<802696a4>] (process_one_work+0x2d4/0x998 kernel/workqueue.c:2275) r10:00000008 r9:8404ec98 r8:84367a02 r7:ddfe6400 r6:ddfe2d40 r5:898dac80 r4:8a86d43c [<802693d0>] (process_one_work) from [<80269dcc>] (worker_thread+0x64/0x54c kernel/workqueue.c:2421) r10:00000008 r9:8a9c6000 r8:84006d00 r7:ddfe2d78 r6:898dac94 r5:ddfe2d40 r4:898dac80 [<80269d68>] (worker_thread) from [<80271f40>] (kthread+0x184/0x1a4 kernel/kthread.c:292) r10:85247e64 r9:898dac80 r8:80269d68 r7:00000000 r6:8a9c6000 r5:89a2ee40 r4:8a97bd00 [<80271dbc>] (kthread) from [<80200114>] (ret_from_fork+0x14/0x20 arch/arm/kernel/entry-common.S:158) Exception stack(0x8a9c7fb0 to 0x8a9c7ff8) Fixes: 412ca1550cbe ("macvlan: Move broadcasts into a work queue") Signed-off-by: Eric Dumazet Cc: Herbert Xu Reported-by: syzbot Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/if_macvlan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 96556c64c95d..10c94a3936ca 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -43,13 +43,14 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, if (likely(success)) { struct vlan_pcpu_stats *pcpu_stats; - pcpu_stats = this_cpu_ptr(vlan->pcpu_stats); + pcpu_stats = get_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); pcpu_stats->rx_packets++; pcpu_stats->rx_bytes += len; if (multicast) pcpu_stats->rx_multicast++; u64_stats_update_end(&pcpu_stats->syncp); + put_cpu_ptr(vlan->pcpu_stats); } else { this_cpu_inc(vlan->pcpu_stats->rx_errors); } From 0571a753cb07982cc82f4a5115e0b321da89e1f3 Mon Sep 17 00:00:00 2001 From: Pavel Andrianov Date: Wed, 10 Mar 2021 11:10:46 +0300 Subject: [PATCH 032/171] net: pxa168_eth: Fix a potential data race in pxa168_eth_remove pxa168_eth_remove() firstly calls unregister_netdev(), then cancels a timeout work. unregister_netdev() shuts down a device interface and removes it from the kernel tables. If the timeout occurs in parallel, the timeout work (pxa168_eth_tx_timeout_task) performs stop and open of the device. It may lead to an inconsistent state and memory leaks. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Pavel Andrianov Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/pxa168_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index d1e4d42e497d..3712e1786091 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -1544,8 +1544,8 @@ static int pxa168_eth_remove(struct platform_device *pdev) clk_disable_unprepare(pep->clk); mdiobus_unregister(pep->smi_bus); mdiobus_free(pep->smi_bus); - unregister_netdev(dev); cancel_work_sync(&pep->tx_timeout_task); + unregister_netdev(dev); free_netdev(dev); return 0; } From 8373a0fe9c7160a55482effa8a3f725efd3f8434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 10 Mar 2021 13:51:59 +0100 Subject: [PATCH 033/171] net: dsa: bcm_sf2: use 2 Gbps IMP port link on BCM4908 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM4908 uses 2 Gbps link between switch and the Ethernet interface. Without this BCM4908 devices were able to achieve only 2 x ~895 Mb/s. This allows handling e.g. NAT traffic with 940 Mb/s. Signed-off-by: Rafał Miłecki Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index f277df922fcd..4dedd6e0b11b 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -114,7 +114,10 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) /* Force link status for IMP port */ reg = core_readl(priv, offset); reg |= (MII_SW_OR | LINK_STS); - reg &= ~GMII_SPEED_UP_2G; + if (priv->type == BCM4908_DEVICE_ID) + reg |= GMII_SPEED_UP_2G; + else + reg &= ~GMII_SPEED_UP_2G; core_writel(priv, reg, offset); /* Enable Broadcast, Multicast, Unicast forwarding to IMP port */ From e323d865b36134e8c5c82c834df89109a5c60dab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Mar 2021 08:26:41 -0800 Subject: [PATCH 034/171] net: sched: validate stab values iproute2 package is well behaved, but malicious user space can provide illegal shift values and trigger UBSAN reports. Add stab parameter to red_check_params() to validate user input. syzbot reported: UBSAN: shift-out-of-bounds in ./include/net/red.h:312:18 shift exponent 111 is too large for 64-bit type 'long unsigned int' CPU: 1 PID: 14662 Comm: syz-executor.3 Not tainted 5.12.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:327 red_calc_qavg_from_idle_time include/net/red.h:312 [inline] red_calc_qavg include/net/red.h:353 [inline] choke_enqueue.cold+0x18/0x3dd net/sched/sch_choke.c:221 __dev_xmit_skb net/core/dev.c:3837 [inline] __dev_queue_xmit+0x1943/0x2e00 net/core/dev.c:4150 neigh_hh_output include/net/neighbour.h:499 [inline] neigh_output include/net/neighbour.h:508 [inline] ip6_finish_output2+0x911/0x1700 net/ipv6/ip6_output.c:117 __ip6_finish_output net/ipv6/ip6_output.c:182 [inline] __ip6_finish_output+0x4c1/0xe10 net/ipv6/ip6_output.c:161 ip6_finish_output+0x35/0x200 net/ipv6/ip6_output.c:192 NF_HOOK_COND include/linux/netfilter.h:290 [inline] ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:215 dst_output include/net/dst.h:448 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] NF_HOOK include/linux/netfilter.h:295 [inline] ip6_xmit+0x127e/0x1eb0 net/ipv6/ip6_output.c:320 inet6_csk_xmit+0x358/0x630 net/ipv6/inet6_connection_sock.c:135 dccp_transmit_skb+0x973/0x12c0 net/dccp/output.c:138 dccp_send_reset+0x21b/0x2b0 net/dccp/output.c:535 dccp_finish_passive_close net/dccp/proto.c:123 [inline] dccp_finish_passive_close+0xed/0x140 net/dccp/proto.c:118 dccp_terminate_connection net/dccp/proto.c:958 [inline] dccp_close+0xb3c/0xe60 net/dccp/proto.c:1028 inet_release+0x12e/0x280 net/ipv4/af_inet.c:431 inet6_release+0x4c/0x70 net/ipv6/af_inet6.c:478 __sock_release+0xcd/0x280 net/socket.c:599 sock_close+0x18/0x20 net/socket.c:1258 __fput+0x288/0x920 fs/file_table.c:280 task_work_run+0xdd/0x1a0 kernel/task_work.c:140 tracehook_notify_resume include/linux/tracehook.h:189 [inline] Fixes: 8afa10cbe281 ("net_sched: red: Avoid illegal values") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/red.h | 10 +++++++++- net/sched/sch_choke.c | 7 ++++--- net/sched/sch_gred.c | 2 +- net/sched/sch_red.c | 7 +++++-- net/sched/sch_sfq.c | 2 +- 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/include/net/red.h b/include/net/red.h index 932f0d79d60c..9e6647c4ccd1 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -168,7 +168,8 @@ static inline void red_set_vars(struct red_vars *v) v->qcount = -1; } -static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_log) +static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, + u8 Scell_log, u8 *stab) { if (fls(qth_min) + Wlog > 32) return false; @@ -178,6 +179,13 @@ static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_ return false; if (qth_max < qth_min) return false; + if (stab) { + int i; + + for (i = 0; i < RED_STAB_SIZE; i++) + if (stab[i] >= 32) + return false; + } return true; } diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 50f680f03a54..2adbd945bf15 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -345,6 +345,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, struct sk_buff **old = NULL; unsigned int mask; u32 max_P; + u8 *stab; if (opt == NULL) return -EINVAL; @@ -361,8 +362,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; ctl = nla_data(tb[TCA_CHOKE_PARMS]); - - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) + stab = nla_data(tb[TCA_CHOKE_STAB]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) return -EINVAL; if (ctl->limit > CHOKE_MAX_QUEUE) @@ -412,7 +413,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, - nla_data(tb[TCA_CHOKE_STAB]), + stab, max_P); red_set_vars(&q->vars); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index e0bc77533acc..f4132dc25ac0 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -480,7 +480,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) { + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) { NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index b4ae34d7aa96..40adf1f07a82 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -242,6 +242,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, unsigned char flags; int err; u32 max_P; + u8 *stab; if (tb[TCA_RED_PARMS] == NULL || tb[TCA_RED_STAB] == NULL) @@ -250,7 +251,9 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) + stab = nla_data(tb[TCA_RED_STAB]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Scell_log, stab)) return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, @@ -288,7 +291,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, - nla_data(tb[TCA_RED_STAB]), + stab, max_P); red_set_vars(&q->vars); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b25e51440623..066754a18569 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -647,7 +647,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) } if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, - ctl_v1->Wlog, ctl_v1->Scell_log)) + ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); From d45c36bafb94e72fdb6dee437279b61b6d97e706 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 10 Mar 2021 10:46:10 -0800 Subject: [PATCH 035/171] net: dsa: b53: VLAN filtering is global to all users The bcm_sf2 driver uses the b53 driver as a library but does not make usre of the b53_setup() function, this made it fail to inherit the vlan_filtering_is_global attribute. Fix this by moving the assignment to b53_switch_alloc() which is used by bcm_sf2. Fixes: 7228b23e68f7 ("net: dsa: b53: Let DSA handle mismatched VLAN filtering settings") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index a162499bcafc..eb443721c58e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1105,13 +1105,6 @@ static int b53_setup(struct dsa_switch *ds) b53_disable_port(ds, port); } - /* Let DSA handle the case were multiple bridges span the same switch - * device and different VLAN awareness settings are requested, which - * would be breaking filtering semantics for any of the other bridge - * devices. (not hardware supported) - */ - ds->vlan_filtering_is_global = true; - return b53_setup_devlink_resources(ds); } @@ -2664,6 +2657,13 @@ struct b53_device *b53_switch_alloc(struct device *base, ds->ops = &b53_switch_ops; ds->untag_bridge_pvid = true; dev->vlan_enabled = true; + /* Let DSA handle the case were multiple bridges span the same switch + * device and different VLAN awareness settings are requested, which + * would be breaking filtering semantics for any of the other bridge + * devices. (not hardware supported) + */ + ds->vlan_filtering_is_global = true; + mutex_init(&dev->reg_mutex); mutex_init(&dev->stats_mutex); From 47142ed6c34d544ae9f0463e58d482289cbe0d46 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 10 Mar 2021 14:17:58 -0800 Subject: [PATCH 036/171] net: dsa: bcm_sf2: Qualify phydev->dev_flags based on port Similar to commit 92696286f3bb37ba50e4bd8d1beb24afb759a799 ("net: bcmgenet: Set phydev->dev_flags only for internal PHYs") we need to qualify the phydev->dev_flags based on whether the port is connected to an internal or external PHY otherwise we risk having a flags collision with a completely different interpretation depending on the driver. Fixes: aa9aef77c761 ("net: dsa: bcm_sf2: communicate integrated PHY revision to PHY driver") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 4dedd6e0b11b..ba5d546d06aa 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -588,8 +588,10 @@ static u32 bcm_sf2_sw_get_phy_flags(struct dsa_switch *ds, int port) * in bits 15:8 and the patch level in bits 7:0 which is exactly what * the REG_PHY_REVISION register layout is. */ - - return priv->hw_params.gphy_rev; + if (priv->int_phy_mask & BIT(port)) + return priv->hw_params.gphy_rev; + else + return 0; } static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port, From 6da262378c99b17b1a1ac2e42aa65acc1bd471c7 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 20 Oct 2020 16:34:00 +0300 Subject: [PATCH 037/171] igc: reinit_locked() should be called with rtnl_lock This commit applies to the igc_reset_task the same changes that were applied to the igb driver in commit 024a8168b749 ("igb: reinit_locked() should be called with rtnl_lock") and fix possible race in reset subtask. Fixes: 0507ef8a0372 ("igc: Add transmit and receive fastpath and interrupt handlers") Suggested-by: Jakub Kicinski Signed-off-by: Sasha Neftin Tested-by: Dvora Fuxbrumer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 7ac9597ddb84..4d989ebc9713 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3831,10 +3831,19 @@ static void igc_reset_task(struct work_struct *work) adapter = container_of(work, struct igc_adapter, reset_task); + rtnl_lock(); + /* If we're already down or resetting, just bail */ + if (test_bit(__IGC_DOWN, &adapter->state) || + test_bit(__IGC_RESETTING, &adapter->state)) { + rtnl_unlock(); + return; + } + igc_rings_dump(adapter); igc_regs_dump(adapter); netdev_err(adapter->netdev, "Reset adapter\n"); igc_reinit_locked(adapter); + rtnl_unlock(); } /** From 8876529465c368beafd51a70f79d7a738f2aadf4 Mon Sep 17 00:00:00 2001 From: Muhammad Husaini Zulkifli Date: Sat, 20 Feb 2021 00:36:47 +0800 Subject: [PATCH 038/171] igc: Fix Pause Frame Advertising Fix Pause Frame Advertising when getting the advertisement via ethtool. Remove setting the "advertising" bit in link_ksettings during default case when Tx and Rx are in off state with Auto Negotiate off. Below is the original output of advertisement link during Tx and Rx off: Advertised pause frame use: Symmetric Receive-only Expected output: Advertised pause frame use: No Fixes: 8c5ad0dae93c ("igc: Add ethtool support") Signed-off-by: Muhammad Husaini Zulkifli Reviewed-by: Malli C Acked-by: Sasha Neftin Tested-by: Dvora Fuxbrumer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 824a6c454bca..67a4aed45fc2 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1725,9 +1725,7 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev, Asym_Pause); break; default: - ethtool_link_ksettings_add_link_mode(cmd, advertising, Pause); - ethtool_link_ksettings_add_link_mode(cmd, advertising, - Asym_Pause); + break; } status = pm_runtime_suspended(&adapter->pdev->dev) ? From 9a4a1cdc5ab52118c1f2b216f4240830b6528d32 Mon Sep 17 00:00:00 2001 From: Muhammad Husaini Zulkifli Date: Sat, 20 Feb 2021 00:36:48 +0800 Subject: [PATCH 039/171] igc: Fix Supported Pause Frame Link Setting The Supported Pause Frame always display "No" even though the Advertised pause frame showing the correct setting based on the pause parameters via ethtool. Set bit in link_ksettings to "Supported" for Pause Frame. Before output: Supported pause frame use: No Expected output: Supported pause frame use: Symmetric Fixes: 8c5ad0dae93c ("igc: Add ethtool support") Signed-off-by: Muhammad Husaini Zulkifli Reviewed-by: Malli C Tested-by: Dvora Fuxbrumer Acked-by: Sasha Neftin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 67a4aed45fc2..8722294ab90c 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1711,6 +1711,9 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev, Autoneg); } + /* Set pause flow control settings */ + ethtool_link_ksettings_add_link_mode(cmd, supported, Pause); + switch (hw->fc.requested_mode) { case igc_fc_full: ethtool_link_ksettings_add_link_mode(cmd, advertising, Pause); From fc9e5020971d57d7d0b3fef9e2ab2108fcb5588b Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Tue, 9 Mar 2021 22:42:56 -0800 Subject: [PATCH 040/171] igc: Fix igc_ptp_rx_pktstamp() The comment describing the timestamps layout in the packet buffer is wrong and the code is actually retrieving the timestamp in Timer 1 reference instead of Timer 0. This hasn't been a big issue so far because hardware is configured to report both timestamps using Timer 0 (see IGC_SRRCTL register configuration in igc_ptp_enable_rx_timestamp() helper). This patch fixes the comment and the code so we retrieve the timestamp in Timer 0 reference as expected. This patch also takes the opportunity to get rid of the hw.mac.type check since it is not required. Fixes: 81b055205e8ba ("igc: Add support for RX timestamping") Signed-off-by: Andre Guedes Signed-off-by: Vedang Patel Signed-off-by: Jithu Joseph Reviewed-by: Maciej Fijalkowski Tested-by: Dvora Fuxbrumer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 2 +- drivers/net/ethernet/intel/igc/igc_ptp.c | 68 +++++++++++++----------- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 5d2809dfd06a..1b08a7dc7bc4 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -547,7 +547,7 @@ void igc_ptp_init(struct igc_adapter *adapter); void igc_ptp_reset(struct igc_adapter *adapter); void igc_ptp_suspend(struct igc_adapter *adapter); void igc_ptp_stop(struct igc_adapter *adapter); -void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, void *va, +void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, __le32 *va, struct sk_buff *skb); int igc_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr); int igc_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr); diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index ac0b9c85da7c..545f4d0e67cf 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -152,46 +152,54 @@ static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, } /** - * igc_ptp_rx_pktstamp - retrieve Rx per packet timestamp + * igc_ptp_rx_pktstamp - Retrieve timestamp from Rx packet buffer * @q_vector: Pointer to interrupt specific structure * @va: Pointer to address containing Rx buffer * @skb: Buffer containing timestamp and packet * - * This function is meant to retrieve the first timestamp from the - * first buffer of an incoming frame. The value is stored in little - * endian format starting on byte 0. There's a second timestamp - * starting on byte 8. - **/ -void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, void *va, + * This function retrieves the timestamp saved in the beginning of packet + * buffer. While two timestamps are available, one in timer0 reference and the + * other in timer1 reference, this function considers only the timestamp in + * timer0 reference. + */ +void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, __le32 *va, struct sk_buff *skb) { struct igc_adapter *adapter = q_vector->adapter; - __le64 *regval = (__le64 *)va; - int adjust = 0; + u64 regval; + int adjust; - /* The timestamp is recorded in little endian format. - * DWORD: | 0 | 1 | 2 | 3 - * Field: | Timer0 Low | Timer0 High | Timer1 Low | Timer1 High + /* Timestamps are saved in little endian at the beginning of the packet + * buffer following the layout: + * + * DWORD: | 0 | 1 | 2 | 3 | + * Field: | Timer1 SYSTIML | Timer1 SYSTIMH | Timer0 SYSTIML | Timer0 SYSTIMH | + * + * SYSTIML holds the nanoseconds part while SYSTIMH holds the seconds + * part of the timestamp. */ - igc_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb), - le64_to_cpu(regval[0])); + regval = le32_to_cpu(va[2]); + regval |= (u64)le32_to_cpu(va[3]) << 32; + igc_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb), regval); - /* adjust timestamp for the RX latency based on link speed */ - if (adapter->hw.mac.type == igc_i225) { - switch (adapter->link_speed) { - case SPEED_10: - adjust = IGC_I225_RX_LATENCY_10; - break; - case SPEED_100: - adjust = IGC_I225_RX_LATENCY_100; - break; - case SPEED_1000: - adjust = IGC_I225_RX_LATENCY_1000; - break; - case SPEED_2500: - adjust = IGC_I225_RX_LATENCY_2500; - break; - } + /* Adjust timestamp for the RX latency based on link speed */ + switch (adapter->link_speed) { + case SPEED_10: + adjust = IGC_I225_RX_LATENCY_10; + break; + case SPEED_100: + adjust = IGC_I225_RX_LATENCY_100; + break; + case SPEED_1000: + adjust = IGC_I225_RX_LATENCY_1000; + break; + case SPEED_2500: + adjust = IGC_I225_RX_LATENCY_2500; + break; + default: + adjust = 0; + netdev_warn_once(adapter->netdev, "Imprecise timestamp\n"); + break; } skb_hwtstamps(skb)->hwtstamp = ktime_sub_ns(skb_hwtstamps(skb)->hwtstamp, adjust); From 21f857f0321d0d0ea9b1a758bd55dc63d1cb2437 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Wed, 21 Oct 2020 14:59:37 +0300 Subject: [PATCH 041/171] e1000e: add rtnl_lock() to e1000_reset_task A possible race condition was found in e1000_reset_task, after discovering a similar issue in igb driver via commit 024a8168b749 ("igb: reinit_locked() should be called with rtnl_lock"). Added rtnl_lock() and rtnl_unlock() to avoid this. Fixes: bc7f75fa9788 ("[E1000E]: New pci-express e1000 driver (currently for ICH9 devices only)") Suggested-by: Jakub Kicinski Signed-off-by: Vitaly Lifshits Tested-by: Dvora Fuxbrumer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/netdev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index e9b82c209c2d..a0948002ddf8 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5974,15 +5974,19 @@ static void e1000_reset_task(struct work_struct *work) struct e1000_adapter *adapter; adapter = container_of(work, struct e1000_adapter, reset_task); + rtnl_lock(); /* don't run the task if already down */ - if (test_bit(__E1000_DOWN, &adapter->state)) + if (test_bit(__E1000_DOWN, &adapter->state)) { + rtnl_unlock(); return; + } if (!(adapter->flags & FLAG_RESTART_NOW)) { e1000e_dump(adapter); e_err("Reset adapter unexpectedly\n"); } e1000e_reinit_locked(adapter); + rtnl_unlock(); } /** From b52912b8293f2c496f42583e65599aee606a0c18 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sun, 28 Feb 2021 17:44:23 +0800 Subject: [PATCH 042/171] e1000e: Fix error handling in e1000_set_d0_lplu_state_82571 There is one e1e_wphy() call in e1000_set_d0_lplu_state_82571 that we have caught its return value but lack further handling. Check and terminate the execution flow just like other e1e_wphy() in this function. Fixes: bc7f75fa9788 ("[E1000E]: New pci-express e1000 driver (currently for ICH9 devices only)") Signed-off-by: Dinghao Liu Acked-by: Sasha Neftin Tested-by: Dvora Fuxbrumer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/82571.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c index 88faf05e23ba..0b1e890dd583 100644 --- a/drivers/net/ethernet/intel/e1000e/82571.c +++ b/drivers/net/ethernet/intel/e1000e/82571.c @@ -899,6 +899,8 @@ static s32 e1000_set_d0_lplu_state_82571(struct e1000_hw *hw, bool active) } else { data &= ~IGP02E1000_PM_D0_LPLU; ret_val = e1e_wphy(hw, IGP02E1000_PHY_POWER_MGMT, data); + if (ret_val) + return ret_val; /* LPLU and SmartSpeed are mutually exclusive. LPLU is used * during Dx states where the power conservation is most * important. During driver activity we should enable From b80350f393703fa2e733921430276c98bbc092de Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 11 Mar 2021 10:57:36 +0800 Subject: [PATCH 043/171] net: sock: simplify tw proto registration Introduce the new function tw_prot_init (inspired by req_prot_init) to simplify "proto_register" function. tw_prot_cleanup will take care of a partially initialized timewait_sock_ops. Signed-off-by: Tonghao Zhang Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/sock.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 0ed98f20448a..cc31b601ae10 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3440,6 +3440,32 @@ static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) twsk_prot->twsk_slab = NULL; } +static int tw_prot_init(const struct proto *prot) +{ + struct timewait_sock_ops *twsk_prot = prot->twsk_prot; + + if (!twsk_prot) + return 0; + + twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", + prot->name); + if (!twsk_prot->twsk_slab_name) + return -ENOMEM; + + twsk_prot->twsk_slab = + kmem_cache_create(twsk_prot->twsk_slab_name, + twsk_prot->twsk_obj_size, 0, + SLAB_ACCOUNT | prot->slab_flags, + NULL); + if (!twsk_prot->twsk_slab) { + pr_crit("%s: Can't create timewait sock SLAB cache!\n", + prot->name); + return -ENOMEM; + } + + return 0; +} + static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) @@ -3496,22 +3522,8 @@ int proto_register(struct proto *prot, int alloc_slab) if (req_prot_init(prot)) goto out_free_request_sock_slab; - if (prot->twsk_prot != NULL) { - prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); - - if (prot->twsk_prot->twsk_slab_name == NULL) - goto out_free_request_sock_slab; - - prot->twsk_prot->twsk_slab = - kmem_cache_create(prot->twsk_prot->twsk_slab_name, - prot->twsk_prot->twsk_obj_size, - 0, - SLAB_ACCOUNT | - prot->slab_flags, - NULL); - if (prot->twsk_prot->twsk_slab == NULL) - goto out_free_timewait_sock_slab; - } + if (tw_prot_init(prot)) + goto out_free_timewait_sock_slab; } mutex_lock(&proto_list_mutex); From db74623a3850db99cb9692fda9e836a56b74198d Mon Sep 17 00:00:00 2001 From: Lv Yunlong Date: Wed, 10 Mar 2021 20:01:40 -0800 Subject: [PATCH 044/171] net/qlcnic: Fix a use after free in qlcnic_83xx_get_minidump_template In qlcnic_83xx_get_minidump_template, fw_dump->tmpl_hdr was freed by vfree(). But unfortunately, it is used when extended is true. Fixes: 7061b2bdd620e ("qlogic: Deletion of unnecessary checks before two function calls") Signed-off-by: Lv Yunlong Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c index 7760a3394e93..7ecb3dfe30bd 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c @@ -1425,6 +1425,7 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) if (fw_dump->tmpl_hdr == NULL || current_version > prev_version) { vfree(fw_dump->tmpl_hdr); + fw_dump->tmpl_hdr = NULL; if (qlcnic_83xx_md_check_extended_dump_capability(adapter)) extended = !qlcnic_83xx_extend_md_capab(adapter); @@ -1443,6 +1444,8 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) struct qlcnic_83xx_dump_template_hdr *hdr; hdr = fw_dump->tmpl_hdr; + if (!hdr) + return; hdr->drv_cap_mask = 0x1f; fw_dump->cap_mask = 0x1f; dev_info(&pdev->dev, From a9f81244d2e33e6dfcef120fefd30c96b3f7cdb0 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Wed, 10 Mar 2021 23:27:35 -0500 Subject: [PATCH 045/171] mISDN: fix crash in fritzpci setup_fritz() in avmfritz.c might fail with -EIO and in this case the isac.type and isac.write_reg is not initialized and remains 0(NULL). A subsequent call to isac_release() will dereference isac->write_reg and crash. [ 1.737444] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 1.737809] #PF: supervisor instruction fetch in kernel mode [ 1.738106] #PF: error_code(0x0010) - not-present page [ 1.738378] PGD 0 P4D 0 [ 1.738515] Oops: 0010 [#1] SMP NOPTI [ 1.738711] CPU: 0 PID: 180 Comm: systemd-udevd Not tainted 5.12.0-rc2+ #78 [ 1.739077] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-p rebuilt.qemu.org 04/01/2014 [ 1.739664] RIP: 0010:0x0 [ 1.739807] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6. [ 1.740200] RSP: 0018:ffffc9000027ba10 EFLAGS: 00010202 [ 1.740478] RAX: 0000000000000000 RBX: ffff888102f41840 RCX: 0000000000000027 [ 1.740853] RDX: 00000000000000ff RSI: 0000000000000020 RDI: ffff888102f41800 [ 1.741226] RBP: ffffc9000027ba20 R08: ffff88817bc18440 R09: ffffc9000027b808 [ 1.741600] R10: 0000000000000001 R11: 0000000000000001 R12: ffff888102f41840 [ 1.741976] R13: 00000000fffffffb R14: ffff888102f41800 R15: ffff8881008b0000 [ 1.742351] FS: 00007fda3a38a8c0(0000) GS:ffff88817bc00000(0000) knlGS:0000000000000000 [ 1.742774] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.743076] CR2: ffffffffffffffd6 CR3: 00000001021ec000 CR4: 00000000000006f0 [ 1.743452] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1.743828] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1.744206] Call Trace: [ 1.744339] isac_release+0xcc/0xe0 [mISDNipac] [ 1.744582] fritzpci_probe.cold+0x282/0x739 [avmfritz] [ 1.744861] local_pci_probe+0x48/0x80 [ 1.745063] pci_device_probe+0x10f/0x1c0 [ 1.745278] really_probe+0xfb/0x420 [ 1.745471] driver_probe_device+0xe9/0x160 [ 1.745693] device_driver_attach+0x5d/0x70 [ 1.745917] __driver_attach+0x8f/0x150 [ 1.746123] ? device_driver_attach+0x70/0x70 [ 1.746354] bus_for_each_dev+0x7e/0xc0 [ 1.746560] driver_attach+0x1e/0x20 [ 1.746751] bus_add_driver+0x152/0x1f0 [ 1.746957] driver_register+0x74/0xd0 [ 1.747157] ? 0xffffffffc00d8000 [ 1.747334] __pci_register_driver+0x54/0x60 [ 1.747562] AVM_init+0x36/0x1000 [avmfritz] [ 1.747791] do_one_initcall+0x48/0x1d0 [ 1.747997] ? __cond_resched+0x19/0x30 [ 1.748206] ? kmem_cache_alloc_trace+0x390/0x440 [ 1.748458] ? do_init_module+0x28/0x250 [ 1.748669] do_init_module+0x62/0x250 [ 1.748870] load_module+0x23ee/0x26a0 [ 1.749073] __do_sys_finit_module+0xc2/0x120 [ 1.749307] ? __do_sys_finit_module+0xc2/0x120 [ 1.749549] __x64_sys_finit_module+0x1a/0x20 [ 1.749782] do_syscall_64+0x38/0x90 Signed-off-by: Tong Zhang Signed-off-by: David S. Miller --- drivers/isdn/hardware/mISDN/mISDNipac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/isdn/hardware/mISDN/mISDNipac.c b/drivers/isdn/hardware/mISDN/mISDNipac.c index ec475087fbf9..39f841b42488 100644 --- a/drivers/isdn/hardware/mISDN/mISDNipac.c +++ b/drivers/isdn/hardware/mISDN/mISDNipac.c @@ -694,7 +694,7 @@ isac_release(struct isac_hw *isac) { if (isac->type & IPAC_TYPE_ISACX) WriteISAC(isac, ISACX_MASK, 0xff); - else + else if (isac->type != 0) WriteISAC(isac, ISAC_MASK, 0xff); if (isac->dch.timer.function != NULL) { del_timer(&isac->dch.timer); From 7a1468ba0e02eee24ae1353e8933793a27198e20 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 10 Mar 2021 20:53:42 -0800 Subject: [PATCH 046/171] net: phy: broadcom: Add power down exit reset state delay Per the datasheet, when we clear the power down bit, the PHY remains in an internal reset state for 40us and then resume normal operation. Account for that delay to avoid any issues in the future if genphy_resume() changes. Fixes: fe26821fa614 ("net: phy: broadcom: Wire suspend/resume for BCM54810") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index fa0be591ae79..ad51f1889435 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -399,6 +399,11 @@ static int bcm54xx_resume(struct phy_device *phydev) if (ret < 0) return ret; + /* Upon exiting power down, the PHY remains in an internal reset state + * for 40us + */ + fsleep(40); + return bcm54xx_config_init(phydev); } From 93bde210c4341e79f0cd9cb160d889f4577e40b1 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 11 Mar 2021 16:42:05 +0200 Subject: [PATCH 047/171] sch_htb: Fix select_queue for non-offload mode htb_select_queue assumes it's always the offload mode, and it ends up in calling ndo_setup_tc without any checks. It may lead to a NULL pointer dereference if ndo_setup_tc is not implemented, or to an error returned from the driver, which will prevent attaching qdiscs to HTB classes in the non-offload mode. This commit fixes the bug by adding the missing check to htb_select_queue. In the non-offload mode it will return sch->dev_queue, mimicking tc_modify_qdisc's behavior for the case where select_queue is not implemented. Reported-by: syzbot+b53a709f04722ca12a3c@syzkaller.appspotmail.com Fixes: d03b195b5aa0 ("sch_htb: Hierarchical QoS hardware offload") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index dff3adf5a915..b23203159996 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1340,8 +1340,12 @@ htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { struct net_device *dev = qdisc_dev(sch); struct tc_htb_qopt_offload offload_opt; + struct htb_sched *q = qdisc_priv(sch); int err; + if (!q->offload) + return sch->dev_queue; + offload_opt = (struct tc_htb_qopt_offload) { .command = TC_HTB_LEAF_QUERY_QUEUE, .classid = TC_H_MIN(tcm->tcm_parent), From fb3a3e37de337ec2941c71ff0bcb83e701f3c9f4 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 11 Mar 2021 16:42:06 +0200 Subject: [PATCH 048/171] sch_htb: Fix offload cleanup in htb_destroy on htb_init failure htb_init may fail to do the offload if it's not supported or if a runtime error happens when allocating direct qdiscs. In those cases TC_HTB_CREATE command is not sent to the driver, however, htb_destroy gets called anyway and attempts to send TC_HTB_DESTROY. It shouldn't happen, because the driver didn't receive TC_HTB_CREATE, and also because the driver may not support ndo_setup_tc at all, while q->offload is true, and htb_destroy mistakenly thinks the offload is supported. Trying to call ndo_setup_tc in the latter case will lead to a NULL pointer dereference. This commit fixes the issues with htb_destroy by deferring assignment of q->offload until after the TC_HTB_CREATE command. The necessary cleanup of the offload entities is already done in htb_init. Reported-by: syzbot+b53a709f04722ca12a3c@syzkaller.appspotmail.com Fixes: d03b195b5aa0 ("sch_htb: Hierarchical QoS hardware offload") Suggested-by: Eric Dumazet Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index b23203159996..62e12cb41a3e 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1020,6 +1020,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; unsigned int ntx; + bool offload; int err; qdisc_watchdog_init(&q->watchdog, sch); @@ -1044,9 +1045,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, if (gopt->version != HTB_VER >> 16) return -EINVAL; - q->offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]); + offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]); - if (q->offload) { + if (offload) { if (sch->parent != TC_H_ROOT) return -EOPNOTSUPP; @@ -1076,7 +1077,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, q->rate2quantum = 1; q->defcls = gopt->defcls; - if (!q->offload) + if (!offload) return 0; for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) { @@ -1107,12 +1108,14 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, if (err) goto err_free_qdiscs; + /* Defer this assignment, so that htb_destroy skips offload-related + * parts (especially calling ndo_setup_tc) on errors. + */ + q->offload = true; + return 0; err_free_qdiscs: - /* TC_HTB_CREATE call failed, avoid any further calls to the driver. */ - q->offload = false; - for (ntx = 0; ntx < q->num_direct_qdiscs && q->direct_qdiscs[ntx]; ntx++) qdisc_put(q->direct_qdiscs[ntx]); From ed0907e3bdcfc7fe1c1756a480451e757b207a69 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 5 Feb 2021 10:09:04 +0100 Subject: [PATCH 049/171] ice: fix napi work done reporting in xsk path Fix the wrong napi work done reporting in the xsk path of the ice driver. The code in the main Rx processing loop was written to assume that the buffer allocation code returns true if all allocations where successful and false if not. In contrast with all other Intel NIC xsk drivers, the ice_alloc_rx_bufs_zc() has the inverted logic messing up the work done reporting in the napi loop. This can be fixed either by inverting the return value from ice_alloc_rx_bufs_zc() in the function that uses this in an incorrect way, or by changing the return value of ice_alloc_rx_bufs_zc(). We chose the latter as it makes all the xsk allocation functions for Intel NICs behave in the same way. My guess is that it was this unexpected discrepancy that gave rise to this bug in the first place. Fixes: 5bb0c4b5eb61 ("ice, xsk: Move Rx allocation out of while-loop") Reported-by: Maciej Fijalkowski Signed-off-by: Magnus Karlsson Tested-by: Kiran Bhandare Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 6 ++++-- drivers/net/ethernet/intel/ice/ice_xsk.c | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 3124a3bf519a..952e41a1e001 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -418,6 +418,8 @@ int ice_setup_rx_ctx(struct ice_ring *ring) writel(0, ring->tail); if (ring->xsk_pool) { + bool ok; + if (!xsk_buff_can_alloc(ring->xsk_pool, num_bufs)) { dev_warn(dev, "XSK buffer pool does not provide enough addresses to fill %d buffers on Rx ring %d\n", num_bufs, ring->q_index); @@ -426,8 +428,8 @@ int ice_setup_rx_ctx(struct ice_ring *ring) return 0; } - err = ice_alloc_rx_bufs_zc(ring, num_bufs); - if (err) + ok = ice_alloc_rx_bufs_zc(ring, num_bufs); + if (!ok) dev_info(dev, "Failed to allocate some buffers on XSK buffer pool enabled Rx ring %d (pf_q %d)\n", ring->q_index, pf_q); return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 83f3c9574ed1..9f94d9159acd 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -358,18 +358,18 @@ xsk_pool_if_up: * This function allocates a number of Rx buffers from the fill ring * or the internal recycle mechanism and places them on the Rx ring. * - * Returns false if all allocations were successful, true if any fail. + * Returns true if all allocations were successful, false if any fail. */ bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count) { union ice_32b_rx_flex_desc *rx_desc; u16 ntu = rx_ring->next_to_use; struct ice_rx_buf *rx_buf; - bool ret = false; + bool ok = true; dma_addr_t dma; if (!count) - return false; + return true; rx_desc = ICE_RX_DESC(rx_ring, ntu); rx_buf = &rx_ring->rx_buf[ntu]; @@ -377,7 +377,7 @@ bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count) do { rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_pool); if (!rx_buf->xdp) { - ret = true; + ok = false; break; } @@ -402,7 +402,7 @@ bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count) ice_release_rx_desc(rx_ring, ntu); } - return ret; + return ok; } /** From a86606268ec0c809f341cda3771ae53460e064ab Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 3 Mar 2021 16:39:26 +0100 Subject: [PATCH 050/171] i40e: move headroom initialization to i40e_configure_rx_ring i40e_rx_offset(), that is supposed to initialize the Rx buffer headroom, relies on I40E_RXR_FLAGS_BUILD_SKB_ENABLED flag. Currently, the callsite of mentioned function is placed incorrectly within i40e_setup_rx_descriptors() where Rx ring's build skb flag is not set yet. This causes the XDP_REDIRECT to be partially broken due to inability to create xdp_frame in the headroom space, as the headroom is 0. For the record, below is the call graph: i40e_vsi_open i40e_vsi_setup_rx_resources i40e_setup_rx_descriptors i40e_rx_offset() <-- sets offset to 0 as build_skb flag is set below i40e_vsi_configure_rx i40e_configure_rx_ring set_ring_build_skb_enabled(ring) <-- set build_skb flag Fix this by moving i40e_rx_offset() to i40e_configure_rx_ring() after the flag setting. Fixes: f7bb0d71d658 ("i40e: store the result of i40e_rx_offset() onto i40e_ring") Reported-by: Jesper Dangaard Brouer Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Maciej Fijalkowski Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Kiran Bhandare Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 13 +++++++++++++ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 12 ------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 353deae139f9..17f3b800640e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3258,6 +3258,17 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring) return 0; } +/** + * i40e_rx_offset - Return expected offset into page to access data + * @rx_ring: Ring we are requesting offset of + * + * Returns the offset value for ring into the data buffer. + */ +static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring) +{ + return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0; +} + /** * i40e_configure_rx_ring - Configure a receive ring context * @ring: The Rx ring to configure @@ -3369,6 +3380,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) else set_ring_build_skb_enabled(ring); + ring->rx_offset = i40e_rx_offset(ring); + /* cache tail for quicker writes, and clear the reg before use */ ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q); writel(0, ring->tail); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 627794b31e33..5747a99122fb 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1569,17 +1569,6 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring) } } -/** - * i40e_rx_offset - Return expected offset into page to access data - * @rx_ring: Ring we are requesting offset of - * - * Returns the offset value for ring into the data buffer. - */ -static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring) -{ - return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0; -} - /** * i40e_setup_rx_descriptors - Allocate Rx descriptors * @rx_ring: Rx descriptor ring (for a specific queue) to setup @@ -1608,7 +1597,6 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; - rx_ring->rx_offset = i40e_rx_offset(rx_ring); /* XDP RX-queue info only needed for RX rings exposed to XDP */ if (rx_ring->vsi->type == I40E_VSI_MAIN) { From 89861c485c6a384e298fb78660d6a773339e42b1 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 3 Mar 2021 16:39:27 +0100 Subject: [PATCH 051/171] ice: move headroom initialization to ice_setup_rx_ctx ice_rx_offset(), that is supposed to initialize the Rx buffer headroom, relies on ICE_RX_FLAGS_RING_BUILD_SKB flag as well as XDP prog presence. Currently, the callsite of mentioned function is placed incorrectly within ice_setup_rx_ring() where Rx ring's build skb flag is not set yet. This causes the XDP_REDIRECT to be partially broken due to inability to create xdp_frame in the headroom space, as the headroom is 0. Fix this by moving ice_rx_offset() to ice_setup_rx_ctx() after the flag setting. Fixes: f1b1f409bf79 ("ice: store the result of ice_rx_offset() onto ice_ring") Signed-off-by: Maciej Fijalkowski Tested-by: Kiran Bhandare Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 18 ++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_txrx.c | 17 ----------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 952e41a1e001..1148d768f8ed 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -274,6 +274,22 @@ ice_setup_tx_ctx(struct ice_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q) tlan_ctx->legacy_int = ICE_TX_LEGACY; } +/** + * ice_rx_offset - Return expected offset into page to access data + * @rx_ring: Ring we are requesting offset of + * + * Returns the offset value for ring into the data buffer. + */ +static unsigned int ice_rx_offset(struct ice_ring *rx_ring) +{ + if (ice_ring_uses_build_skb(rx_ring)) + return ICE_SKB_PAD; + else if (ice_is_xdp_ena_vsi(rx_ring->vsi)) + return XDP_PACKET_HEADROOM; + + return 0; +} + /** * ice_setup_rx_ctx - Configure a receive ring context * @ring: The Rx ring to configure @@ -413,6 +429,8 @@ int ice_setup_rx_ctx(struct ice_ring *ring) else ice_set_ring_build_skb_ena(ring); + ring->rx_offset = ice_rx_offset(ring); + /* init queue specific tail register */ ring->tail = hw->hw_addr + QRX_TAIL(pf_q); writel(0, ring->tail); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index b7dc25da1202..b91dcfd12727 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -443,22 +443,6 @@ void ice_free_rx_ring(struct ice_ring *rx_ring) } } -/** - * ice_rx_offset - Return expected offset into page to access data - * @rx_ring: Ring we are requesting offset of - * - * Returns the offset value for ring into the data buffer. - */ -static unsigned int ice_rx_offset(struct ice_ring *rx_ring) -{ - if (ice_ring_uses_build_skb(rx_ring)) - return ICE_SKB_PAD; - else if (ice_is_xdp_ena_vsi(rx_ring->vsi)) - return XDP_PACKET_HEADROOM; - - return 0; -} - /** * ice_setup_rx_ring - Allocate the Rx descriptors * @rx_ring: the Rx ring to set up @@ -493,7 +477,6 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring) rx_ring->next_to_use = 0; rx_ring->next_to_clean = 0; - rx_ring->rx_offset = ice_rx_offset(rx_ring); if (ice_is_xdp_ena_vsi(rx_ring->vsi)) WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog); From 76064573b121a376fe54a2799ee6b5bb91632a1f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 3 Mar 2021 16:39:28 +0100 Subject: [PATCH 052/171] ixgbe: move headroom initialization to ixgbe_configure_rx_ring ixgbe_rx_offset(), that is supposed to initialize the Rx buffer headroom, relies on __IXGBE_RX_BUILD_SKB_ENABLED flag. Currently, the callsite of mentioned function is placed incorrectly within ixgbe_setup_rx_resources() where Rx ring's build skb flag is not set yet. This causes the XDP_REDIRECT to be partially broken due to inability to create xdp_frame in the headroom space, as the headroom is 0. Fix this by moving ixgbe_rx_offset() to ixgbe_configure_rx_ring() after the flag setting, which happens to be set in ixgbe_set_rx_buffer_len. Fixes: c0d4e9d223c5 ("ixgbe: store the result of ixgbe_rx_offset() onto ixgbe_ring") Signed-off-by: Maciej Fijalkowski Tested-by: Vishakha Jambekar Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9f3f12e2ccf2..03d9aad516d4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -4118,6 +4118,8 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, #endif } + ring->rx_offset = ixgbe_rx_offset(ring); + if (ring->xsk_pool && hw->mac.type != ixgbe_mac_82599EB) { u32 xsk_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); @@ -6578,7 +6580,6 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; - rx_ring->rx_offset = ixgbe_rx_offset(rx_ring); /* XDP RX-queue info */ if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, From 98dfb02aa22280bd8833836d1b00ab0488fa951f Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 21 Jan 2021 13:54:23 -0800 Subject: [PATCH 053/171] igb: avoid premature Rx buffer reuse Igb needs a similar fix as commit 75aab4e10ae6a ("i40e: avoid premature Rx buffer reuse") The page recycle code, incorrectly, relied on that a page fragment could not be freed inside xdp_do_redirect(). This assumption leads to that page fragments that are used by the stack/XDP redirect can be reused and overwritten. To avoid this, store the page count prior invoking xdp_do_redirect(). Longer explanation: Intel NICs have a recycle mechanism. The main idea is that a page is split into two parts. One part is owned by the driver, one part might be owned by someone else, such as the stack. t0: Page is allocated, and put on the Rx ring +--------------- used by NIC ->| upper buffer (rx_buffer) +--------------- | lower buffer +--------------- page count == USHRT_MAX rx_buffer->pagecnt_bias == USHRT_MAX t1: Buffer is received, and passed to the stack (e.g.) +--------------- | upper buff (skb) +--------------- used by NIC ->| lower buffer (rx_buffer) +--------------- page count == USHRT_MAX rx_buffer->pagecnt_bias == USHRT_MAX - 1 t2: Buffer is received, and redirected +--------------- | upper buff (skb) +--------------- used by NIC ->| lower buffer (rx_buffer) +--------------- Now, prior calling xdp_do_redirect(): page count == USHRT_MAX rx_buffer->pagecnt_bias == USHRT_MAX - 2 This means that buffer *cannot* be flipped/reused, because the skb is still using it. The problem arises when xdp_do_redirect() actually frees the segment. Then we get: page count == USHRT_MAX - 1 rx_buffer->pagecnt_bias == USHRT_MAX - 2 From a recycle perspective, the buffer can be flipped and reused, which means that the skb data area is passed to the Rx HW ring! To work around this, the page count is stored prior calling xdp_do_redirect(). Fixes: 9cbc948b5a20 ("igb: add XDP support") Signed-off-by: Li RongQing Reviewed-by: Alexander Duyck Tested-by: Vishakha Jambekar Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 878b31d534ec..d3b37761f637 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8214,7 +8214,8 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring, new_buff->pagecnt_bias = old_buff->pagecnt_bias; } -static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer) +static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, + int rx_buf_pgcnt) { unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; struct page *page = rx_buffer->page; @@ -8225,7 +8226,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer) #if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ - if (unlikely((page_ref_count(page) - pagecnt_bias) > 1)) + if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1)) return false; #else #define IGB_LAST_OFFSET \ @@ -8614,11 +8615,17 @@ static unsigned int igb_rx_offset(struct igb_ring *rx_ring) } static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring, - const unsigned int size) + const unsigned int size, int *rx_buf_pgcnt) { struct igb_rx_buffer *rx_buffer; rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; + *rx_buf_pgcnt = +#if (PAGE_SIZE < 8192) + page_count(rx_buffer->page); +#else + 0; +#endif prefetchw(rx_buffer->page); /* we are reusing so sync this buffer for CPU use */ @@ -8634,9 +8641,9 @@ static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring, } static void igb_put_rx_buffer(struct igb_ring *rx_ring, - struct igb_rx_buffer *rx_buffer) + struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt) { - if (igb_can_reuse_rx_page(rx_buffer)) { + if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) { /* hand second half of page back to the ring */ igb_reuse_rx_page(rx_ring, rx_buffer); } else { @@ -8664,6 +8671,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) unsigned int xdp_xmit = 0; struct xdp_buff xdp; u32 frame_sz = 0; + int rx_buf_pgcnt; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) @@ -8693,7 +8701,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) */ dma_rmb(); - rx_buffer = igb_get_rx_buffer(rx_ring, size); + rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt); /* retrieve a buffer from the ring */ if (!skb) { @@ -8736,7 +8744,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) break; } - igb_put_rx_buffer(rx_ring, rx_buffer); + igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt); cleaned_count++; /* fetch next buffer in frame if non-eop */ From 080bfa1e6d928a5d1f185cc44e5f3c251df06df5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 12 Mar 2021 12:15:03 -0800 Subject: [PATCH 054/171] Revert "net: bonding: fix error return code of bond_neigh_init()" This reverts commit 2055a99da8a253a357bdfd359b3338ef3375a26c. This change rejects legitimate configurations. A slave doesn't need to exist nor implement ndo_slave_setup. Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 456315bef3a8..74cbbb22470b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3978,15 +3978,11 @@ static int bond_neigh_init(struct neighbour *n) rcu_read_lock(); slave = bond_first_slave_rcu(bond); - if (!slave) { - ret = -EINVAL; + if (!slave) goto out; - } slave_ops = slave->dev->netdev_ops; - if (!slave_ops->ndo_neigh_setup) { - ret = -EINVAL; + if (!slave_ops->ndo_neigh_setup) goto out; - } /* TODO: find another way [1] to implement this. * Passing a zeroed structure is fragile, From f211ac154577ec9ccf07c15f18a6abf0d9bdb4ab Mon Sep 17 00:00:00 2001 From: liuyacan Date: Fri, 12 Mar 2021 00:32:25 +0800 Subject: [PATCH 055/171] net: correct sk_acceptq_is_full() The "backlog" argument in listen() specifies the maximom length of pending connections, so the accept queue should be considered full if there are exactly "backlog" elements. Signed-off-by: liuyacan Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/sock.h b/include/net/sock.h index 636810ddcd9b..0b6266fd6bf6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -936,7 +936,7 @@ static inline void sk_acceptq_added(struct sock *sk) static inline bool sk_acceptq_is_full(const struct sock *sk) { - return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); + return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); } /* From 59cd4f19267a0aab87a8c07e4426eb7187ee548d Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Thu, 11 Mar 2021 14:05:18 -0600 Subject: [PATCH 056/171] net: axienet: Fix probe error cleanup The driver did not always clean up all allocated resources when probe failed. Fix the probe cleanup path to clean up everything that was allocated. Fixes: 57baf8cc70ea ("net: axienet: Handle deferred probe on clock properly") Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- .../net/ethernet/xilinx/xilinx_axienet_main.c | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 3a8775e0ca55..5d677db0aee5 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1880,7 +1880,7 @@ static int axienet_probe(struct platform_device *pdev) if (IS_ERR(lp->regs)) { dev_err(&pdev->dev, "could not map Axi Ethernet regs.\n"); ret = PTR_ERR(lp->regs); - goto free_netdev; + goto cleanup_clk; } lp->regs_start = ethres->start; @@ -1958,18 +1958,18 @@ static int axienet_probe(struct platform_device *pdev) break; default: ret = -EINVAL; - goto free_netdev; + goto cleanup_clk; } } else { ret = of_get_phy_mode(pdev->dev.of_node, &lp->phy_mode); if (ret) - goto free_netdev; + goto cleanup_clk; } if (lp->switch_x_sgmii && lp->phy_mode != PHY_INTERFACE_MODE_SGMII && lp->phy_mode != PHY_INTERFACE_MODE_1000BASEX) { dev_err(&pdev->dev, "xlnx,switch-x-sgmii only supported with SGMII or 1000BaseX\n"); ret = -EINVAL; - goto free_netdev; + goto cleanup_clk; } /* Find the DMA node, map the DMA registers, and decode the DMA IRQs */ @@ -1982,7 +1982,7 @@ static int axienet_probe(struct platform_device *pdev) dev_err(&pdev->dev, "unable to get DMA resource\n"); of_node_put(np); - goto free_netdev; + goto cleanup_clk; } lp->dma_regs = devm_ioremap_resource(&pdev->dev, &dmares); @@ -2002,12 +2002,12 @@ static int axienet_probe(struct platform_device *pdev) if (IS_ERR(lp->dma_regs)) { dev_err(&pdev->dev, "could not map DMA regs\n"); ret = PTR_ERR(lp->dma_regs); - goto free_netdev; + goto cleanup_clk; } if ((lp->rx_irq <= 0) || (lp->tx_irq <= 0)) { dev_err(&pdev->dev, "could not determine irqs\n"); ret = -ENOMEM; - goto free_netdev; + goto cleanup_clk; } /* Autodetect the need for 64-bit DMA pointers. @@ -2037,7 +2037,7 @@ static int axienet_probe(struct platform_device *pdev) ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(addr_width)); if (ret) { dev_err(&pdev->dev, "No suitable DMA available\n"); - goto free_netdev; + goto cleanup_clk; } /* Check for Ethernet core IRQ (optional) */ @@ -2068,12 +2068,12 @@ static int axienet_probe(struct platform_device *pdev) if (!lp->phy_node) { dev_err(&pdev->dev, "phy-handle required for 1000BaseX/SGMII\n"); ret = -EINVAL; - goto free_netdev; + goto cleanup_mdio; } lp->pcs_phy = of_mdio_find_device(lp->phy_node); if (!lp->pcs_phy) { ret = -EPROBE_DEFER; - goto free_netdev; + goto cleanup_mdio; } lp->phylink_config.pcs_poll = true; } @@ -2087,17 +2087,30 @@ static int axienet_probe(struct platform_device *pdev) if (IS_ERR(lp->phylink)) { ret = PTR_ERR(lp->phylink); dev_err(&pdev->dev, "phylink_create error (%i)\n", ret); - goto free_netdev; + goto cleanup_mdio; } ret = register_netdev(lp->ndev); if (ret) { dev_err(lp->dev, "register_netdev() error (%i)\n", ret); - goto free_netdev; + goto cleanup_phylink; } return 0; +cleanup_phylink: + phylink_destroy(lp->phylink); + +cleanup_mdio: + if (lp->pcs_phy) + put_device(&lp->pcs_phy->dev); + if (lp->mii_bus) + axienet_mdio_teardown(lp); + of_node_put(lp->phy_node); + +cleanup_clk: + clk_disable_unprepare(lp->clk); + free_netdev: free_netdev(ndev); From 6897087323a2fde46df32917462750c069668b2f Mon Sep 17 00:00:00 2001 From: Dylan Hung Date: Fri, 12 Mar 2021 11:04:05 +1030 Subject: [PATCH 057/171] ftgmac100: Restart MAC HW once The interrupt handler may set the flag to reset the mac in the future, but that flag is not cleared once the reset has occurred. Fixes: 10cbd6407609 ("ftgmac100: Rework NAPI & interrupts handling") Signed-off-by: Dylan Hung Acked-by: Benjamin Herrenschmidt Reviewed-by: Joel Stanley Signed-off-by: Joel Stanley Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 88bfe2107938..04421aec2dfd 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1337,6 +1337,7 @@ static int ftgmac100_poll(struct napi_struct *napi, int budget) */ if (unlikely(priv->need_mac_restart)) { ftgmac100_start_hw(priv); + priv->need_mac_restart = false; /* Re-enable "bad" interrupts */ iowrite32(FTGMAC100_INT_BAD, From b1dd9bf688b0dcc5a34dca660de46c7570bd9243 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 11 Mar 2021 16:52:50 -0800 Subject: [PATCH 058/171] net: phy: broadcom: Fix RGMII delays for BCM50160 and BCM50610M The PHY driver entry for BCM50160 and BCM50610M calls bcm54xx_config_init() but does not call bcm54xx_config_clock_delay() in order to configuration appropriate clock delays on the PHY, fix that. Fixes: 733336262b28 ("net: phy: Allow BCM5481x PHYs to setup internal TX/RX clock delay") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index ad51f1889435..82fe5f43f0e9 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -342,6 +342,10 @@ static int bcm54xx_config_init(struct phy_device *phydev) bcm54xx_adjust_rxrefclk(phydev); switch (BRCM_PHY_MODEL(phydev)) { + case PHY_ID_BCM50610: + case PHY_ID_BCM50610M: + err = bcm54xx_config_clock_delay(phydev); + break; case PHY_ID_BCM54210E: err = bcm54210e_config_init(phydev); break; From 2e5de7e0c8d2caa860e133ef71fc94671cb8e0bf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 12 Mar 2021 10:41:12 +0300 Subject: [PATCH 059/171] mptcp: fix bit MPTCP_PUSH_PENDING tests The MPTCP_PUSH_PENDING define is 6 and these tests should be testing if BIT(6) is set. Fixes: c2e6048fa1cf ("mptcp: fix race in release_cb") Signed-off-by: Dan Carpenter Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 76958570ae7f..1590b9d4cde2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2968,7 +2968,7 @@ static void mptcp_release_cb(struct sock *sk) for (;;) { flags = 0; if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) - flags |= MPTCP_PUSH_PENDING; + flags |= BIT(MPTCP_PUSH_PENDING); if (!flags) break; @@ -2981,7 +2981,7 @@ static void mptcp_release_cb(struct sock *sk) */ spin_unlock_bh(&sk->sk_lock.slock); - if (flags & MPTCP_PUSH_PENDING) + if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); cond_resched(); From c3b8e07909dbe67b0d580416c1a5257643a73be7 Mon Sep 17 00:00:00 2001 From: Ilya Lipnitskiy Date: Fri, 12 Mar 2021 00:07:03 -0800 Subject: [PATCH 060/171] net: dsa: mt7530: setup core clock even in TRGMII mode A recent change to MIPS ralink reset logic made it so mt7530 actually resets the switch on platforms such as mt7621 (where bit 2 is the reset line for the switch). That exposed an issue where the switch would not function properly in TRGMII mode after a reset. Reconfigure core clock in TRGMII mode to fix the issue. Tested on Ubiquiti ER-X (MT7621) with TRGMII mode enabled. Fixes: 3f9ef7785a9c ("MIPS: ralink: manage low reset lines") Signed-off-by: Ilya Lipnitskiy Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 46 +++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index f06f5fa2f898..9871d7cff93a 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -436,34 +436,32 @@ mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface) TD_DM_DRVP(8) | TD_DM_DRVN(8)); /* Setup core clock for MT7530 */ - if (!trgint) { - /* Disable MT7530 core clock */ - core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN); + /* Disable MT7530 core clock */ + core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN); - /* Disable PLL, since phy_device has not yet been created - * provided for phy_[read,write]_mmd_indirect is called, we - * provide our own core_write_mmd_indirect to complete this - * function. - */ - core_write_mmd_indirect(priv, - CORE_GSWPLL_GRP1, - MDIO_MMD_VEND2, - 0); + /* Disable PLL, since phy_device has not yet been created + * provided for phy_[read,write]_mmd_indirect is called, we + * provide our own core_write_mmd_indirect to complete this + * function. + */ + core_write_mmd_indirect(priv, + CORE_GSWPLL_GRP1, + MDIO_MMD_VEND2, + 0); - /* Set core clock into 500Mhz */ - core_write(priv, CORE_GSWPLL_GRP2, - RG_GSWPLL_POSDIV_500M(1) | - RG_GSWPLL_FBKDIV_500M(25)); + /* Set core clock into 500Mhz */ + core_write(priv, CORE_GSWPLL_GRP2, + RG_GSWPLL_POSDIV_500M(1) | + RG_GSWPLL_FBKDIV_500M(25)); - /* Enable PLL */ - core_write(priv, CORE_GSWPLL_GRP1, - RG_GSWPLL_EN_PRE | - RG_GSWPLL_POSDIV_200M(2) | - RG_GSWPLL_FBKDIV_200M(32)); + /* Enable PLL */ + core_write(priv, CORE_GSWPLL_GRP1, + RG_GSWPLL_EN_PRE | + RG_GSWPLL_POSDIV_200M(2) | + RG_GSWPLL_FBKDIV_200M(32)); - /* Enable MT7530 core clock */ - core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN); - } + /* Enable MT7530 core clock */ + core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN); /* Setup the MT7530 TRGMII Tx Clock */ core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN); From a673321aa74fc5604643d6a4653684c0bc9fa617 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 12 Mar 2021 16:43:52 -0800 Subject: [PATCH 061/171] selftests: mptcp: Restore packet capture option in join tests The join self tests previously used the '-c' command line option to enable creation of pcap files for the tests that run, but the change to allow running a subset of the join tests made overlapping use of that option. Restore the capture functionality with '-c' and move the syncookie test option to '-k'. Fixes: 1002b89f23ea ("selftests: mptcp: add command line arguments for mptcp_join.sh") Acked-and-tested-by: Geliang Tang Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- .../testing/selftests/net/mptcp/mptcp_join.sh | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 964db9ed544f..ad32240fbfda 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -11,6 +11,7 @@ ksft_skip=4 timeout=30 mptcp_connect="" capture=0 +do_all_tests=1 TEST_COUNT=0 @@ -121,12 +122,6 @@ reset_with_add_addr_timeout() -j DROP } -for arg in "$@"; do - if [ "$arg" = "-c" ]; then - capture=1 - fi -done - ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" @@ -1221,7 +1216,8 @@ usage() echo " -4 v4mapped_tests" echo " -b backup_tests" echo " -p add_addr_ports_tests" - echo " -c syncookies_tests" + echo " -k syncookies_tests" + echo " -c capture pcap files" echo " -h help" } @@ -1235,12 +1231,24 @@ make_file "$cin" "client" 1 make_file "$sin" "server" 1 trap cleanup EXIT -if [ -z $1 ]; then +for arg in "$@"; do + # check for "capture" arg before launching tests + if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then + capture=1 + fi + + # exception for the capture option, the rest means: a part of the tests + if [ "${arg}" != "-c" ]; then + do_all_tests=0 + fi +done + +if [ $do_all_tests -eq 1 ]; then all_tests exit $ret fi -while getopts 'fsltra64bpch' opt; do +while getopts 'fsltra64bpkch' opt; do case $opt in f) subflows_tests @@ -1272,9 +1280,11 @@ while getopts 'fsltra64bpch' opt; do p) add_addr_ports_tests ;; - c) + k) syncookies_tests ;; + c) + ;; h | *) usage ;; From 6afa455e6153bcbde879dd408f7ac83668b0ac4a Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Fri, 12 Mar 2021 12:45:30 -0600 Subject: [PATCH 062/171] ibmvnic: update MAINTAINERS Tom wrote most of the driver code and his experience is valuable to us. Add him as a Reviewer so that patches will be Cc'ed and reviewed by him. Signed-off-by: Lijun Pan Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f2e34ede1ab3..6acc3b839187 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8521,6 +8521,7 @@ IBM Power SRIOV Virtual NIC Device Driver M: Dany Madden M: Lijun Pan M: Sukadev Bhattiprolu +R: Thomas Falcon L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/ibm/ibmvnic.* From 3a9ef3e11c5d33e5cb355b4aad1a4caad2407541 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 12 Mar 2021 09:12:48 -0600 Subject: [PATCH 063/171] net: ipa: terminate message handler arrays When a QMI handle is initialized, an array of message handler structures is provided, defining how any received message should be handled based on its type and message ID. The QMI core code traverses this array when a message arrives and calls the function associated with the (type, msg_id) found in the array. The array is supposed to be terminated with an empty (all zero) entry though. Without it, an unsupported message will cause the QMI core code to go past the end of the array. Fix this bug, by properly terminating the message handler arrays provided when QMI handles are set up by the IPA driver. Fixes: 530f9216a9537 ("soc: qcom: ipa: AP/modem communications") Reported-by: Sujit Kautkar Signed-off-by: Alex Elder Reviewed-by: Bjorn Andersson Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_qmi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ipa/ipa_qmi.c b/drivers/net/ipa/ipa_qmi.c index 2fc64483f275..e594bf3b600f 100644 --- a/drivers/net/ipa/ipa_qmi.c +++ b/drivers/net/ipa/ipa_qmi.c @@ -249,6 +249,7 @@ static const struct qmi_msg_handler ipa_server_msg_handlers[] = { .decoded_size = IPA_QMI_DRIVER_INIT_COMPLETE_REQ_SZ, .fn = ipa_server_driver_init_complete, }, + { }, }; /* Handle an INIT_DRIVER response message from the modem. */ @@ -269,6 +270,7 @@ static const struct qmi_msg_handler ipa_client_msg_handlers[] = { .decoded_size = IPA_QMI_INIT_DRIVER_RSP_SZ, .fn = ipa_client_init_driver, }, + { }, }; /* Return a pointer to an init modem driver request structure, which contains From ad236ccde19a93309cba25fb8c9e789b9c69397c Mon Sep 17 00:00:00 2001 From: Eva Dengler Date: Sat, 13 Mar 2021 01:04:13 +0100 Subject: [PATCH 064/171] devlink: fix typo in documentation This commit fixes three spelling typos in devlink-dpipe.rst and devlink-port.rst. Signed-off-by: Eva Dengler Acked-by: Randy Dunlap Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-dpipe.rst | 2 +- Documentation/networking/devlink/devlink-port.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/devlink/devlink-dpipe.rst b/Documentation/networking/devlink/devlink-dpipe.rst index 468fe1001b74..af37f250df43 100644 --- a/Documentation/networking/devlink/devlink-dpipe.rst +++ b/Documentation/networking/devlink/devlink-dpipe.rst @@ -52,7 +52,7 @@ purposes as a standard complementary tool. The system's view from ``devlink-dpipe`` should change according to the changes done by the standard configuration tools. -For example, it’s quiet common to implement Access Control Lists (ACL) +For example, it’s quite common to implement Access Control Lists (ACL) using Ternary Content Addressable Memory (TCAM). The TCAM memory can be divided into TCAM regions. Complex TC filters can have multiple rules with different priorities and different lookup keys. On the other hand hardware diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index e99b41599465..ab790e7980b8 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -151,7 +151,7 @@ representor netdevice. ------------- A subfunction devlink port is created but it is not active yet. That means the entities are created on devlink side, the e-switch port representor is created, -but the subfunction device itself it not created. A user might use e-switch port +but the subfunction device itself is not created. A user might use e-switch port representor to do settings, putting it into bridge, adding TC rules, etc. A user might as well configure the hardware address (such as MAC address) of the subfunction while subfunction is inactive. @@ -173,7 +173,7 @@ Terms and Definitions * - Term - Definitions * - ``PCI device`` - - A physical PCI device having one or more PCI bus consists of one or + - A physical PCI device having one or more PCI buses consists of one or more PCI controllers. * - ``PCI controller`` - A controller consists of potentially multiple physical functions, From 6577b9a551aedb86bca6d4438c28386361845108 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 14 Mar 2021 14:08:36 -0400 Subject: [PATCH 065/171] net: arcnet: com20020 fix error handling There are two issues when handling error case in com20020pci_probe() 1. priv might be not initialized yet when calling com20020pci_remove() from com20020pci_probe(), since the priv is set at the very last but it can jump to error handling in the middle and priv remains NULL. 2. memory leak - the net device is allocated in alloc_arcdev but not properly released if error happens in the middle of the big for loop [ 1.529110] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 1.531447] RIP: 0010:com20020pci_remove+0x15/0x60 [com20020_pci] [ 1.536805] Call Trace: [ 1.536939] com20020pci_probe+0x3f2/0x48c [com20020_pci] [ 1.537226] local_pci_probe+0x48/0x80 [ 1.539918] com20020pci_init+0x3f/0x1000 [com20020_pci] Signed-off-by: Tong Zhang Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020-pci.c | 34 +++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 8bdc44b7e09a..3c8f665c1558 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -127,6 +127,8 @@ static int com20020pci_probe(struct pci_dev *pdev, int i, ioaddr, ret; struct resource *r; + ret = 0; + if (pci_enable_device(pdev)) return -EIO; @@ -139,6 +141,8 @@ static int com20020pci_probe(struct pci_dev *pdev, priv->ci = ci; mm = &ci->misc_map; + pci_set_drvdata(pdev, priv); + INIT_LIST_HEAD(&priv->list_dev); if (mm->size) { @@ -161,7 +165,7 @@ static int com20020pci_probe(struct pci_dev *pdev, dev = alloc_arcdev(device); if (!dev) { ret = -ENOMEM; - goto out_port; + break; } dev->dev_port = i; @@ -178,7 +182,7 @@ static int com20020pci_probe(struct pci_dev *pdev, pr_err("IO region %xh-%xh already allocated\n", ioaddr, ioaddr + cm->size - 1); ret = -EBUSY; - goto out_port; + goto err_free_arcdev; } /* Dummy access after Reset @@ -216,18 +220,18 @@ static int com20020pci_probe(struct pci_dev *pdev, if (arcnet_inb(ioaddr, COM20020_REG_R_STATUS) == 0xFF) { pr_err("IO address %Xh is empty!\n", ioaddr); ret = -EIO; - goto out_port; + goto err_free_arcdev; } if (com20020_check(dev)) { ret = -EIO; - goto out_port; + goto err_free_arcdev; } card = devm_kzalloc(&pdev->dev, sizeof(struct com20020_dev), GFP_KERNEL); if (!card) { ret = -ENOMEM; - goto out_port; + goto err_free_arcdev; } card->index = i; @@ -253,29 +257,29 @@ static int com20020pci_probe(struct pci_dev *pdev, ret = devm_led_classdev_register(&pdev->dev, &card->tx_led); if (ret) - goto out_port; + goto err_free_arcdev; ret = devm_led_classdev_register(&pdev->dev, &card->recon_led); if (ret) - goto out_port; + goto err_free_arcdev; dev_set_drvdata(&dev->dev, card); ret = com20020_found(dev, IRQF_SHARED); if (ret) - goto out_port; + goto err_free_arcdev; devm_arcnet_led_init(dev, dev->dev_id, i); list_add(&card->list, &priv->list_dev); + continue; + +err_free_arcdev: + free_arcdev(dev); + break; } - - pci_set_drvdata(pdev, priv); - - return 0; - -out_port: - com20020pci_remove(pdev); + if (ret) + com20020pci_remove(pdev); return ret; } From 50535249f624d0072cd885bcdce4e4b6fb770160 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Mar 2021 08:59:48 -0800 Subject: [PATCH 066/171] net: qrtr: fix a kernel-infoleak in qrtr_recvmsg() struct sockaddr_qrtr has a 2-byte hole, and qrtr_recvmsg() currently does not clear it before copying kernel data to user space. It might be too late to name the hole since sockaddr_qrtr structure is uapi. BUG: KMSAN: kernel-infoleak in kmsan_copy_to_user+0x9c/0xb0 mm/kmsan/kmsan_hooks.c:249 CPU: 0 PID: 29705 Comm: syz-executor.3 Not tainted 5.11.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:120 kmsan_report+0xfb/0x1e0 mm/kmsan/kmsan_report.c:118 kmsan_internal_check_memory+0x202/0x520 mm/kmsan/kmsan.c:402 kmsan_copy_to_user+0x9c/0xb0 mm/kmsan/kmsan_hooks.c:249 instrument_copy_to_user include/linux/instrumented.h:121 [inline] _copy_to_user+0x1ac/0x270 lib/usercopy.c:33 copy_to_user include/linux/uaccess.h:209 [inline] move_addr_to_user+0x3a2/0x640 net/socket.c:237 ____sys_recvmsg+0x696/0xd50 net/socket.c:2575 ___sys_recvmsg net/socket.c:2610 [inline] do_recvmmsg+0xa97/0x22d0 net/socket.c:2710 __sys_recvmmsg net/socket.c:2789 [inline] __do_sys_recvmmsg net/socket.c:2812 [inline] __se_sys_recvmmsg+0x24a/0x410 net/socket.c:2805 __x64_sys_recvmmsg+0x62/0x80 net/socket.c:2805 do_syscall_64+0x9f/0x140 arch/x86/entry/common.c:48 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x465f69 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f43659d6188 EFLAGS: 00000246 ORIG_RAX: 000000000000012b RAX: ffffffffffffffda RBX: 000000000056bf60 RCX: 0000000000465f69 RDX: 0000000000000008 RSI: 0000000020003e40 RDI: 0000000000000003 RBP: 00000000004bfa8f R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000010060 R11: 0000000000000246 R12: 000000000056bf60 R13: 0000000000a9fb1f R14: 00007f43659d6300 R15: 0000000000022000 Local variable ----addr@____sys_recvmsg created at: ____sys_recvmsg+0x168/0xd50 net/socket.c:2550 ____sys_recvmsg+0x168/0xd50 net/socket.c:2550 Bytes 2-3 of 12 are uninitialized Memory access of size 12 starts at ffff88817c627b40 Data copied to user address 0000000020000140 Fixes: bdabad3e363d ("net: Add Qualcomm IPC router") Signed-off-by: Eric Dumazet Cc: Courtney Cavin Reported-by: syzbot Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index edb6ac17ceca..dfc820ee553a 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1058,6 +1058,11 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, rc = copied; if (addr) { + /* There is an anonymous 2-byte hole after sq_family, + * make sure to clear it. + */ + memset(addr, 0, sizeof(*addr)); + addr->sq_family = AF_QIPCRTR; addr->sq_node = cb->src_node; addr->sq_port = cb->src_port; From a25f822285420486f5da434efc8d940d42a83bce Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 12 Mar 2021 20:08:57 +0000 Subject: [PATCH 067/171] flow_dissector: fix byteorder of dissected ICMP ID flow_dissector_key_icmp::id is of type u16 (CPU byteorder), ICMP header has its ID field in network byteorder obviously. Sparse says: net/core/flow_dissector.c:178:43: warning: restricted __be16 degrades to integer Convert ID value to CPU byteorder when storing it into flow_dissector_key_icmp. Fixes: 5dec597e5cd0 ("flow_dissector: extract more ICMP information") Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2ef2224b3bff..a96a4f5de0ce 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -176,7 +176,7 @@ void skb_flow_get_icmp_tci(const struct sk_buff *skb, * avoid confusion with packets without such field */ if (icmp_has_id(ih->type)) - key_icmp->id = ih->un.echo.id ? : 1; + key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1; else key_icmp->id = 0; } From 31254dc9566221429d2cfb45fd5737985d70f2b6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 9 Mar 2021 11:22:14 +0800 Subject: [PATCH 068/171] selftests/bpf: Set gopt opt_class to 0 if get tunnel opt failed When fixing the bpf test_tunnel.sh geneve failure. I only fixed the IPv4 part but forgot the IPv6 issue. Similar with the IPv4 fixes 557c223b643a ("selftests/bpf: No need to drop the packet when there is no geneve opt"), when there is no tunnel option and bpf_skb_get_tunnel_opt() returns error, there is no need to drop the packets and break all geneve rx traffic. Just set opt_class to 0 and keep returning TC_ACT_OK at the end. Fixes: 557c223b643a ("selftests/bpf: No need to drop the packet when there is no geneve opt") Fixes: 933a741e3b82 ("selftests/bpf: bpf tunnel test.") Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: William Tu Link: https://lore.kernel.org/bpf/20210309032214.2112438-1-liuhangbin@gmail.com --- tools/testing/selftests/bpf/progs/test_tunnel_kern.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 9afe947cfae9..ba6eadfec565 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -508,10 +508,8 @@ int _ip6geneve_get_tunnel(struct __sk_buff *skb) } ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } + if (ret < 0) + gopt.opt_class = 0; bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4, gopt.opt_class); From abe7034b9a8d57737e80cc16d60ed3666990bdbf Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 8 Mar 2021 14:24:11 +1300 Subject: [PATCH 069/171] Revert "netfilter: x_tables: Update remaining dereference to RCU" This reverts commit 443d6e86f821a165fae3fc3fc13086d27ac140b1. This (and the following) patch basically re-implemented the RCU mechanisms of patch 784544739a25. That patch was replaced because of the performance problems that it created when replacing tables. Now, we have the same issue: the call to synchronize_rcu() makes replacing tables slower by as much as an order of magnitude. Revert these patches and fix the issue in a different way. Signed-off-by: Mark Tomlinson Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c576a63d09db..563b62b76a5f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1379,7 +1379,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = xt_table_get_private_protected(t); + const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e8f6f9d86237..6e2851f8d3a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1589,7 +1589,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = xt_table_get_private_protected(t); + const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 0d453fa9e327..c4f532f4d311 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1598,7 +1598,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = xt_table_get_private_protected(t); + const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) From d3d40f237480abf3268956daf18cdc56edd32834 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 8 Mar 2021 14:24:12 +1300 Subject: [PATCH 070/171] Revert "netfilter: x_tables: Switch synchronization to RCU" This reverts commit cc00bcaa589914096edef7fb87ca5cee4a166b5c. This (and the preceding) patch basically re-implemented the RCU mechanisms of patch 784544739a25. That patch was replaced because of the performance problems that it created when replacing tables. Now, we have the same issue: the call to synchronize_rcu() makes replacing tables slower by as much as an order of magnitude. Prior to using RCU a script calling "iptables" approx. 200 times was taking 1.16s. With RCU this increased to 11.59s. Revert these patches and fix the issue in a different way. Signed-off-by: Mark Tomlinson Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 5 +-- net/ipv4/netfilter/arp_tables.c | 14 ++++----- net/ipv4/netfilter/ip_tables.c | 14 ++++----- net/ipv6/netfilter/ip6_tables.c | 14 ++++----- net/netfilter/x_tables.c | 49 +++++++++++++++++++++--------- 5 files changed, 56 insertions(+), 40 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 8ebb64193757..5deb099d156d 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -227,7 +227,7 @@ struct xt_table { unsigned int valid_hooks; /* Man behind the curtain... */ - struct xt_table_info __rcu *private; + struct xt_table_info *private; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; @@ -448,9 +448,6 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); -struct xt_table_info -*xt_table_get_private_protected(const struct xt_table *table); - #ifdef CONFIG_COMPAT #include diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 563b62b76a5f..d1e04d2b5170 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = rcu_access_pointer(table->private); + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; @@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = xt_table_get_private_protected(table); + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; - struct xt_table_info *private = xt_table_get_private_protected(table); + struct xt_table_info *private = table->private; int ret = 0; void *loc_cpu_entry; @@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; - const struct xt_table_info *private = xt_table_get_private_protected(t); + const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = xt_table_get_private_protected(t); + const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = xt_table_get_private_protected(t); + private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1330,7 +1330,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = xt_table_get_private_protected(table); + const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6e2851f8d3a3..f15bc21d7301 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); - private = rcu_access_pointer(table->private); + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; @@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = xt_table_get_private_protected(table); + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = xt_table_get_private_protected(table); + const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; @@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; - const struct xt_table_info *private = xt_table_get_private_protected(t); + const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1018,7 +1018,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = xt_table_get_private_protected(t); + const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = xt_table_get_private_protected(t); + private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = xt_table_get_private_protected(table); + const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c4f532f4d311..2e2119bfcf13 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -280,7 +280,7 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = rcu_access_pointer(table->private); + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; @@ -807,7 +807,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = xt_table_get_private_protected(table); + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -831,7 +831,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = xt_table_get_private_protected(table); + const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; @@ -980,7 +980,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; - const struct xt_table_info *private = xt_table_get_private_protected(t); + const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1035,7 +1035,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - struct xt_table_info *private = xt_table_get_private_protected(t); + struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1189,7 +1189,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = xt_table_get_private_protected(t); + private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1552,7 +1552,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = xt_table_get_private_protected(table); + const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index bce6ca203d46..7df3aef39c5c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1351,14 +1351,6 @@ struct xt_counters *xt_counters_alloc(unsigned int counters) } EXPORT_SYMBOL(xt_counters_alloc); -struct xt_table_info -*xt_table_get_private_protected(const struct xt_table *table) -{ - return rcu_dereference_protected(table->private, - mutex_is_locked(&xt[table->af].mutex)); -} -EXPORT_SYMBOL(xt_table_get_private_protected); - struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, @@ -1366,6 +1358,7 @@ xt_replace_table(struct xt_table *table, int *error) { struct xt_table_info *private; + unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); @@ -1375,20 +1368,47 @@ xt_replace_table(struct xt_table *table, } /* Do the substitution. */ - private = xt_table_get_private_protected(table); + local_bh_disable(); + private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { pr_debug("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); + local_bh_enable(); *error = -EAGAIN; return NULL; } newinfo->initial_entries = private->initial_entries; + /* + * Ensure contents of newinfo are visible before assigning to + * private. + */ + smp_wmb(); + table->private = newinfo; - rcu_assign_pointer(table->private, newinfo); - synchronize_rcu(); + /* make sure all cpus see new ->private value */ + smp_wmb(); + + /* + * Even though table entries have now been swapped, other CPU's + * may still be using the old entries... + */ + local_bh_enable(); + + /* ... so wait for even xt_recseq on all cpus */ + for_each_possible_cpu(cpu) { + seqcount_t *s = &per_cpu(xt_recseq, cpu); + u32 seq = raw_read_seqcount(s); + + if (seq & 1) { + do { + cond_resched(); + cpu_relax(); + } while (seq == raw_read_seqcount(s)); + } + } audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : @@ -1424,12 +1444,12 @@ struct xt_table *xt_register_table(struct net *net, } /* Simplifies replace_table code. */ - rcu_assign_pointer(table->private, bootstrap); + table->private = bootstrap; if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; - private = xt_table_get_private_protected(table); + private = table->private; pr_debug("table->private->number = %u\n", private->number); /* save number of initial entries */ @@ -1452,8 +1472,7 @@ void *xt_unregister_table(struct xt_table *table) struct xt_table_info *private; mutex_lock(&xt[table->af].mutex); - private = xt_table_get_private_protected(table); - RCU_INIT_POINTER(table->private, NULL); + private = table->private; list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, From 175e476b8cdf2a4de7432583b49c871345e4f8a1 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 8 Mar 2021 14:24:13 +1300 Subject: [PATCH 071/171] netfilter: x_tables: Use correct memory barriers. When a new table value was assigned, it was followed by a write memory barrier. This ensured that all writes before this point would complete before any writes after this point. However, to determine whether the rules are unused, the sequence counter is read. To ensure that all writes have been done before these reads, a full memory barrier is needed, not just a write memory barrier. The same argument applies when incrementing the counter, before the rules are read. Changing to using smp_mb() instead of smp_wmb() fixes the kernel panic reported in cc00bcaa5899 (which is still present), while still maintaining the same speed of replacing tables. The smb_mb() barriers potentially slow the packet path, however testing has shown no measurable change in performance on a 4-core MIPS64 platform. Fixes: 7f5c6d4f665b ("netfilter: get rid of atomic ops in fast path") Signed-off-by: Mark Tomlinson Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/netfilter/x_tables.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 5deb099d156d..8ec48466410a 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -376,7 +376,7 @@ static inline unsigned int xt_write_recseq_begin(void) * since addend is most likely 1 */ __this_cpu_add(xt_recseq.sequence, addend); - smp_wmb(); + smp_mb(); return addend; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 7df3aef39c5c..6bd31a7a27fc 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1389,7 +1389,7 @@ xt_replace_table(struct xt_table *table, table->private = newinfo; /* make sure all cpus see new ->private value */ - smp_wmb(); + smp_mb(); /* * Even though table entries have now been swapped, other CPU's From b58f33d49e426dc66e98ed73afb5d97b15a25f2d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 15 Mar 2021 11:31:09 +0100 Subject: [PATCH 072/171] netfilter: ctnetlink: fix dump of the expect mask attribute Before this change, the mask is never included in the netlink message, so "conntrack -E expect" always prints 0.0.0.0. In older kernels the l3num callback struct was passed as argument, based on tuple->src.l3num. After the l3num indirection got removed, the call chain is based on m.src.l3num, but this value is 0xffff. Init l3num to the correct value. Fixes: f957be9d349a3 ("netfilter: conntrack: remove ctnetlink callbacks from l3 protocol trackers") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1469365bac7e..1d519b0e51a5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2962,6 +2962,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, memset(&m, 0xFF, sizeof(m)); memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3)); m.src.u.all = mask->src.u.all; + m.src.l3num = tuple->src.l3num; m.dst.protonum = tuple->dst.protonum; nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK); From bf0ffea336b493c0a8c8bc27b46683ecf1e8f294 Mon Sep 17 00:00:00 2001 From: Xie He Date: Sun, 14 Mar 2021 04:21:01 -0700 Subject: [PATCH 073/171] net: hdlc_x25: Prevent racing between "x25_close" and "x25_xmit"/"x25_rx" "x25_close" is called by "hdlc_close" in "hdlc.c", which is called by hardware drivers' "ndo_stop" function. "x25_xmit" is called by "hdlc_start_xmit" in "hdlc.c", which is hardware drivers' "ndo_start_xmit" function. "x25_rx" is called by "hdlc_rcv" in "hdlc.c", which receives HDLC frames from "net/core/dev.c". "x25_close" races with "x25_xmit" and "x25_rx" because their callers race. However, we need to ensure that the LAPB APIs called in "x25_xmit" and "x25_rx" are called before "lapb_unregister" is called in "x25_close". This patch adds locking to ensure when "x25_xmit" and "x25_rx" are doing their work, "lapb_unregister" is not yet called in "x25_close". Reasons for not solving the racing between "x25_close" and "x25_xmit" by calling "netif_tx_disable" in "x25_close": 1. We still need to solve the racing between "x25_close" and "x25_rx"; 2. The design of the HDLC subsystem assumes the HDLC hardware drivers have full control over the TX queue, and the HDLC protocol drivers (like this driver) have no control. Controlling the queue here in the protocol driver may interfere with hardware drivers' control of the queue. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xie He Signed-off-by: David S. Miller --- drivers/net/wan/hdlc_x25.c | 44 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c index 4aaa6388b9ee..5a6a945f6c81 100644 --- a/drivers/net/wan/hdlc_x25.c +++ b/drivers/net/wan/hdlc_x25.c @@ -23,6 +23,8 @@ struct x25_state { x25_hdlc_proto settings; + bool up; + spinlock_t up_lock; /* Protects "up" */ }; static int x25_ioctl(struct net_device *dev, struct ifreq *ifr); @@ -104,6 +106,8 @@ static void x25_data_transmit(struct net_device *dev, struct sk_buff *skb) static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev) { + hdlc_device *hdlc = dev_to_hdlc(dev); + struct x25_state *x25st = state(hdlc); int result; /* There should be a pseudo header of 1 byte added by upper layers. @@ -114,11 +118,19 @@ static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } + spin_lock_bh(&x25st->up_lock); + if (!x25st->up) { + spin_unlock_bh(&x25st->up_lock); + kfree_skb(skb); + return NETDEV_TX_OK; + } + switch (skb->data[0]) { case X25_IFACE_DATA: /* Data to be transmitted */ skb_pull(skb, 1); if ((result = lapb_data_request(dev, skb)) != LAPB_OK) dev_kfree_skb(skb); + spin_unlock_bh(&x25st->up_lock); return NETDEV_TX_OK; case X25_IFACE_CONNECT: @@ -147,6 +159,7 @@ static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev) break; } + spin_unlock_bh(&x25st->up_lock); dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -164,6 +177,7 @@ static int x25_open(struct net_device *dev) .data_transmit = x25_data_transmit, }; hdlc_device *hdlc = dev_to_hdlc(dev); + struct x25_state *x25st = state(hdlc); struct lapb_parms_struct params; int result; @@ -190,6 +204,10 @@ static int x25_open(struct net_device *dev) if (result != LAPB_OK) return -EINVAL; + spin_lock_bh(&x25st->up_lock); + x25st->up = true; + spin_unlock_bh(&x25st->up_lock); + return 0; } @@ -197,6 +215,13 @@ static int x25_open(struct net_device *dev) static void x25_close(struct net_device *dev) { + hdlc_device *hdlc = dev_to_hdlc(dev); + struct x25_state *x25st = state(hdlc); + + spin_lock_bh(&x25st->up_lock); + x25st->up = false; + spin_unlock_bh(&x25st->up_lock); + lapb_unregister(dev); } @@ -205,15 +230,28 @@ static void x25_close(struct net_device *dev) static int x25_rx(struct sk_buff *skb) { struct net_device *dev = skb->dev; + hdlc_device *hdlc = dev_to_hdlc(dev); + struct x25_state *x25st = state(hdlc); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { dev->stats.rx_dropped++; return NET_RX_DROP; } - if (lapb_data_received(dev, skb) == LAPB_OK) - return NET_RX_SUCCESS; + spin_lock_bh(&x25st->up_lock); + if (!x25st->up) { + spin_unlock_bh(&x25st->up_lock); + kfree_skb(skb); + dev->stats.rx_dropped++; + return NET_RX_DROP; + } + if (lapb_data_received(dev, skb) == LAPB_OK) { + spin_unlock_bh(&x25st->up_lock); + return NET_RX_SUCCESS; + } + + spin_unlock_bh(&x25st->up_lock); dev->stats.rx_errors++; dev_kfree_skb_any(skb); return NET_RX_DROP; @@ -298,6 +336,8 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr) return result; memcpy(&state(hdlc)->settings, &new_settings, size); + state(hdlc)->up = false; + spin_lock_init(&state(hdlc)->up_lock); /* There's no header_ops so hard_header_len should be 0. */ dev->hard_header_len = 0; From d82c6c1aaccd2877b6082cebcb1746a13648a16d Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Mon, 15 Mar 2021 12:33:42 +0800 Subject: [PATCH 074/171] net: phylink: Fix phylink_err() function name error in phylink_major_config if pl->mac_ops->mac_finish() failed, phylink_err should use "mac_finish" instead of "mac_prepare". Fixes: b7ad14c2fe2d4 ("net: phylink: re-implement interface configuration with PCS") Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 053c92e02cd8..dc2800beacc3 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -476,7 +476,7 @@ static void phylink_major_config(struct phylink *pl, bool restart, err = pl->mac_ops->mac_finish(pl->config, pl->cur_link_an_mode, state->interface); if (err < 0) - phylink_err(pl, "mac_prepare failed: %pe\n", + phylink_err(pl, "mac_finish failed: %pe\n", ERR_PTR(err)); } } From 0217ed2848e8538bcf9172d97ed2eeb4a26041bb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Mar 2021 03:06:58 -0700 Subject: [PATCH 075/171] tipc: better validate user input in tipc_nl_retrieve_key() Before calling tipc_aead_key_size(ptr), we need to ensure we have enough data to dereference ptr->keylen. We probably also want to make sure tipc_aead_key_size() wont overflow with malicious ptr->keylen values. Syzbot reported: BUG: KMSAN: uninit-value in __tipc_nl_node_set_key net/tipc/node.c:2971 [inline] BUG: KMSAN: uninit-value in tipc_nl_node_set_key+0x9bf/0x13b0 net/tipc/node.c:3023 CPU: 0 PID: 21060 Comm: syz-executor.5 Not tainted 5.11.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:120 kmsan_report+0xfb/0x1e0 mm/kmsan/kmsan_report.c:118 __msan_warning+0x5f/0xa0 mm/kmsan/kmsan_instr.c:197 __tipc_nl_node_set_key net/tipc/node.c:2971 [inline] tipc_nl_node_set_key+0x9bf/0x13b0 net/tipc/node.c:3023 genl_family_rcv_msg_doit net/netlink/genetlink.c:739 [inline] genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] genl_rcv_msg+0x1319/0x1610 net/netlink/genetlink.c:800 netlink_rcv_skb+0x6fa/0x810 net/netlink/af_netlink.c:2494 genl_rcv+0x63/0x80 net/netlink/genetlink.c:811 netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline] netlink_unicast+0x11d6/0x14a0 net/netlink/af_netlink.c:1330 netlink_sendmsg+0x1740/0x1840 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg net/socket.c:672 [inline] ____sys_sendmsg+0xcfc/0x12f0 net/socket.c:2345 ___sys_sendmsg net/socket.c:2399 [inline] __sys_sendmsg+0x714/0x830 net/socket.c:2432 __compat_sys_sendmsg net/compat.c:347 [inline] __do_compat_sys_sendmsg net/compat.c:354 [inline] __se_compat_sys_sendmsg+0xa7/0xc0 net/compat.c:351 __ia32_compat_sys_sendmsg+0x4a/0x70 net/compat.c:351 do_syscall_32_irqs_on arch/x86/entry/common.c:79 [inline] __do_fast_syscall_32+0x102/0x160 arch/x86/entry/common.c:141 do_fast_syscall_32+0x6a/0xc0 arch/x86/entry/common.c:166 do_SYSENTER_32+0x73/0x90 arch/x86/entry/common.c:209 entry_SYSENTER_compat_after_hwframe+0x4d/0x5c RIP: 0023:0xf7f60549 Code: 03 74 c0 01 10 05 03 74 b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d b4 26 00 00 00 00 8d b4 26 00 00 00 00 RSP: 002b:00000000f555a5fc EFLAGS: 00000296 ORIG_RAX: 0000000000000172 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000020000200 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:121 [inline] kmsan_internal_poison_shadow+0x5c/0xf0 mm/kmsan/kmsan.c:104 kmsan_slab_alloc+0x8d/0xe0 mm/kmsan/kmsan_hooks.c:76 slab_alloc_node mm/slub.c:2907 [inline] __kmalloc_node_track_caller+0xa37/0x1430 mm/slub.c:4527 __kmalloc_reserve net/core/skbuff.c:142 [inline] __alloc_skb+0x2f8/0xb30 net/core/skbuff.c:210 alloc_skb include/linux/skbuff.h:1099 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1176 [inline] netlink_sendmsg+0xdbc/0x1840 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg net/socket.c:672 [inline] ____sys_sendmsg+0xcfc/0x12f0 net/socket.c:2345 ___sys_sendmsg net/socket.c:2399 [inline] __sys_sendmsg+0x714/0x830 net/socket.c:2432 __compat_sys_sendmsg net/compat.c:347 [inline] __do_compat_sys_sendmsg net/compat.c:354 [inline] __se_compat_sys_sendmsg+0xa7/0xc0 net/compat.c:351 __ia32_compat_sys_sendmsg+0x4a/0x70 net/compat.c:351 do_syscall_32_irqs_on arch/x86/entry/common.c:79 [inline] __do_fast_syscall_32+0x102/0x160 arch/x86/entry/common.c:141 do_fast_syscall_32+0x6a/0xc0 arch/x86/entry/common.c:166 do_SYSENTER_32+0x73/0x90 arch/x86/entry/common.c:209 entry_SYSENTER_compat_after_hwframe+0x4d/0x5c Fixes: e1f32190cf7d ("tipc: add support for AEAD key setting via netlink") Signed-off-by: Eric Dumazet Cc: Tuong Lien Cc: Jon Maloy Cc: Ying Xue Reported-by: syzbot Signed-off-by: David S. Miller --- net/tipc/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index 008670d1f43e..136338b85504 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2895,17 +2895,22 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, #ifdef CONFIG_TIPC_CRYPTO static int tipc_nl_retrieve_key(struct nlattr **attrs, - struct tipc_aead_key **key) + struct tipc_aead_key **pkey) { struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; + struct tipc_aead_key *key; if (!attr) return -ENODATA; - *key = (struct tipc_aead_key *)nla_data(attr); - if (nla_len(attr) < tipc_aead_key_size(*key)) + if (nla_len(attr) < sizeof(*key)) + return -EINVAL; + key = (struct tipc_aead_key *)nla_data(attr); + if (key->keylen > TIPC_AEAD_KEYLEN_MAX || + nla_len(attr) < tipc_aead_key_size(key)) return -EINVAL; + *pkey = key; return 0; } From 7233da86697efef41288f8b713c10c2499cffe85 Mon Sep 17 00:00:00 2001 From: Alexander Ovechkin Date: Mon, 15 Mar 2021 14:05:45 +0300 Subject: [PATCH 076/171] tcp: relookup sock for RST+ACK packets handled by obsolete req sock Currently tcp_check_req can be called with obsolete req socket for which big socket have been already created (because of CPU race or early demux assigning req socket to multiple packets in gro batch). Commit e0f9759f530bf789e984 ("tcp: try to keep packet if SYN_RCV race is lost") added retry in case when tcp_check_req is called for PSH|ACK packet. But if client sends RST+ACK immediatly after connection being established (it is performing healthcheck, for example) retry does not occur. In that case tcp_check_req tries to close req socket, leaving big socket active. Fixes: e0f9759f530 ("tcp: try to keep packet if SYN_RCV race is lost") Signed-off-by: Alexander Ovechkin Reported-by: Oleg Senin Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 +- net/ipv4/inet_connection_sock.c | 7 +++++-- net/ipv4/tcp_minisocks.c | 7 +++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 10a625760de9..3c8c59471bc1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog; } -void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); +bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6bd7ca09af03..fd472eae4f5c 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -705,12 +705,15 @@ static bool reqsk_queue_unlink(struct request_sock *req) return found; } -void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { - if (reqsk_queue_unlink(req)) { + bool unlinked = reqsk_queue_unlink(req); + + if (unlinked) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } + return unlinked; } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0055ae0a3bf8..7513ba45553d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -804,8 +804,11 @@ embryonic_reset: tcp_reset(sk, skb); } if (!fastopen) { - inet_csk_reqsk_queue_drop(sk, req); - __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + bool unlinked = inet_csk_reqsk_queue_drop(sk, req); + + if (unlinked) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + *req_stolen = !unlinked; } return NULL; } From 13832ae2755395b2585500c85b64f5109a44227e Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 15 Mar 2021 11:41:16 +0100 Subject: [PATCH 077/171] mptcp: fix ADD_ADDR HMAC in case port is specified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, Linux computes the HMAC contained in ADD_ADDR sub-option using the Address Id and the IP Address, and hardcodes a destination port equal to zero. This is not ok for ADD_ADDR with port: ensure to account for the endpoint port when computing the HMAC, in compliance with RFC8684 §3.4.1. Fixes: 22fb85ffaefb ("mptcp: add port support for ADD_ADDR suboption writing") Reviewed-by: Mat Martineau Acked-by: Geliang Tang Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/mptcp/options.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 444a38681e93..89a4225ed321 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -567,15 +567,15 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, } static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, - struct in_addr *addr) + struct in_addr *addr, u16 port) { u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[7]; msg[0] = addr_id; memcpy(&msg[1], &addr->s_addr, 4); - msg[5] = 0; - msg[6] = 0; + msg[5] = port >> 8; + msg[6] = port & 0xFF; mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); @@ -584,15 +584,15 @@ static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, #if IS_ENABLED(CONFIG_MPTCP_IPV6) static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, - struct in6_addr *addr) + struct in6_addr *addr, u16 port) { u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; msg[0] = addr_id; memcpy(&msg[1], &addr->s6_addr, 16); - msg[17] = 0; - msg[18] = 0; + msg[17] = port >> 8; + msg[18] = port & 0xFF; mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); @@ -646,7 +646,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->ahmac = add_addr_generate_hmac(msk->local_key, msk->remote_key, opts->addr_id, - &opts->addr); + &opts->addr, + opts->port); } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -657,7 +658,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->ahmac = add_addr6_generate_hmac(msk->local_key, msk->remote_key, opts->addr_id, - &opts->addr6); + &opts->addr6, + opts->port); } } #endif @@ -962,12 +964,14 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) hmac = add_addr_generate_hmac(msk->remote_key, msk->local_key, - mp_opt->addr_id, &mp_opt->addr); + mp_opt->addr_id, &mp_opt->addr, + mp_opt->port); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else hmac = add_addr6_generate_hmac(msk->remote_key, msk->local_key, - mp_opt->addr_id, &mp_opt->addr6); + mp_opt->addr_id, &mp_opt->addr6, + mp_opt->port); #endif pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", From 3a5ca857079ea022e0b1b17fc154f7ad7dbc150f Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 2 Mar 2021 13:24:23 +0100 Subject: [PATCH 078/171] can: dev: Move device back to init netns on owning netns delete When a non-initial netns is destroyed, the usual policy is to delete all virtual network interfaces contained, but move physical interfaces back to the initial netns. This keeps the physical interface visible on the system. CAN devices are somewhat special, as they define rtnl_link_ops even if they are physical devices. If a CAN interface is moved into a non-initial netns, destroying that netns lets the interface vanish instead of moving it back to the initial netns. default_device_exit() skips CAN interfaces due to having rtnl_link_ops set. Reproducer: ip netns add foo ip link set can0 netns foo ip netns delete foo WARNING: CPU: 1 PID: 84 at net/core/dev.c:11030 ops_exit_list+0x38/0x60 CPU: 1 PID: 84 Comm: kworker/u4:2 Not tainted 5.10.19 #1 Workqueue: netns cleanup_net [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x94/0xa8) [] (dump_stack) from [] (__warn+0xb8/0x114) [] (__warn) from [] (warn_slowpath_fmt+0x7c/0xac) [] (warn_slowpath_fmt) from [] (ops_exit_list+0x38/0x60) [] (ops_exit_list) from [] (cleanup_net+0x230/0x380) [] (cleanup_net) from [] (process_one_work+0x1d8/0x438) [] (process_one_work) from [] (worker_thread+0x64/0x5a8) [] (worker_thread) from [] (kthread+0x148/0x14c) [] (kthread) from [] (ret_from_fork+0x14/0x2c) To properly restore physical CAN devices to the initial netns on owning netns exit, introduce a flag on rtnl_link_ops that can be set by drivers. For CAN devices setting this flag, default_device_exit() considers them non-virtual, applying the usual namespace move. The issue was introduced in the commit mentioned below, as at that time CAN devices did not have a dellink() operation. Fixes: e008b5fc8dc7 ("net: Simplfy default_device_exit and improve batching.") Link: https://lore.kernel.org/r/20210302122423.872326-1-martin@strongswan.org Signed-off-by: Martin Willi Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/netlink.c | 1 + include/net/rtnetlink.h | 2 ++ net/core/dev.c | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index 867f6be31230..f5d79e6e5483 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -355,6 +355,7 @@ static void can_dellink(struct net_device *dev, struct list_head *head) struct rtnl_link_ops can_link_ops __read_mostly = { .kind = "can", + .netns_refund = true, .maxtype = IFLA_CAN_MAX, .policy = can_policy, .setup = can_setup, diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index e2091bb2b3a8..4da61c950e93 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -33,6 +33,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * * @list: Used internally * @kind: Identifier + * @netns_refund: Physical device, move to init_net on netns exit * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters @@ -64,6 +65,7 @@ struct rtnl_link_ops { size_t priv_size; void (*setup)(struct net_device *dev); + bool netns_refund; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], diff --git a/net/core/dev.c b/net/core/dev.c index 6c5967e80132..a142a207fc1d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11346,7 +11346,7 @@ static void __net_exit default_device_exit(struct net *net) continue; /* Leave virtual devices for the generic cleanup */ - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ From e4912459bd5edd493b61bc7c3a5d9b2eb17f5a89 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 18 Feb 2021 21:58:36 +0100 Subject: [PATCH 079/171] can: isotp: isotp_setsockopt(): only allow to set low level TX flags for CAN-FD CAN-FD frames have struct canfd_frame::flags, while classic CAN frames don't. This patch refuses to set TX flags (struct can_isotp_ll_options::tx_flags) on non CAN-FD isotp sockets. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Link: https://lore.kernel.org/r/20210218215434.1708249-2-mkl@pengutronix.de Cc: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index 3ef7f78e553b..e32d446c121e 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1228,7 +1228,8 @@ static int isotp_setsockopt(struct socket *sock, int level, int optname, if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) return -EINVAL; - if (ll.mtu == CAN_MTU && ll.tx_dl > CAN_MAX_DLEN) + if (ll.mtu == CAN_MTU && + (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0)) return -EINVAL; memcpy(&so->ll, &ll, sizeof(ll)); From d4eb538e1f48b3cf7bb6cb9eb39fe3e9e8a701f7 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 18 Feb 2021 21:24:20 +0100 Subject: [PATCH 080/171] can: isotp: TX-path: ensure that CAN frame flags are initialized The previous patch ensures that the TX flags (struct can_isotp_ll_options::tx_flags) are 0 for classic CAN frames or a user configured value for CAN-FD frames. This patch sets the CAN frames flags unconditionally to the ISO-TP TX flags, so that they are initialized to a proper value. Otherwise when running "candump -x" on a classical CAN ISO-TP stream shows wrongly set "B" and "E" flags. | $ candump any,0:0,#FFFFFFFF -extA | [...] | can0 TX B E 713 [8] 2B 0A 0B 0C 0D 0E 0F 00 | can0 TX B E 713 [8] 2C 01 02 03 04 05 06 07 | can0 TX B E 713 [8] 2D 08 09 0A 0B 0C 0D 0E | can0 TX B E 713 [8] 2E 0F 00 01 02 03 04 05 Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Link: https://lore.kernel.org/r/20210218215434.1708249-2-mkl@pengutronix.de Cc: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index e32d446c121e..430976485d95 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -215,8 +215,7 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) if (ae) ncf->data[0] = so->opt.ext_address; - if (so->ll.mtu == CANFD_MTU) - ncf->flags = so->ll.tx_flags; + ncf->flags = so->ll.tx_flags; can_send_ret = can_send(nskb, 1); if (can_send_ret) @@ -790,8 +789,7 @@ isotp_tx_burst: so->tx.sn %= 16; so->tx.bs++; - if (so->ll.mtu == CANFD_MTU) - cf->flags = so->ll.tx_flags; + cf->flags = so->ll.tx_flags; skb->dev = dev; can_skb_set_owner(skb, sk); @@ -939,8 +937,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } /* send the first or only CAN frame */ - if (so->ll.mtu == CANFD_MTU) - cf->flags = so->ll.tx_flags; + cf->flags = so->ll.tx_flags; skb->dev = dev; skb->sk = sk; From 59ec7b89ed3e921cd0625a8c83f31a30d485fdf8 Mon Sep 17 00:00:00 2001 From: Stephane Grosjean Date: Tue, 9 Mar 2021 09:21:27 +0100 Subject: [PATCH 081/171] can: peak_usb: add forgotten supported devices Since the peak_usb driver also supports the CAN-USB interfaces "PCAN-USB X6" and "PCAN-Chip USB" from PEAK-System GmbH, this patch adds their names to the list of explicitly supported devices. Fixes: ea8b65b596d7 ("can: usb: Add support of PCAN-Chip USB stamp module") Fixes: f00b534ded60 ("can: peak: Add support for PCAN-USB X6 USB interface") Link: https://lore.kernel.org/r/20210309082128.23125-3-s.grosjean@peak-system.com Signed-off-by: Stephane Grosjean Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index f347ecc79aef..f1d018218c93 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -18,6 +18,8 @@ MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB FD adapter"); MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB Pro FD adapter"); +MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-Chip USB"); +MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB X6 adapter"); #define PCAN_USBPROFD_CHANNEL_COUNT 2 #define PCAN_USBFD_CHANNEL_COUNT 1 From 47c5e474bc1e1061fb037d13b5000b38967eb070 Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Tue, 16 Mar 2021 00:15:10 +0100 Subject: [PATCH 082/171] can: flexcan: flexcan_chip_freeze(): fix chip freeze for missing bitrate For cases when flexcan is built-in, bitrate is still not set at registering. So flexcan_chip_freeze() generates: [ 1.860000] *** ZERO DIVIDE *** FORMAT=4 [ 1.860000] Current process id is 1 [ 1.860000] BAD KERNEL TRAP: 00000000 [ 1.860000] PC: [<402e70c8>] flexcan_chip_freeze+0x1a/0xa8 To allow chip freeze, using an hardcoded timeout when bitrate is still not set. Fixes: ec15e27cc890 ("can: flexcan: enable RX FIFO after FRZ/HALT valid") Link: https://lore.kernel.org/r/20210315231510.650593-1-angelo@kernel-space.org Signed-off-by: Angelo Dureghello [mkl: use if instead of ? operator] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 134c05757a3b..57f3635ad8d7 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -697,9 +697,15 @@ static int flexcan_chip_disable(struct flexcan_priv *priv) static int flexcan_chip_freeze(struct flexcan_priv *priv) { struct flexcan_regs __iomem *regs = priv->regs; - unsigned int timeout = 1000 * 1000 * 10 / priv->can.bittiming.bitrate; + unsigned int timeout; + u32 bitrate = priv->can.bittiming.bitrate; u32 reg; + if (bitrate) + timeout = 1000 * 1000 * 10 / bitrate; + else + timeout = FLEXCAN_TIMEOUT_US / 10; + reg = priv->read(®s->mcr); reg |= FLEXCAN_MCR_FRZ | FLEXCAN_MCR_HALT; priv->write(reg, ®s->mcr); From 7c6e6bce08f918b64459415f58061d4d6df44994 Mon Sep 17 00:00:00 2001 From: Jimmy Assarsson Date: Tue, 9 Mar 2021 10:17:23 +0100 Subject: [PATCH 083/171] can: kvaser_pciefd: Always disable bus load reporting Under certain circumstances, when switching from Kvaser's linuxcan driver (kvpciefd) to the SocketCAN driver (kvaser_pciefd), the bus load reporting is not disabled. This is flooding the kernel log with prints like: [3485.574677] kvaser_pciefd 0000:02:00.0: Received unexpected packet type 0x00000009 Always put the controller in the expected state, instead of assuming that bus load reporting is inactive. Note: If bus load reporting is enabled when the driver is loaded, you will still get a number of bus load packages (and printouts), before it is disabled. Fixes: 26ad340e582d ("can: kvaser_pciefd: Add driver for Kvaser PCIEcan devices") Link: https://lore.kernel.org/r/20210309091724.31262-1-jimmyassarsson@gmail.com Signed-off-by: Jimmy Assarsson Signed-off-by: Marc Kleine-Budde --- drivers/net/can/kvaser_pciefd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index 37e05010ca91..74d9899fc904 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -57,6 +57,7 @@ MODULE_DESCRIPTION("CAN driver for Kvaser CAN/PCIe devices"); #define KVASER_PCIEFD_KCAN_STAT_REG 0x418 #define KVASER_PCIEFD_KCAN_MODE_REG 0x41c #define KVASER_PCIEFD_KCAN_BTRN_REG 0x420 +#define KVASER_PCIEFD_KCAN_BUS_LOAD_REG 0x424 #define KVASER_PCIEFD_KCAN_BTRD_REG 0x428 #define KVASER_PCIEFD_KCAN_PWM_REG 0x430 /* Loopback control register */ @@ -949,6 +950,9 @@ static int kvaser_pciefd_setup_can_ctrls(struct kvaser_pciefd *pcie) timer_setup(&can->bec_poll_timer, kvaser_pciefd_bec_poll_timer, 0); + /* Disable Bus load reporting */ + iowrite32(0, can->reg_base + KVASER_PCIEFD_KCAN_BUS_LOAD_REG); + tx_npackets = ioread32(can->reg_base + KVASER_PCIEFD_KCAN_TX_NPACKETS_REG); if (((tx_npackets >> KVASER_PCIEFD_KCAN_TX_NPACKETS_MAX_SHIFT) & From 7507479c46b120c37ef83e59be7683a526e98e1a Mon Sep 17 00:00:00 2001 From: Jimmy Assarsson Date: Tue, 9 Mar 2021 10:17:24 +0100 Subject: [PATCH 084/171] can: kvaser_usb: Add support for USBcan Pro 4xHS Add support for Kvaser USBcan Pro 4xHS. Link: https://lore.kernel.org/r/20210309091724.31262-2-jimmyassarsson@gmail.com Signed-off-by: Jimmy Assarsson Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/Kconfig | 1 + drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/usb/Kconfig b/drivers/net/can/usb/Kconfig index c1e5d5b570b6..538f4d9adb91 100644 --- a/drivers/net/can/usb/Kconfig +++ b/drivers/net/can/usb/Kconfig @@ -73,6 +73,7 @@ config CAN_KVASER_USB - Kvaser Memorator Pro 5xHS - Kvaser USBcan Light 4xHS - Kvaser USBcan Pro 2xHS v2 + - Kvaser USBcan Pro 4xHS - Kvaser USBcan Pro 5xHS - Kvaser U100 - Kvaser U100P diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 2b7efd296758..4e97da8434ab 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -86,8 +86,9 @@ #define USB_U100_PRODUCT_ID 273 #define USB_U100P_PRODUCT_ID 274 #define USB_U100S_PRODUCT_ID 275 +#define USB_USBCAN_PRO_4HS_PRODUCT_ID 276 #define USB_HYDRA_PRODUCT_ID_END \ - USB_U100S_PRODUCT_ID + USB_USBCAN_PRO_4HS_PRODUCT_ID static inline bool kvaser_is_leaf(const struct usb_device_id *id) { @@ -193,6 +194,7 @@ static const struct usb_device_id kvaser_usb_table[] = { { USB_DEVICE(KVASER_VENDOR_ID, USB_U100_PRODUCT_ID) }, { USB_DEVICE(KVASER_VENDOR_ID, USB_U100P_PRODUCT_ID) }, { USB_DEVICE(KVASER_VENDOR_ID, USB_U100S_PRODUCT_ID) }, + { USB_DEVICE(KVASER_VENDOR_ID, USB_USBCAN_PRO_4HS_PRODUCT_ID) }, { } }; MODULE_DEVICE_TABLE(usb, kvaser_usb_table); From 0429d6d89f97ebff4f17f13f5b5069c66bde8138 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 28 Feb 2021 21:45:11 -0500 Subject: [PATCH 085/171] can: c_can_pci: c_can_pci_remove(): fix use-after-free There is a UAF in c_can_pci_remove(). dev is released by free_c_can_dev() and is used by pci_iounmap(pdev, priv->base) later. To fix this issue, save the mmio address before releasing dev. Fixes: 5b92da0443c2 ("c_can_pci: generic module for C_CAN/D_CAN on PCI") Link: https://lore.kernel.org/r/20210301024512.539039-1-ztong0001@gmail.com Signed-off-by: Tong Zhang Signed-off-by: Marc Kleine-Budde --- drivers/net/can/c_can/c_can_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/c_can/c_can_pci.c b/drivers/net/can/c_can/c_can_pci.c index 406b4847e5dc..7efb60b50876 100644 --- a/drivers/net/can/c_can/c_can_pci.c +++ b/drivers/net/can/c_can/c_can_pci.c @@ -239,12 +239,13 @@ static void c_can_pci_remove(struct pci_dev *pdev) { struct net_device *dev = pci_get_drvdata(pdev); struct c_can_priv *priv = netdev_priv(dev); + void __iomem *addr = priv->base; unregister_c_can_dev(dev); free_c_can_dev(dev); - pci_iounmap(pdev, priv->base); + pci_iounmap(pdev, addr); pci_disable_msi(pdev); pci_clear_master(pdev); pci_release_regions(pdev); From 6e2fe01dd6f98da6cae8b07cd5cfa67abc70d97d Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Mon, 1 Mar 2021 21:55:40 -0500 Subject: [PATCH 086/171] can: c_can: move runtime PM enable/disable to c_can_platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently doing modprobe c_can_pci will make the kernel complain: Unbalanced pm_runtime_enable! this is caused by pm_runtime_enable() called before pm is initialized. This fix is similar to 227619c3ff7c, move those pm_enable/disable code to c_can_platform. Fixes: 4cdd34b26826 ("can: c_can: Add runtime PM support to Bosch C_CAN/D_CAN controller") Link: http://lore.kernel.org/r/20210302025542.987600-1-ztong0001@gmail.com Signed-off-by: Tong Zhang Tested-by: Uwe Kleine-König Signed-off-by: Marc Kleine-Budde --- drivers/net/can/c_can/c_can.c | 24 +----------------------- drivers/net/can/c_can/c_can_platform.c | 6 +++++- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index ef474bae47a1..6958830cb983 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -212,18 +212,6 @@ static const struct can_bittiming_const c_can_bittiming_const = { .brp_inc = 1, }; -static inline void c_can_pm_runtime_enable(const struct c_can_priv *priv) -{ - if (priv->device) - pm_runtime_enable(priv->device); -} - -static inline void c_can_pm_runtime_disable(const struct c_can_priv *priv) -{ - if (priv->device) - pm_runtime_disable(priv->device); -} - static inline void c_can_pm_runtime_get_sync(const struct c_can_priv *priv) { if (priv->device) @@ -1335,7 +1323,6 @@ static const struct net_device_ops c_can_netdev_ops = { int register_c_can_dev(struct net_device *dev) { - struct c_can_priv *priv = netdev_priv(dev); int err; /* Deactivate pins to prevent DRA7 DCAN IP from being @@ -1345,28 +1332,19 @@ int register_c_can_dev(struct net_device *dev) */ pinctrl_pm_select_sleep_state(dev->dev.parent); - c_can_pm_runtime_enable(priv); - dev->flags |= IFF_ECHO; /* we support local echo */ dev->netdev_ops = &c_can_netdev_ops; err = register_candev(dev); - if (err) - c_can_pm_runtime_disable(priv); - else + if (!err) devm_can_led_init(dev); - return err; } EXPORT_SYMBOL_GPL(register_c_can_dev); void unregister_c_can_dev(struct net_device *dev) { - struct c_can_priv *priv = netdev_priv(dev); - unregister_candev(dev); - - c_can_pm_runtime_disable(priv); } EXPORT_SYMBOL_GPL(unregister_c_can_dev); diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c index 05f425ceb53a..47b251b1607c 100644 --- a/drivers/net/can/c_can/c_can_platform.c +++ b/drivers/net/can/c_can/c_can_platform.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -386,6 +387,7 @@ static int c_can_plat_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dev); SET_NETDEV_DEV(dev, &pdev->dev); + pm_runtime_enable(priv->device); ret = register_c_can_dev(dev); if (ret) { dev_err(&pdev->dev, "registering %s failed (err=%d)\n", @@ -398,6 +400,7 @@ static int c_can_plat_probe(struct platform_device *pdev) return 0; exit_free_device: + pm_runtime_disable(priv->device); free_c_can_dev(dev); exit: dev_err(&pdev->dev, "probe failed\n"); @@ -408,9 +411,10 @@ exit: static int c_can_plat_remove(struct platform_device *pdev) { struct net_device *dev = platform_get_drvdata(pdev); + struct c_can_priv *priv = netdev_priv(dev); unregister_c_can_dev(dev); - + pm_runtime_disable(priv->device); free_c_can_dev(dev); return 0; From c0e399f3baf42279f48991554240af8c457535d1 Mon Sep 17 00:00:00 2001 From: Torin Cooper-Bennun Date: Wed, 3 Mar 2021 10:31:52 +0000 Subject: [PATCH 087/171] can: m_can: m_can_do_rx_poll(): fix extraneous msg loss warning Message loss from RX FIFO 0 is already handled in m_can_handle_lost_msg(), with netdev output included. Removing this warning also improves driver performance under heavy load, where m_can_do_rx_poll() may be called many times before this interrupt is cleared, causing this message to be output many times (thanks Mariusz Madej for this report). Fixes: e0d1f4816f2a ("can: m_can: add Bosch M_CAN controller support") Link: https://lore.kernel.org/r/20210303103151.3760532-1-torin@maxiluxsystems.com Reported-by: Mariusz Madej Signed-off-by: Torin Cooper-Bennun Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 3752520a7d4b..d783c46cac16 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -501,9 +501,6 @@ static int m_can_do_rx_poll(struct net_device *dev, int quota) } while ((rxfs & RXFS_FFL_MASK) && (quota > 0)) { - if (rxfs & RXFS_RFL) - netdev_warn(dev, "Rx FIFO 0 Message Lost\n"); - m_can_read_fifo(dev, rxfs); quota--; From e98d9ee64ee2cc9b1d1a8e26610ec4d0392ebe50 Mon Sep 17 00:00:00 2001 From: Torin Cooper-Bennun Date: Wed, 3 Mar 2021 14:43:51 +0000 Subject: [PATCH 088/171] can: m_can: m_can_rx_peripheral(): fix RX being blocked by errors For M_CAN peripherals, m_can_rx_handler() was called with quota = 1, which caused any error handling to block RX from taking place until the next time the IRQ handler is called. This had been observed to cause RX to be blocked indefinitely in some cases. This is fixed by calling m_can_rx_handler with a sensibly high quota. Fixes: f524f829b75a ("can: m_can: Create a m_can platform framework") Link: https://lore.kernel.org/r/20210303144350.4093750-1-torin@maxiluxsystems.com Suggested-by: Marc Kleine-Budde Signed-off-by: Torin Cooper-Bennun Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index d783c46cac16..0c8d36bc668c 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -873,7 +873,7 @@ static int m_can_rx_peripheral(struct net_device *dev) { struct m_can_classdev *cdev = netdev_priv(dev); - m_can_rx_handler(dev, 1); + m_can_rx_handler(dev, M_CAN_NAPI_WEIGHT); m_can_enable_all_interrupts(cdev); From 1944015fe9c1d9fa5e9eb7ffbbb5ef8954d6753b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Feb 2021 11:22:14 +0100 Subject: [PATCH 089/171] mac80211: fix rate mask reset Coverity reported the strange "if (~...)" condition that's always true. It suggested that ! was intended instead of ~, but upon further analysis I'm convinced that what really was intended was a comparison to 0xff/0xffff (in HT/VHT cases respectively), since this indicates that all of the rates are enabled. Change the comparison accordingly. I'm guessing this never really mattered because a reset to not having a rate mask is basically equivalent to having a mask that enables all rates. Reported-by: Colin Ian King Fixes: 2ffbe6d33366 ("mac80211: fix and optimize MCS mask handling") Fixes: b119ad6e726c ("mac80211: add rate mask logic for vht rates") Reviewed-by: Colin Ian King Link: https://lore.kernel.org/r/20210212112213.36b38078f569.I8546a20c80bc1669058eb453e213630b846e107b@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c4c70e30ad7f..68a0de02b561 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2950,14 +2950,14 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { - if (~sdata->rc_rateidx_mcs_mask[i][j]) { + if (sdata->rc_rateidx_mcs_mask[i][j] != 0xff) { sdata->rc_has_mcs_mask[i] = true; break; } } for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { - if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) { + if (sdata->rc_rateidx_vht_mcs_mask[i][j] != 0xffff) { sdata->rc_has_vht_mcs_mask[i] = true; break; } From 3bd801b14e0c5d29eeddc7336558beb3344efaa3 Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Sat, 13 Feb 2021 14:36:53 +0100 Subject: [PATCH 090/171] mac80211: fix double free in ibss_leave Clear beacon ie pointer and ie length after free in order to prevent double free. ================================================================== BUG: KASAN: double-free or invalid-free \ in ieee80211_ibss_leave+0x83/0xe0 net/mac80211/ibss.c:1876 CPU: 0 PID: 8472 Comm: syz-executor100 Not tainted 5.11.0-rc6-syzkaller #0 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x5b/0x2c6 mm/kasan/report.c:230 kasan_report_invalid_free+0x51/0x80 mm/kasan/report.c:355 ____kasan_slab_free+0xcc/0xe0 mm/kasan/common.c:341 kasan_slab_free include/linux/kasan.h:192 [inline] __cache_free mm/slab.c:3424 [inline] kfree+0xed/0x270 mm/slab.c:3760 ieee80211_ibss_leave+0x83/0xe0 net/mac80211/ibss.c:1876 rdev_leave_ibss net/wireless/rdev-ops.h:545 [inline] __cfg80211_leave_ibss+0x19a/0x4c0 net/wireless/ibss.c:212 __cfg80211_leave+0x327/0x430 net/wireless/core.c:1172 cfg80211_leave net/wireless/core.c:1221 [inline] cfg80211_netdev_notifier_call+0x9e8/0x12c0 net/wireless/core.c:1335 notifier_call_chain+0xb5/0x200 kernel/notifier.c:83 call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:2040 call_netdevice_notifiers_extack net/core/dev.c:2052 [inline] call_netdevice_notifiers net/core/dev.c:2066 [inline] __dev_close_many+0xee/0x2e0 net/core/dev.c:1586 __dev_close net/core/dev.c:1624 [inline] __dev_change_flags+0x2cb/0x730 net/core/dev.c:8476 dev_change_flags+0x8a/0x160 net/core/dev.c:8549 dev_ifsioc+0x210/0xa70 net/core/dev_ioctl.c:265 dev_ioctl+0x1b1/0xc40 net/core/dev_ioctl.c:511 sock_do_ioctl+0x148/0x2d0 net/socket.c:1060 sock_ioctl+0x477/0x6a0 net/socket.c:1177 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported-by: syzbot+93976391bf299d425f44@syzkaller.appspotmail.com Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20210213133653.367130-1-markus.theil@tu-ilmenau.de Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 1f552f374e97..a7ac53a2f00d 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1874,6 +1874,8 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) /* remove beacon */ kfree(sdata->u.ibss.ie); + sdata->u.ibss.ie = NULL; + sdata->u.ibss.ie_len = 0; /* on the next join, re-program HT parameters */ memset(&ifibss->ht_capa, 0, sizeof(ifibss->ht_capa)); From 29175be06d2f7d0e694bbdd086644dc15db66d60 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 19 Feb 2021 10:57:45 +0100 Subject: [PATCH 091/171] mac80211: minstrel_ht: remove unused variable 'mg' This probably came in through some refactoring and what is now a call to minstrel_ht_group_min_rate_offset(), remove the unused variable. Reported-by: kernel test robot Acked-by: Felix Fietkau Link: https://lore.kernel.org/r/20210219105744.f2538a80f6cf.I3d53554c158d5b896ac07ea546bceac67372ec28@changeid Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 2f44f4919789..ecad9b10984f 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -805,7 +805,6 @@ minstrel_ht_group_min_rate_offset(struct minstrel_ht_sta *mi, int group, static u16 minstrel_ht_next_inc_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur) { - struct minstrel_mcs_group_data *mg; u8 type = MINSTREL_SAMPLE_TYPE_INC; int i, index = 0; u8 group; @@ -813,7 +812,6 @@ minstrel_ht_next_inc_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur) group = mi->sample[type].sample_group; for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); - mg = &mi->groups[group]; index = minstrel_ht_group_min_rate_offset(mi, group, fast_rate_dur); From 0f7e90faddeef53a3568f449a0c3992d77510b66 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 23 Feb 2021 13:19:26 +0800 Subject: [PATCH 092/171] mac80211: Allow HE operation to be longer than expected. We observed some Cisco APs sending the following HE Operation IE in associate response: ff 0a 24 f4 3f 00 01 fc ff 00 00 00 Its HE operation parameter is 0x003ff4, so the expected total length is 7 which does not match the actual length = 10. This causes association failing with "HE AP is missing HE Capability/operation." According to P802.11ax_D4 Table9-94, HE operation is extensible, and according to 802.11-2016 10.27.8, STA should discard the part beyond the maximum length and parse the truncated element. Allow HE operation element to be longer than expected to handle this case and future extensions. Fixes: e4d005b80dee ("mac80211: refactor extended element parsing") Signed-off-by: Brian Norris Signed-off-by: Yen-lin Lai Link: https://lore.kernel.org/r/20210223051926.2653301-1-yenlinlai@chromium.org Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- net/mac80211/util.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2e33a1263518..ce4e3855fec1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5071,7 +5071,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ies->data, ies->len); if (he_oper_ie && - he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3])) + he_oper_ie[1] >= ieee80211_he_oper_size(&he_oper_ie[3])) he_oper = (void *)(he_oper_ie + 3); else he_oper = NULL; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f080fcf60e45..c0fa526a45b4 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -968,7 +968,7 @@ static void ieee80211_parse_extension_element(u32 *crc, break; case WLAN_EID_EXT_HE_OPERATION: if (len >= sizeof(*elems->he_operation) && - len == ieee80211_he_oper_size(data) - 1) { + len >= ieee80211_he_oper_size(data) - 1) { if (crc) *crc = crc32_be(*crc, (void *)elem, elem->datalen + 2); From 58d25626f6f0ea5bcec3c13387b9f835d188723d Mon Sep 17 00:00:00 2001 From: Daniel Phan Date: Tue, 9 Mar 2021 12:41:36 -0800 Subject: [PATCH 093/171] mac80211: Check crypto_aead_encrypt for errors crypto_aead_encrypt returns <0 on error, so if these calls are not checked, execution may continue with failed encrypts. It also seems that these two crypto_aead_encrypt calls are the only instances in the codebase that are not checked for errors. Signed-off-by: Daniel Phan Link: https://lore.kernel.org/r/20210309204137.823268-1-daniel.phan36@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/aead_api.c | 5 +++-- net/mac80211/aes_gmac.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/net/mac80211/aead_api.c b/net/mac80211/aead_api.c index d7b3d905d535..b00d6f5b33f4 100644 --- a/net/mac80211/aead_api.c +++ b/net/mac80211/aead_api.c @@ -23,6 +23,7 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, struct aead_request *aead_req; int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); u8 *__aad; + int ret; aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC); if (!aead_req) @@ -40,10 +41,10 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, aead_request_set_crypt(aead_req, sg, sg, data_len, b_0); aead_request_set_ad(aead_req, sg[0].length); - crypto_aead_encrypt(aead_req); + ret = crypto_aead_encrypt(aead_req); kfree_sensitive(aead_req); - return 0; + return ret; } int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c index 6f3b3a0cc10a..512cab073f2e 100644 --- a/net/mac80211/aes_gmac.c +++ b/net/mac80211/aes_gmac.c @@ -22,6 +22,7 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, struct aead_request *aead_req; int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); const __le16 *fc; + int ret; if (data_len < GMAC_MIC_LEN) return -EINVAL; @@ -59,10 +60,10 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, aead_request_set_crypt(aead_req, sg, sg, 0, iv); aead_request_set_ad(aead_req, GMAC_AAD_LEN + data_len); - crypto_aead_encrypt(aead_req); + ret = crypto_aead_encrypt(aead_req); kfree_sensitive(aead_req); - return 0; + return ret; } struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[], From 77cbf790e5b482256662e14c8b6ef4fecb07d06d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Mar 2021 21:58:40 +0100 Subject: [PATCH 094/171] nl80211: fix locking for wireless device netns change We have all the network interfaces marked as netns-local since the only reasonable thing to do right now is to set a whole device, including all netdevs, into a different network namespace. For this reason, we also have our own way of changing the network namespace. Unfortunately, the RTNL locking changes broke this, and it now results in many RTNL assertions. The trivial fix for those (just hold RTNL for the changes) however leads to deadlocks in the cfg80211 netdev notifier. Since we only need the wiphy, and that's still protected by the RTNL, add a new NL80211_FLAG_NO_WIPHY_MTX flag to the nl80211 ops and use it to _not_ take the wiphy mutex but only the RTNL. This way, the notifier does all the work necessary during unregistration/registration of the netdevs from the old and in the new namespace. Reported-by: Sid Hayn Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20210310215839.eadf7c43781b.I5fc6cf6676f800ab8008e03bbea9c3349b02d804@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 521d36bb0803..ae6097fff133 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14789,6 +14789,7 @@ bad_tid_conf: #define NL80211_FLAG_NEED_WDEV_UP (NL80211_FLAG_NEED_WDEV |\ NL80211_FLAG_CHECK_NETDEV_UP) #define NL80211_FLAG_CLEAR_SKB 0x20 +#define NL80211_FLAG_NO_WIPHY_MTX 0x40 static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -14840,7 +14841,7 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, info->user_ptr[0] = rdev; } - if (rdev) { + if (rdev && !(ops->internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) { wiphy_lock(&rdev->wiphy); /* we keep the mutex locked until post_doit */ __release(&rdev->wiphy.mtx); @@ -14865,7 +14866,8 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, } } - if (info->user_ptr[0]) { + if (info->user_ptr[0] && + !(ops->internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; /* we kept the mutex locked since pre_doit */ @@ -15329,7 +15331,9 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_wiphy_netns, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_NO_WIPHY_MTX, }, { .cmd = NL80211_CMD_GET_SURVEY, From 041c881a0ba8a75f71118bd9766b78f04beed469 Mon Sep 17 00:00:00 2001 From: Karthikeyan Kathirvel Date: Thu, 11 Mar 2021 10:59:07 +0530 Subject: [PATCH 095/171] mac80211: choose first enabled channel for monitor Even if the first channel from sband channel list is invalid or disabled mac80211 ends up choosing it as the default channel for monitor interfaces, making them not usable. Fix this by assigning the first available valid or enabled channel instead. Signed-off-by: Karthikeyan Kathirvel Link: https://lore.kernel.org/r/1615440547-7661-1-git-send-email-kathirve@codeaurora.org [reword commit message, comment, code cleanups] Signed-off-by: Johannes Berg --- net/mac80211/main.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 4f3f8bb58e76..1b9c82616606 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -973,8 +973,19 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) continue; if (!dflt_chandef.chan) { + /* + * Assign the first enabled channel to dflt_chandef + * from the list of channels + */ + for (i = 0; i < sband->n_channels; i++) + if (!(sband->channels[i].flags & + IEEE80211_CHAN_DISABLED)) + break; + /* if none found then use the first anyway */ + if (i == sband->n_channels) + i = 0; cfg80211_chandef_create(&dflt_chandef, - &sband->channels[0], + &sband->channels[i], NL80211_CHAN_NO_HT); /* init channel we're on */ if (!local->use_chanctx && !local->_oper_chandef.chan) { From 239729a21e528466d02f5558936306ffa9314ad1 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 12 Mar 2021 11:36:51 -0500 Subject: [PATCH 096/171] wireless/nl80211: fix wdev_id may be used uninitialized Build currently fails with -Werror=maybe-uninitialized set: net/wireless/nl80211.c: In function '__cfg80211_wdev_from_attrs': net/wireless/nl80211.c:124:44: error: 'wdev_id' may be used uninitialized in this function [-Werror=maybe-uninitialized] Easy fix is to just initialize wdev_id to 0, since it's value doesn't otherwise matter unless have_wdev_id is true. Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") CC: Johannes Berg CC: "David S. Miller" CC: Jakub Kicinski CC: linux-wireless@vger.kernel.org CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Link: https://lore.kernel.org/r/20210312163651.1398207-1-jarod@redhat.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ae6097fff133..034af85f79d8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -70,7 +70,7 @@ __cfg80211_wdev_from_attrs(struct cfg80211_registered_device *rdev, struct wireless_dev *result = NULL; bool have_ifidx = attrs[NL80211_ATTR_IFINDEX]; bool have_wdev_id = attrs[NL80211_ATTR_WDEV]; - u64 wdev_id; + u64 wdev_id = 0; int wiphy_idx = -1; int ifidx = -1; From 81f711d67a973bf8a6db9556faf299b4074d536e Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 16 Mar 2021 01:04:29 +0000 Subject: [PATCH 097/171] selftests/net: fix warnings on reuseaddr_ports_exhausted Fix multiple warnings seen with gcc 10.2.1: reuseaddr_ports_exhausted.c:32:41: warning: missing braces around initializer [-Wmissing-braces] 32 | struct reuse_opts unreusable_opts[12] = { | ^ 33 | {0, 0, 0, 0}, | { } { } Fixes: 7f204a7de8b0 ("selftests: net: Add SO_REUSEADDR test to check if 4-tuples are fully utilized.") Signed-off-by: Carlos Llamas Signed-off-by: David S. Miller --- .../selftests/net/reuseaddr_ports_exhausted.c | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c index 7b01b7c2ec10..066efd30e294 100644 --- a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c @@ -30,25 +30,25 @@ struct reuse_opts { }; struct reuse_opts unreusable_opts[12] = { - {0, 0, 0, 0}, - {0, 0, 0, 1}, - {0, 0, 1, 0}, - {0, 0, 1, 1}, - {0, 1, 0, 0}, - {0, 1, 0, 1}, - {0, 1, 1, 0}, - {0, 1, 1, 1}, - {1, 0, 0, 0}, - {1, 0, 0, 1}, - {1, 0, 1, 0}, - {1, 0, 1, 1}, + {{0, 0}, {0, 0}}, + {{0, 0}, {0, 1}}, + {{0, 0}, {1, 0}}, + {{0, 0}, {1, 1}}, + {{0, 1}, {0, 0}}, + {{0, 1}, {0, 1}}, + {{0, 1}, {1, 0}}, + {{0, 1}, {1, 1}}, + {{1, 0}, {0, 0}}, + {{1, 0}, {0, 1}}, + {{1, 0}, {1, 0}}, + {{1, 0}, {1, 1}}, }; struct reuse_opts reusable_opts[4] = { - {1, 1, 0, 0}, - {1, 1, 0, 1}, - {1, 1, 1, 0}, - {1, 1, 1, 1}, + {{1, 1}, {0, 0}}, + {{1, 1}, {0, 1}}, + {{1, 1}, {1, 0}}, + {{1, 1}, {1, 1}}, }; int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport) From 8a4452ca29f9dc6a65e45a38c96af83b8ecb27fc Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 16 Mar 2021 11:27:37 +0800 Subject: [PATCH 098/171] docs: net: ena: Fix ena_start_xmit() function name typo The ena.rst documentation referred to end_start_xmit() when it should refer to ena_start_xmit(). Fix the typo. Signed-off-by: Zenghui Yu Acked-by: Shay Agroskin Signed-off-by: David S. Miller --- Documentation/networking/device_drivers/ethernet/amazon/ena.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/ethernet/amazon/ena.rst b/Documentation/networking/device_drivers/ethernet/amazon/ena.rst index 3561a8a29fd2..f8c6469f2bd2 100644 --- a/Documentation/networking/device_drivers/ethernet/amazon/ena.rst +++ b/Documentation/networking/device_drivers/ethernet/amazon/ena.rst @@ -267,7 +267,7 @@ DATA PATH Tx -- -end_start_xmit() is called by the stack. This function does the following: +ena_start_xmit() is called by the stack. This function does the following: - Maps data buffers (skb->data and frags). - Populates ena_buf for the push buffer (if the driver and device are From d29334c15d33a6a92d2043ca88f84cd5ad026c57 Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 16 Mar 2021 16:33:54 +0800 Subject: [PATCH 099/171] net/sched: act_api: fix miss set post_ct for ovs after do conntrack in act_ct When openvswitch conntrack offload with act_ct action. The first rule do conntrack in the act_ct in tc subsystem. And miss the next rule in the tc and fallback to the ovs datapath but miss set post_ct flag which will lead the ct_state_key with -trk flag. Fixes: 7baf2429a1a9 ("net/sched: cls_flower add CT_FLAGS_INVALID flag support") Signed-off-by: wenxu Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/openvswitch/conntrack.c | 8 +++++--- net/openvswitch/conntrack.h | 6 ++++-- net/openvswitch/flow.c | 4 +++- net/sched/cls_api.c | 1 + 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..f2c9ee71cb2c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -285,6 +285,7 @@ struct nf_bridge_info { struct tc_skb_ext { __u32 chain; __u16 mru; + bool post_ct; }; #endif diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 5eddfe7bd391..71cec03e8612 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -271,9 +271,11 @@ static void ovs_ct_update_key(const struct sk_buff *skb, /* This is called to initialize CT key fields possibly coming in from the local * stack. */ -void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) +void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key, + bool post_ct) { - ovs_ct_update_key(skb, NULL, key, false, false); + ovs_ct_update_key(skb, NULL, key, post_ct, false); } int ovs_ct_put_key(const struct sw_flow_key *swkey, @@ -1332,7 +1334,7 @@ int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) if (skb_nfct(skb)) { nf_conntrack_put(skb_nfct(skb)); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - ovs_ct_fill_key(skb, key); + ovs_ct_fill_key(skb, key, false); } return 0; diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 59dc32761b91..317e525c8a11 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -25,7 +25,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); -void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key, + bool post_ct); int ovs_ct_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, struct sk_buff *skb); void ovs_ct_free_action(const struct nlattr *a); @@ -74,7 +75,8 @@ static inline int ovs_ct_clear(struct sk_buff *skb, } static inline void ovs_ct_fill_key(const struct sk_buff *skb, - struct sw_flow_key *key) + struct sw_flow_key *key, + bool post_ct) { key->ct_state = 0; key->ct_zone = 0; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index c7f34d6a9934..e586424d8b04 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -857,6 +857,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) struct tc_skb_ext *tc_ext; #endif + bool post_ct = false; int res, err; /* Extract metadata from packet. */ @@ -895,6 +896,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, tc_ext = skb_ext_find(skb, TC_SKB_EXT); key->recirc_id = tc_ext ? tc_ext->chain : 0; OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0; + post_ct = tc_ext ? tc_ext->post_ct : false; } else { key->recirc_id = 0; } @@ -904,7 +906,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, err = key_extract(skb, key); if (!err) - ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */ + ovs_ct_fill_key(skb, key, post_ct); /* Must be after key_extract(). */ return err; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e37556cc37ab..13341e7fb077 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1629,6 +1629,7 @@ int tcf_classify_ingress(struct sk_buff *skb, return TC_ACT_SHOT; ext->chain = last_executed_chain; ext->mru = qdisc_skb_cb(skb)->mru; + ext->post_ct = qdisc_skb_cb(skb)->post_ct; } return ret; From a3bc483216650a7232559bf0a1debfbabff3e12c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 Mar 2021 15:03:41 +0100 Subject: [PATCH 100/171] net: broadcom: BCM4908_ENET should not default to y, unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merely enabling compile-testing should not enable additional code. To fix this, restrict the automatic enabling of BCM4908_ENET to ARCH_BCM4908. Fixes: 4feffeadbcb2e5b1 ("net: broadcom: bcm4908enet: add BCM4908 controller driver") Signed-off-by: Geert Uytterhoeven Acked-by: Florian Fainelli Acked-by: Rafał Miłecki Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig index f8a168b73307..cb88ffb8f12f 100644 --- a/drivers/net/ethernet/broadcom/Kconfig +++ b/drivers/net/ethernet/broadcom/Kconfig @@ -54,7 +54,7 @@ config B44_PCI config BCM4908_ENET tristate "Broadcom BCM4908 internal mac support" depends on ARCH_BCM4908 || COMPILE_TEST - default y + default y if ARCH_BCM4908 help This driver supports Ethernet controller integrated into Broadcom BCM4908 family SoCs. From 982e5ee23d764fe6158f67a7813d416335e978b0 Mon Sep 17 00:00:00 2001 From: Louis Peens Date: Tue, 16 Mar 2021 19:13:08 +0100 Subject: [PATCH 101/171] nfp: flower: fix unsupported pre_tunnel flows There are some pre_tunnel flows combinations which are incorrectly being offloaded without proper support, fix these. - Matching on MPLS is not supported for pre_tun. - Match on IPv4/IPv6 layer must be present. - Destination MAC address must match pre_tun.dev MAC Fixes: 120ffd84a9ec ("nfp: flower: verify pre-tunnel rules") Signed-off-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- .../ethernet/netronome/nfp/flower/offload.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 1c59aff2163c..d72225d64a75 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -1142,6 +1142,12 @@ nfp_flower_validate_pre_tun_rule(struct nfp_app *app, return -EOPNOTSUPP; } + if (!(key_layer & NFP_FLOWER_LAYER_IPV4) && + !(key_layer & NFP_FLOWER_LAYER_IPV6)) { + NL_SET_ERR_MSG_MOD(extack, "unsupported pre-tunnel rule: match on ipv4/ipv6 eth_type must be present"); + return -EOPNOTSUPP; + } + /* Skip fields known to exist. */ mask += sizeof(struct nfp_flower_meta_tci); ext += sizeof(struct nfp_flower_meta_tci); @@ -1152,6 +1158,13 @@ nfp_flower_validate_pre_tun_rule(struct nfp_app *app, mask += sizeof(struct nfp_flower_in_port); ext += sizeof(struct nfp_flower_in_port); + /* Ensure destination MAC address matches pre_tun_dev. */ + mac = (struct nfp_flower_mac_mpls *)ext; + if (memcmp(&mac->mac_dst[0], flow->pre_tun_rule.dev->dev_addr, 6)) { + NL_SET_ERR_MSG_MOD(extack, "unsupported pre-tunnel rule: dest MAC must match output dev MAC"); + return -EOPNOTSUPP; + } + /* Ensure destination MAC address is fully matched. */ mac = (struct nfp_flower_mac_mpls *)mask; if (!is_broadcast_ether_addr(&mac->mac_dst[0])) { @@ -1159,6 +1172,11 @@ nfp_flower_validate_pre_tun_rule(struct nfp_app *app, return -EOPNOTSUPP; } + if (mac->mpls_lse) { + NL_SET_ERR_MSG_MOD(extack, "unsupported pre-tunnel rule: MPLS not supported"); + return -EOPNOTSUPP; + } + mask += sizeof(struct nfp_flower_mac_mpls); ext += sizeof(struct nfp_flower_mac_mpls); if (key_layer & NFP_FLOWER_LAYER_IPV4 || From 5c4f5e19d6a8e159127b9d653bb67e0dc7a28047 Mon Sep 17 00:00:00 2001 From: Louis Peens Date: Tue, 16 Mar 2021 19:13:09 +0100 Subject: [PATCH 102/171] nfp: flower: add ipv6 bit to pre_tunnel control message Differentiate between ipv4 and ipv6 flows when configuring the pre_tunnel table to prevent them trampling each other in the table. Fixes: 783461604f7e ("nfp: flower: update flow merge code to support IPv6 tunnels") Signed-off-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- .../ethernet/netronome/nfp/flower/tunnel_conf.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index 7248d248f604..d19c02e99114 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -16,8 +16,9 @@ #define NFP_FL_MAX_ROUTES 32 #define NFP_TUN_PRE_TUN_RULE_LIMIT 32 -#define NFP_TUN_PRE_TUN_RULE_DEL 0x1 -#define NFP_TUN_PRE_TUN_IDX_BIT 0x8 +#define NFP_TUN_PRE_TUN_RULE_DEL BIT(0) +#define NFP_TUN_PRE_TUN_IDX_BIT BIT(3) +#define NFP_TUN_PRE_TUN_IPV6_BIT BIT(7) /** * struct nfp_tun_pre_run_rule - rule matched before decap @@ -1268,6 +1269,7 @@ int nfp_flower_xmit_pre_tun_flow(struct nfp_app *app, { struct nfp_flower_priv *app_priv = app->priv; struct nfp_tun_offloaded_mac *mac_entry; + struct nfp_flower_meta_tci *key_meta; struct nfp_tun_pre_tun_rule payload; struct net_device *internal_dev; int err; @@ -1290,6 +1292,15 @@ int nfp_flower_xmit_pre_tun_flow(struct nfp_app *app, if (!mac_entry) return -ENOENT; + /* Set/clear IPV6 bit. cpu_to_be16() swap will lead to MSB being + * set/clear for port_idx. + */ + key_meta = (struct nfp_flower_meta_tci *)flow->unmasked_data; + if (key_meta->nfp_flow_key_layer & NFP_FLOWER_LAYER_IPV6) + mac_entry->index |= NFP_TUN_PRE_TUN_IPV6_BIT; + else + mac_entry->index &= ~NFP_TUN_PRE_TUN_IPV6_BIT; + payload.port_idx = cpu_to_be16(mac_entry->index); /* Copy mac id and vlan to flow - dev may not exist at delete time. */ From d8ce0275e45ec809a33f98fc080fe7921b720dfb Mon Sep 17 00:00:00 2001 From: Louis Peens Date: Tue, 16 Mar 2021 19:13:10 +0100 Subject: [PATCH 103/171] nfp: flower: fix pre_tun mask id allocation pre_tun_rule flows does not follow the usual add-flow path, instead they are used to update the pre_tun table on the firmware. This means that if the mask-id gets allocated here the firmware will never see the "NFP_FL_META_FLAG_MANAGE_MASK" flag for the specific mask id, which triggers the allocation on the firmware side. This leads to the firmware mask being corrupted and causing all sorts of strange behaviour. Fixes: f12725d98cbe ("nfp: flower: offload pre-tunnel rules") Signed-off-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- .../ethernet/netronome/nfp/flower/metadata.c | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/metadata.c b/drivers/net/ethernet/netronome/nfp/flower/metadata.c index 5defd31d481c..aa06fcb38f8b 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/metadata.c +++ b/drivers/net/ethernet/netronome/nfp/flower/metadata.c @@ -327,8 +327,14 @@ int nfp_compile_flow_metadata(struct nfp_app *app, goto err_free_ctx_entry; } + /* Do net allocate a mask-id for pre_tun_rules. These flows are used to + * configure the pre_tun table and are never actually send to the + * firmware as an add-flow message. This causes the mask-id allocation + * on the firmware to get out of sync if allocated here. + */ new_mask_id = 0; - if (!nfp_check_mask_add(app, nfp_flow->mask_data, + if (!nfp_flow->pre_tun_rule.dev && + !nfp_check_mask_add(app, nfp_flow->mask_data, nfp_flow->meta.mask_len, &nfp_flow->meta.flags, &new_mask_id)) { NL_SET_ERR_MSG_MOD(extack, "invalid entry: cannot allocate a new mask id"); @@ -359,7 +365,8 @@ int nfp_compile_flow_metadata(struct nfp_app *app, goto err_remove_mask; } - if (!nfp_check_mask_remove(app, nfp_flow->mask_data, + if (!nfp_flow->pre_tun_rule.dev && + !nfp_check_mask_remove(app, nfp_flow->mask_data, nfp_flow->meta.mask_len, NULL, &new_mask_id)) { NL_SET_ERR_MSG_MOD(extack, "invalid entry: cannot release mask id"); @@ -374,8 +381,10 @@ int nfp_compile_flow_metadata(struct nfp_app *app, return 0; err_remove_mask: - nfp_check_mask_remove(app, nfp_flow->mask_data, nfp_flow->meta.mask_len, - NULL, &new_mask_id); + if (!nfp_flow->pre_tun_rule.dev) + nfp_check_mask_remove(app, nfp_flow->mask_data, + nfp_flow->meta.mask_len, + NULL, &new_mask_id); err_remove_rhash: WARN_ON_ONCE(rhashtable_remove_fast(&priv->stats_ctx_table, &ctx_entry->ht_node, @@ -406,9 +415,10 @@ int nfp_modify_flow_metadata(struct nfp_app *app, __nfp_modify_flow_metadata(priv, nfp_flow); - nfp_check_mask_remove(app, nfp_flow->mask_data, - nfp_flow->meta.mask_len, &nfp_flow->meta.flags, - &new_mask_id); + if (!nfp_flow->pre_tun_rule.dev) + nfp_check_mask_remove(app, nfp_flow->mask_data, + nfp_flow->meta.mask_len, &nfp_flow->meta.flags, + &new_mask_id); /* Update flow payload with mask ids. */ nfp_flow->unmasked_data[NFP_FL_MASK_ID_LOCATION] = new_mask_id; From fc649670ba50160dd29280c0d91c1635623e88e1 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Tue, 16 Mar 2021 10:13:52 -0700 Subject: [PATCH 104/171] MAINTAINERS: Update Spidernet network driver Change the Spidernet network driver from supported to maintained, add the linuxppc-dev ML, and add myself as a 'maintainer'. Cc: Ishizaki Kou Signed-off-by: Geoff Levand Signed-off-by: David S. Miller --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6acc3b839187..0d80d3dda3d0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16888,8 +16888,10 @@ F: tools/spi/ SPIDERNET NETWORK DRIVER for CELL M: Ishizaki Kou +M: Geoff Levand L: netdev@vger.kernel.org -S: Supported +L: linuxppc-dev@lists.ozlabs.org +S: Maintained F: Documentation/networking/device_drivers/ethernet/toshiba/spider_net.rst F: drivers/net/ethernet/toshiba/spider_net* From 8a141dd7f7060d1e64c14a5257e0babae20ac99b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Mar 2021 12:58:15 -0700 Subject: [PATCH 105/171] ftrace: Fix modify_ftrace_direct. The following sequence of commands: register_ftrace_direct(ip, addr1); modify_ftrace_direct(ip, addr1, addr2); unregister_ftrace_direct(ip, addr2); will cause the kernel to warn: [ 30.179191] WARNING: CPU: 2 PID: 1961 at kernel/trace/ftrace.c:5223 unregister_ftrace_direct+0x130/0x150 [ 30.180556] CPU: 2 PID: 1961 Comm: test_progs W O 5.12.0-rc2-00378-g86bc10a0a711-dirty #3246 [ 30.182453] RIP: 0010:unregister_ftrace_direct+0x130/0x150 When modify_ftrace_direct() changes the addr from old to new it should update the addr stored in ftrace_direct_funcs. Otherwise the final unregister_ftrace_direct() won't find the address and will cause the splat. Fixes: 0567d6809182 ("ftrace: Add modify_ftrace_direct()") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/bpf/20210316195815.34714-1-alexei.starovoitov@gmail.com --- kernel/trace/ftrace.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d8e35575549..b7e29db127fa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5045,6 +5045,20 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) return NULL; } +static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr) +{ + struct ftrace_direct_func *direct; + + direct = kmalloc(sizeof(*direct), GFP_KERNEL); + if (!direct) + return NULL; + direct->addr = addr; + direct->count = 0; + list_add_rcu(&direct->next, &ftrace_direct_funcs); + ftrace_direct_func_count++; + return direct; +} + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5120,15 +5134,11 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) direct = ftrace_find_direct_func(addr); if (!direct) { - direct = kmalloc(sizeof(*direct), GFP_KERNEL); + direct = ftrace_alloc_direct_func(addr); if (!direct) { kfree(entry); goto out_unlock; } - direct->addr = addr; - direct->count = 0; - list_add_rcu(&direct->next, &ftrace_direct_funcs); - ftrace_direct_func_count++; } entry->ip = ip; @@ -5329,6 +5339,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { + struct ftrace_direct_func *direct, *new_direct = NULL; struct ftrace_func_entry *entry; struct dyn_ftrace *rec; int ret = -ENODEV; @@ -5344,6 +5355,20 @@ int modify_ftrace_direct(unsigned long ip, if (entry->direct != old_addr) goto out_unlock; + direct = ftrace_find_direct_func(old_addr); + if (WARN_ON(!direct)) + goto out_unlock; + if (direct->count > 1) { + ret = -ENOMEM; + new_direct = ftrace_alloc_direct_func(new_addr); + if (!new_direct) + goto out_unlock; + direct->count--; + new_direct->count++; + } else { + direct->addr = new_addr; + } + /* * If there's no other ftrace callback on the rec->ip location, * then it can be changed directly by the architecture. @@ -5357,6 +5382,14 @@ int modify_ftrace_direct(unsigned long ip, ret = 0; } + if (unlikely(ret && new_direct)) { + direct->count++; + list_del_rcu(&new_direct->next); + synchronize_rcu_tasks(); + kfree(new_direct); + ftrace_direct_func_count--; + } + out_unlock: mutex_unlock(&ftrace_lock); mutex_unlock(&direct_mutex); From f232326f6966cf2a1d1db7bc917a4ce5f9f55f76 Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 09:47:02 +0100 Subject: [PATCH 106/171] bpf: Prohibit alu ops for pointer types not defining ptr_limit The purpose of this patch is to streamline error propagation and in particular to propagate retrieve_ptr_limit() errors for pointer types that are not defining a ptr_limit such that register-based alu ops against these types can be rejected. The main rationale is that a gap has been identified by Piotr in the existing protection against speculatively out-of-bounds loads, for example, in case of ctx pointers, unprivileged programs can still perform pointer arithmetic. This can be abused to execute speculatively out-of-bounds loads without restrictions and thus extract contents of kernel memory. Fix this by rejecting unprivileged programs that attempt any pointer arithmetic on unprotected pointer types. The two affected ones are pointer to ctx as well as pointer to map. Field access to a modified ctx' pointer is rejected at a later point in time in the verifier, and 7c6967326267 ("bpf: Permit map_ptr arithmetic with opcode add and offset 0") only relevant for root-only use cases. Risk of unprivileged program breakage is considered very low. Fixes: 7c6967326267 ("bpf: Permit map_ptr arithmetic with opcode add and offset 0") Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4192a9e56654..1c8cbef7cc14 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5934,6 +5934,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, u32 alu_state, alu_limit; struct bpf_reg_state tmp; bool ret; + int err; if (can_skip_alu_sanitation(env, insn)) return 0; @@ -5949,10 +5950,13 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; - if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) - return 0; - if (update_alu_sanitation_state(aux, alu_state, alu_limit)) - return -EACCES; + err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg); + if (err < 0) + return err; + + err = update_alu_sanitation_state(aux, alu_state, alu_limit); + if (err < 0) + return err; do_sim: /* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of @@ -6103,7 +6107,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_ADD: ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); if (ret < 0) { - verbose(env, "R%d tried to add from different maps or paths\n", dst); + verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst); return ret; } /* We can take a fixed offset as long as it doesn't overflow @@ -6158,7 +6162,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_SUB: ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); if (ret < 0) { - verbose(env, "R%d tried to sub from different maps or paths\n", dst); + verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst); return ret; } if (dst_reg == off_reg) { From 10d2bb2e6b1d8c4576c56a748f697dbeb8388899 Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 08:20:16 +0100 Subject: [PATCH 107/171] bpf: Fix off-by-one for area size in creating mask to left retrieve_ptr_limit() computes the ptr_limit for registers with stack and map_value type. ptr_limit is the size of the memory area that is still valid / in-bounds from the point of the current position and direction of the operation (add / sub). This size will later be used for masking the operation such that attempting out-of-bounds access in the speculative domain is redirected to remain within the bounds of the current map value. When masking to the right the size is correct, however, when masking to the left, the size is off-by-one which would lead to an incorrect mask and thus incorrect arithmetic operation in the non-speculative domain. Piotr found that if the resulting alu_limit value is zero, then the BPF_MOV32_IMM() from the fixup_bpf_calls() rewrite will end up loading 0xffffffff into AX instead of sign-extending to the full 64 bit range, and as a result, this allows abuse for executing speculatively out-of- bounds loads against 4GB window of address space and thus extracting the contents of kernel memory via side-channel. Fixes: 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1c8cbef7cc14..56c63658b702 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5870,13 +5870,13 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) - *ptr_limit = MAX_BPF_STACK + off; + *ptr_limit = MAX_BPF_STACK + off + 1; else *ptr_limit = -off; return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { - *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + *ptr_limit = ptr_reg->umax_value + ptr_reg->off + 1; } else { off = ptr_reg->smin_value + ptr_reg->off; *ptr_limit = ptr_reg->map_ptr->value_size - off; From b5871dca250cd391885218b99cc015aca1a51aea Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 08:26:25 +0100 Subject: [PATCH 108/171] bpf: Simplify alu_limit masking for pointer arithmetic Instead of having the mov32 with aux->alu_limit - 1 immediate, move this operation to retrieve_ptr_limit() instead to simplify the logic and to allow for subsequent sanity boundary checks inside retrieve_ptr_limit(). This avoids in future that at the time of the verifier masking rewrite we'd run into an underflow which would not sign extend due to the nature of mov32 instruction. Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56c63658b702..d16722a99b61 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5870,16 +5870,16 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) - *ptr_limit = MAX_BPF_STACK + off + 1; + *ptr_limit = MAX_BPF_STACK + off; else - *ptr_limit = -off; + *ptr_limit = -off - 1; return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { - *ptr_limit = ptr_reg->umax_value + ptr_reg->off + 1; + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; } else { off = ptr_reg->smin_value + ptr_reg->off; - *ptr_limit = ptr_reg->map_ptr->value_size - off; + *ptr_limit = ptr_reg->map_ptr->value_size - off - 1; } return 0; default: @@ -11668,7 +11668,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) off_reg = issrc ? insn->src_reg : insn->dst_reg; if (isneg) *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); From d2c21422323b06938b3c070361dc544f047489d7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 16 Mar 2021 17:07:47 -0700 Subject: [PATCH 109/171] ionic: linearize tso skb with too many frags We were linearizing non-TSO skbs that had too many frags, but we weren't checking number of frags on TSO skbs. This could lead to a bad page reference when we received a TSO skb with more frags than the Tx descriptor could support. v2: use gso_segs rather than yet another division don't rework the check on the nr_frags Fixes: 0f3154e6bcb3 ("ionic: Add Tx and Rx handling") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 162a1ff1e9d2..4087311f7082 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -1079,15 +1079,17 @@ static int ionic_tx_descs_needed(struct ionic_queue *q, struct sk_buff *skb) { int sg_elems = q->lif->qtype_info[IONIC_QTYPE_TXQ].max_sg_elems; struct ionic_tx_stats *stats = q_to_tx_stats(q); + int ndescs; int err; - /* If TSO, need roundup(skb->len/mss) descs */ + /* Each desc is mss long max, so a descriptor for each gso_seg */ if (skb_is_gso(skb)) - return (skb->len / skb_shinfo(skb)->gso_size) + 1; + ndescs = skb_shinfo(skb)->gso_segs; + else + ndescs = 1; - /* If non-TSO, just need 1 desc and nr_frags sg elems */ if (skb_shinfo(skb)->nr_frags <= sg_elems) - return 1; + return ndescs; /* Too many frags, so linearize */ err = skb_linearize(skb); @@ -1096,8 +1098,7 @@ static int ionic_tx_descs_needed(struct ionic_queue *q, struct sk_buff *skb) stats->linearize++; - /* Need 1 desc and zero sg elems */ - return 1; + return ndescs; } static int ionic_maybe_stop_tx(struct ionic_queue *q, int ndescs) From afa536d8405a9ca36e45ba035554afbb8da27b82 Mon Sep 17 00:00:00 2001 From: wenxu Date: Wed, 17 Mar 2021 12:02:43 +0800 Subject: [PATCH 110/171] net/sched: cls_flower: fix only mask bit check in the validate_ct_state The ct_state validate should not only check the mask bit and also check mask_bit & key_bit.. For the +new+est case example, The 'new' and 'est' bits should be set in both state_mask and state flags. Or the -new-est case also will be reject by kernel. When Openvswitch with two flows ct_state=+trk+new,action=commit,forward ct_state=+trk+est,action=forward A packet go through the kernel and the contrack state is invalid, The ct_state will be +trk-inv. Upcall to the ovs-vswitchd, the finally dp action will be drop with -new-est+trk. Fixes: 1bcc51ac0731 ("net/sched: cls_flower: Reject invalid ct_state flags rules") Fixes: 3aed8b63336c ("net/sched: cls_flower: validate ct_state for invalid and reply flags") Signed-off-by: wenxu Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d097b5c15faa..c69a4ba9c33f 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1451,7 +1451,7 @@ static int fl_set_key_ct(struct nlattr **tb, &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, sizeof(key->ct_state)); - err = fl_validate_ct_state(mask->ct_state, + err = fl_validate_ct_state(key->ct_state & mask->ct_state, tb[TCA_FLOWER_KEY_CT_STATE_MASK], extack); if (err) From 1b1597e64e1a610c7a96710fc4717158e98a08b3 Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 09:47:02 +0100 Subject: [PATCH 111/171] bpf: Add sanity check for upper ptr_limit Given we know the max possible value of ptr_limit at the time of retrieving the latter, add basic assertions, so that the verifier can bail out if anything looks odd and reject the program. Nothing triggered this so far, but it also does not hurt to have these. Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d16722a99b61..44e4ec1640f1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5861,10 +5861,14 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, { bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || (opcode == BPF_SUB && !off_is_neg); - u32 off; + u32 off, max; switch (ptr_reg->type) { case PTR_TO_STACK: + /* Offset 0 is out-of-bounds, but acceptable start for the + * left direction, see BPF_REG_FP. + */ + max = MAX_BPF_STACK + mask_to_left; /* Indirect variable offset stack access is prohibited in * unprivileged mode so it's not handled here. */ @@ -5873,15 +5877,16 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, *ptr_limit = MAX_BPF_STACK + off; else *ptr_limit = -off - 1; - return 0; + return *ptr_limit >= max ? -ERANGE : 0; case PTR_TO_MAP_VALUE: + max = ptr_reg->map_ptr->value_size; if (mask_to_left) { *ptr_limit = ptr_reg->umax_value + ptr_reg->off; } else { off = ptr_reg->smin_value + ptr_reg->off; *ptr_limit = ptr_reg->map_ptr->value_size - off - 1; } - return 0; + return *ptr_limit >= max ? -ERANGE : 0; default: return -EINVAL; } From 0a13e3537ea67452d549a6a80da3776d6b7dedb3 Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 11:44:42 +0100 Subject: [PATCH 112/171] bpf, selftests: Fix up some test_verifier cases for unprivileged Fix up test_verifier error messages for the case where the original error message changed, or for the case where pointer alu errors differ between privileged and unprivileged tests. Also, add alternative tests for keeping coverage of the original verifier rejection error message (fp alu), and newly reject map_ptr += rX where rX == 0 given we now forbid alu on these types for unprivileged. All test_verifier cases pass after the change. The test case fixups were kept separate to ease backporting of core changes. Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- .../selftests/bpf/verifier/bounds_deduction.c | 27 +++++++++++++------ .../testing/selftests/bpf/verifier/map_ptr.c | 4 +++ tools/testing/selftests/bpf/verifier/unpriv.c | 15 ++++++++++- .../selftests/bpf/verifier/value_ptr_arith.c | 23 +++++++++++++++- 4 files changed, 59 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/verifier/bounds_deduction.c b/tools/testing/selftests/bpf/verifier/bounds_deduction.c index 1fd07a4f27ac..c162498a64fc 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_deduction.c +++ b/tools/testing/selftests/bpf/verifier/bounds_deduction.c @@ -6,8 +6,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 2", @@ -20,6 +21,8 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, }, @@ -31,8 +34,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 4", @@ -45,6 +49,8 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -55,8 +61,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 6", @@ -67,8 +74,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 7", @@ -80,8 +88,9 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", .errstr = "dereference of modified ctx ptr", + .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@ -94,8 +103,9 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .errstr = "dereference of modified ctx ptr", + .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@ -106,8 +116,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 10", @@ -119,6 +130,6 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, .errstr = "math between ctx pointer and register with unbounded min value is not allowed", + .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index b117bdd3806d..6f610cfddae5 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -75,6 +75,8 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_16b = { 4 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .result = ACCEPT, }, { @@ -91,5 +93,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_16b = { 4 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index b018ad71e0a8..3e32400c4b44 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -497,7 +497,7 @@ .result = ACCEPT, }, { - "unpriv: adding of fp", + "unpriv: adding of fp, reg", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_1, 0), @@ -505,6 +505,19 @@ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", + .result_unpriv = REJECT, + .result = ACCEPT, +}, +{ + "unpriv: adding of fp, imm", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), + BPF_EXIT_INSN(), + }, .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", .result_unpriv = REJECT, .result = ACCEPT, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index ed4e76b24649..feb91266db39 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -169,7 +169,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps or paths", + .errstr_unpriv = "R2 tried to add from different maps, paths, or prohibited types", .retval = 0, }, { @@ -516,6 +516,27 @@ .result = ACCEPT, .retval = 0xabcdef12, }, +{ + "map access: value_ptr += N, value_ptr -= N known scalar", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV32_IMM(BPF_REG_1, 0x12345678), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0x12345678, +}, { "map access: unknown scalar += value_ptr, 1", .insns = { From cb038357937ee4f589aab2469ec3896dce90f317 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 16 Mar 2021 15:36:47 -0700 Subject: [PATCH 113/171] net: fix race between napi kthread mode and busy poll Currently, napi_thread_wait() checks for NAPI_STATE_SCHED bit to determine if the kthread owns this napi and could call napi->poll() on it. However, if socket busy poll is enabled, it is possible that the busy poll thread grabs this SCHED bit (after the previous napi->poll() invokes napi_complete_done() and clears SCHED bit) and tries to poll on the same napi. napi_disable() could grab the SCHED bit as well. This patch tries to fix this race by adding a new bit NAPI_STATE_SCHED_THREADED in napi->state. This bit gets set in ____napi_schedule() if the threaded mode is enabled, and gets cleared in napi_complete_done(), and we only poll the napi in kthread if this bit is set. This helps distinguish the ownership of the napi between kthread and other scenarios and fixes the race issue. Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support") Reported-by: Martin Zaharinov Suggested-by: Jakub Kicinski Signed-off-by: Wei Wang Cc: Alexander Duyck Cc: Eric Dumazet Cc: Paolo Abeni Cc: Hannes Frederic Sowa Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5b67ea89d5f2..87a5d186faff 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -360,6 +360,7 @@ enum { NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ + NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ }; enum { @@ -372,6 +373,7 @@ enum { NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), + NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), }; enum gro_result { diff --git a/net/core/dev.c b/net/core/dev.c index a142a207fc1d..bb568f7cb81b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4294,6 +4294,13 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ thread = READ_ONCE(napi->thread); if (thread) { + /* Avoid doing set_bit() if the thread is in + * INTERRUPTIBLE state, cause napi_thread_wait() + * makes sure to proceed with napi polling + * if the thread is explicitly woken from here. + */ + if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE) + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } @@ -6486,6 +6493,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_SCHED_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, @@ -6968,16 +6976,25 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static int napi_thread_wait(struct napi_struct *napi) { + bool woken = false; + set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop() && !napi_disable_pending(napi)) { - if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + /* Testing SCHED_THREADED bit here to make sure the current + * kthread owns this napi and could poll on this napi. + * Testing SCHED bit is not enough because SCHED bit might be + * set by some other busy poll thread or by napi_disable(). + */ + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); + /* woken being true indicates this thread owns this napi. */ + woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); From e21aa341785c679dd409c8cb71f864c00fe6c463 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Mar 2021 14:00:07 -0700 Subject: [PATCH 114/171] bpf: Fix fexit trampoline. The fexit/fmod_ret programs can be attached to kernel functions that can sleep. The synchronize_rcu_tasks() will not wait for such tasks to complete. In such case the trampoline image will be freed and when the task wakes up the return IP will point to freed memory causing the crash. Solve this by adding percpu_ref_get/put for the duration of trampoline and separate trampoline vs its image life times. The "half page" optimization has to be removed, since first_half->second_half->first_half transition cannot be guaranteed to complete in deterministic time. Every trampoline update becomes a new image. The image with fmod_ret or fexit progs will be freed via percpu_ref_kill and call_rcu_tasks. Together they will wait for the original function and trampoline asm to complete. The trampoline is patched from nop to jmp to skip fexit progs. They are freed independently from the trampoline. The image with fentry progs only will be freed via call_rcu_tasks_trace+call_rcu_tasks which will wait for both sleepable and non-sleepable progs to complete. Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Reported-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney # for RCU Link: https://lore.kernel.org/bpf/20210316210007.38949-1-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 26 ++++- include/linux/bpf.h | 24 +++- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 4 +- kernel/bpf/trampoline.c | 218 +++++++++++++++++++++++++++--------- 5 files changed, 213 insertions(+), 61 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 747bba0a584a..72b5a57e9e31 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1936,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1975,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1993,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2003,6 +2011,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + emit_nops(&prog, 5); } if (fmod_ret->nr_progs) { @@ -2033,9 +2043,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. */ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d7e0f479a5b0..3625f019767d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -556,7 +557,8 @@ struct bpf_tramp_progs { * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +struct bpf_tramp_image; +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call); @@ -565,6 +567,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); +void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); +void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); struct bpf_ksym { unsigned long start; @@ -583,6 +587,18 @@ enum bpf_tramp_prog_type { BPF_TRAMP_REPLACE, /* more than MAX */ }; +struct bpf_tramp_image { + void *image; + struct bpf_ksym ksym; + struct percpu_ref pcref; + void *ip_after_call; + void *ip_epilogue; + union { + struct rcu_head rcu; + struct work_struct work; + }; +}; + struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; @@ -605,9 +621,8 @@ struct bpf_trampoline { /* Number of attached programs. A counter per kind. */ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ - void *image; + struct bpf_tramp_image *cur_image; u64 selector; - struct bpf_ksym ksym; }; struct bpf_attach_target_info { @@ -691,6 +706,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); +int bpf_jit_charge_modmem(u32 pages); +void bpf_jit_uncharge_modmem(u32 pages); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) @@ -787,7 +804,6 @@ struct bpf_prog_aux { bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; - enum bpf_tramp_prog_type trampoline_prog_type; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 1a666a975416..70f6fd4fa305 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; - err = arch_prepare_bpf_trampoline(image, + err = arch_prepare_bpf_trampoline(NULL, image, st_map->image + PAGE_SIZE, &st_ops->func_models[i], 0, tprogs, NULL); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3a283bf97f2f..75244ecb2389 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -static int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { @@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages) return 0; } -static void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 pages) { atomic_long_sub(pages, &bpf_jit_current); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7bc3b3209224..1f3a4be4b175 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -57,19 +57,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } -static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) -{ - struct bpf_ksym *ksym = &tr->ksym; - - snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); - bpf_image_ksym_add(tr->image, ksym); -} - static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; - void *image; int i; mutex_lock(&trampoline_mutex); @@ -84,14 +75,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) if (!tr) goto out; - /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec_page(); - if (!image) { - kfree(tr); - tr = NULL; - goto out; - } - tr->key = key; INIT_HLIST_NODE(&tr->hlist); hlist_add_head(&tr->hlist, head); @@ -99,9 +82,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); - tr->image = image; - INIT_LIST_HEAD_RCU(&tr->ksym.lnode); - bpf_trampoline_ksym_add(tr); out: mutex_unlock(&trampoline_mutex); return tr; @@ -185,10 +165,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) return tprogs; } +static void __bpf_tramp_image_put_deferred(struct work_struct *work) +{ + struct bpf_tramp_image *im; + + im = container_of(work, struct bpf_tramp_image, work); + bpf_image_ksym_del(&im->ksym); + bpf_jit_free_exec(im->image); + bpf_jit_uncharge_modmem(1); + percpu_ref_exit(&im->pcref); + kfree_rcu(im, rcu); +} + +/* callback, fexit step 3 or fentry step 2 */ +static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) +{ + struct bpf_tramp_image *im; + + im = container_of(rcu, struct bpf_tramp_image, rcu); + INIT_WORK(&im->work, __bpf_tramp_image_put_deferred); + schedule_work(&im->work); +} + +/* callback, fexit step 2. Called after percpu_ref_kill confirms. */ +static void __bpf_tramp_image_release(struct percpu_ref *pcref) +{ + struct bpf_tramp_image *im; + + im = container_of(pcref, struct bpf_tramp_image, pcref); + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); +} + +/* callback, fexit or fentry step 1 */ +static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu) +{ + struct bpf_tramp_image *im; + + im = container_of(rcu, struct bpf_tramp_image, rcu); + if (im->ip_after_call) + /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */ + percpu_ref_kill(&im->pcref); + else + /* the case of fentry trampoline */ + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); +} + +static void bpf_tramp_image_put(struct bpf_tramp_image *im) +{ + /* The trampoline image that calls original function is using: + * rcu_read_lock_trace to protect sleepable bpf progs + * rcu_read_lock to protect normal bpf progs + * percpu_ref to protect trampoline itself + * rcu tasks to protect trampoline asm not covered by percpu_ref + * (which are few asm insns before __bpf_tramp_enter and + * after __bpf_tramp_exit) + * + * The trampoline is unreachable before bpf_tramp_image_put(). + * + * First, patch the trampoline to avoid calling into fexit progs. + * The progs will be freed even if the original function is still + * executing or sleeping. + * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on + * first few asm instructions to execute and call into + * __bpf_tramp_enter->percpu_ref_get. + * Then use percpu_ref_kill to wait for the trampoline and the original + * function to finish. + * Then use call_rcu_tasks() to make sure few asm insns in + * the trampoline epilogue are done as well. + * + * In !PREEMPT case the task that got interrupted in the first asm + * insns won't go through an RCU quiescent state which the + * percpu_ref_kill will be waiting for. Hence the first + * call_rcu_tasks() is not necessary. + */ + if (im->ip_after_call) { + int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, + NULL, im->ip_epilogue); + WARN_ON(err); + if (IS_ENABLED(CONFIG_PREEMPTION)) + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); + else + percpu_ref_kill(&im->pcref); + return; + } + + /* The trampoline without fexit and fmod_ret progs doesn't call original + * function and doesn't use percpu_ref. + * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. + * Then use call_rcu_tasks() to wait for the rest of trampoline asm + * and normal progs. + */ + call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); +} + +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) +{ + struct bpf_tramp_image *im; + struct bpf_ksym *ksym; + void *image; + int err = -ENOMEM; + + im = kzalloc(sizeof(*im), GFP_KERNEL); + if (!im) + goto out; + + err = bpf_jit_charge_modmem(1); + if (err) + goto out_free_im; + + err = -ENOMEM; + im->image = image = bpf_jit_alloc_exec_page(); + if (!image) + goto out_uncharge; + + err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); + if (err) + goto out_free_image; + + ksym = &im->ksym; + INIT_LIST_HEAD_RCU(&ksym->lnode); + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx); + bpf_image_ksym_add(image, ksym); + return im; + +out_free_image: + bpf_jit_free_exec(im->image); +out_uncharge: + bpf_jit_uncharge_modmem(1); +out_free_im: + kfree(im); +out: + return ERR_PTR(err); +} + static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_tramp_image *im; struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; int err, total; @@ -198,41 +310,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) return PTR_ERR(tprogs); if (total == 0) { - err = unregister_fentry(tr, old_image); + err = unregister_fentry(tr, tr->cur_image->image); + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = NULL; tr->selector = 0; goto out; } + im = bpf_tramp_image_alloc(tr->key, tr->selector); + if (IS_ERR(im)) { + err = PTR_ERR(im); + goto out; + } + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - /* Though the second half of trampoline page is unused a task could be - * preempted in the middle of the first half of trampoline and two - * updates to trampoline would change the code from underneath the - * preempted task. Hence wait for tasks to voluntarily schedule or go - * to userspace. - * The same trampoline can hold both sleepable and non-sleepable progs. - * synchronize_rcu_tasks_trace() is needed to make sure all sleepable - * programs finish executing. - * Wait for these two grace periods together. - */ - synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); - - err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) goto out; - if (tr->selector) + WARN_ON(tr->cur_image && tr->selector == 0); + WARN_ON(!tr->cur_image && tr->selector); + if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, old_image, new_image); + err = modify_fentry(tr, tr->cur_image->image, im->image); else /* first time registering */ - err = register_fentry(tr, new_image); + err = register_fentry(tr, im->image); if (err) goto out; + if (tr->cur_image) + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = im; tr->selector++; out: kfree(tprogs); @@ -364,17 +477,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; - bpf_image_ksym_del(&tr->ksym); - /* This code will be executed when all bpf progs (both sleepable and - * non-sleepable) went through - * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). - * Hence no need for another synchronize_rcu_tasks_trace() here, - * but synchronize_rcu_tasks() is still needed, since trampoline - * may not have had any sleepable programs and we need to wait - * for tasks to get out of trampoline code before freeing it. + /* This code will be executed even when the last bpf_tramp_image + * is alive. All progs are detached from the trampoline and the + * trampoline image is patched with jmp into epilogue to skip + * fexit progs. The fentry-only trampoline will be freed via + * multiple rcu callbacks. */ - synchronize_rcu_tasks(); - bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); out: @@ -478,8 +586,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) rcu_read_unlock_trace(); } +void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) +{ + percpu_ref_get(&tr->pcref); +} + +void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) +{ + percpu_ref_put(&tr->pcref); +} + int __weak -arch_prepare_bpf_trampoline(void *image, void *image_end, +arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) From 8b2030b4305951f44afef80225f1475618e25a73 Mon Sep 17 00:00:00 2001 From: Ludovic Senecaux Date: Thu, 4 Mar 2021 04:10:50 -0500 Subject: [PATCH 115/171] netfilter: conntrack: Fix gre tunneling over ipv6 This fix permits gre connections to be tracked within ip6tables rules Signed-off-by: Ludovic Senecaux Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_gre.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 5b05487a60d2..db11e403d818 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -218,9 +218,6 @@ int nf_conntrack_gre_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { - if (state->pf != NFPROTO_IPV4) - return -NF_ACCEPT; - if (!nf_ct_is_confirmed(ct)) { unsigned int *timeouts = nf_ct_timeout_lookup(ct); From 7e6136f1b7272b2202817cff37ada355eb5e6784 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Mar 2021 11:31:55 +0100 Subject: [PATCH 116/171] netfilter: nftables: report EOPNOTSUPP on unsupported flowtable flags Error was not set accordingly. Fixes: 8bb69f3b2918 ("netfilter: nf_tables: add flowtable offload control plane") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 224c8e537cb3..0d034f895b7b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6963,8 +6963,10 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (nla[NFTA_FLOWTABLE_FLAGS]) { flowtable->data.flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); - if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) + if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) { + err = -EOPNOTSUPP; goto err3; + } } write_pnet(&flowtable->data.net, net); From 7b35582cd04ace2fd1807c1b624934e465cc939d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Mar 2021 12:54:57 +0100 Subject: [PATCH 117/171] netfilter: nftables: allow to update flowtable flags Honor flowtable flags from the control update path. Disallow disabling to toggle hardware offload support though. Fixes: 8bb69f3b2918 ("netfilter: nf_tables: add flowtable offload control plane") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index fdec57d862b7..5aaced6bf13e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1536,6 +1536,7 @@ struct nft_trans_flowtable { struct nft_flowtable *flowtable; bool update; struct list_head hook_list; + u32 flags; }; #define nft_trans_flowtable(trans) \ @@ -1544,6 +1545,8 @@ struct nft_trans_flowtable { (((struct nft_trans_flowtable *)trans->data)->update) #define nft_trans_flowtable_hooks(trans) \ (((struct nft_trans_flowtable *)trans->data)->hook_list) +#define nft_trans_flowtable_flags(trans) \ + (((struct nft_trans_flowtable *)trans->data)->flags) int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0d034f895b7b..4fcd07f1e925 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6842,6 +6842,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, struct nft_hook *hook, *next; struct nft_trans *trans; bool unregister = false; + u32 flags; int err; err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], @@ -6856,6 +6857,17 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, } } + if (nla[NFTA_FLOWTABLE_FLAGS]) { + flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); + if (flags & ~NFT_FLOWTABLE_MASK) + return -EOPNOTSUPP; + if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^ + (flags & NFT_FLOWTABLE_HW_OFFLOAD)) + return -EOPNOTSUPP; + } else { + flags = flowtable->data.flags; + } + err = nft_register_flowtable_net_hooks(ctx->net, ctx->table, &flowtable_hook.list, flowtable); if (err < 0) @@ -6869,6 +6881,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, goto err_flowtable_update_hook; } + nft_trans_flowtable_flags(trans) = flags; nft_trans_flowtable(trans) = flowtable; nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); @@ -8178,6 +8191,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { + nft_trans_flowtable(trans)->data.flags = + nft_trans_flowtable_flags(trans); nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), From 740b486a8d1f966e68ac0666f1fd57441a7cda94 Mon Sep 17 00:00:00 2001 From: Yinjun Zhang Date: Wed, 17 Mar 2021 13:42:24 +0100 Subject: [PATCH 118/171] netfilter: flowtable: Make sure GC works periodically in idle system Currently flowtable's GC work is initialized as deferrable, which means GC cannot work on time when system is idle. So the hardware offloaded flow may be deleted for timeout, since its used time is not timely updated. Resolve it by initializing the GC work as delayed work instead of deferrable. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Yinjun Zhang Signed-off-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 5fa657b8e03d..c77ba8690ed8 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -506,7 +506,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) { int err; - INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); init_rwsem(&flowtable->flow_block_lock); From 8f3f5792f2940c16ab63c614b26494c8689c9c1e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 17 Mar 2021 23:54:14 +0900 Subject: [PATCH 119/171] libbpf: Fix error path in bpf_object__elf_init() When it failed to get section names, it should call into bpf_object__elf_finish() like others. Fixes: 88a82120282b ("libbpf: Factor out common ELF operations and improve logging") Signed-off-by: Namhyung Kim Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210317145414.884817-1-namhyung@kernel.org --- tools/lib/bpf/libbpf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d43cc3f29dae..4181d178ee7b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1181,7 +1181,8 @@ static int bpf_object__elf_init(struct bpf_object *obj) if (!elf_rawdata(elf_getscn(obj->efile.elf, obj->efile.shstrndx), NULL)) { pr_warn("elf: failed to get section names strings from %s: %s\n", obj->path, elf_errmsg(-1)); - return -LIBBPF_ERRNO__FORMAT; + err = -LIBBPF_ERRNO__FORMAT; + goto errout; } /* Old LLVM set e_machine to EM_NONE */ From 58bfd95b554f1a23d01228672f86bb489bdbf4ba Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 17 Mar 2021 17:28:58 +0530 Subject: [PATCH 120/171] libbpf: Use SOCK_CLOEXEC when opening the netlink socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise, there exists a small window between the opening and closing of the socket fd where it may leak into processes launched by some other thread. Fixes: 949abbe88436 ("libbpf: add function to setup XDP") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210317115857.6536-1-memxor@gmail.com --- tools/lib/bpf/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 4dd73de00b6f..d2cb28e9ef52 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -40,7 +40,7 @@ static int libbpf_netlink_open(__u32 *nl_pid) memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); if (sock < 0) return -errno; From 86fe2c19eec4728fd9a42ba18f3b47f0d5f9fd7c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Mar 2021 21:19:57 +0100 Subject: [PATCH 121/171] netfilter: nftables: skip hook overlap logic if flowtable is stale If the flowtable has been previously removed in this batch, skip the hook overlap checks. This fixes spurious EEXIST errors when removing and adding the flowtable in the same batch. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4fcd07f1e925..f57f1a6ba96f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6783,6 +6783,9 @@ static int nft_register_flowtable_net_hooks(struct net *net, list_for_each_entry(hook, hook_list, list) { list_for_each_entry(ft, &table->flowtables, list) { + if (!nft_is_active_next(net, ft)) + continue; + list_for_each_entry(hook2, &ft->hook_list, list) { if (hook->ops.dev == hook2->ops.dev && hook->ops.pf == hook2->ops.pf) { From eddbe8e6521401003e37e7848ef72e75c10ee2aa Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Mar 2021 17:45:23 -0700 Subject: [PATCH 122/171] selftest/bpf: Add a test to check trampoline freeing logic. Add a selftest for commit e21aa341785c ("bpf: Fix fexit trampoline.") to make sure that attaching fexit prog to a sleeping kernel function will trigger appropriate trampoline and program destruction. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210318004523.55908-1-alexei.starovoitov@gmail.com --- .../selftests/bpf/prog_tests/fexit_sleep.c | 82 +++++++++++++++++++ .../testing/selftests/bpf/progs/fexit_sleep.c | 31 +++++++ 2 files changed, 113 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/fexit_sleep.c create mode 100644 tools/testing/selftests/bpf/progs/fexit_sleep.c diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c new file mode 100644 index 000000000000..6c4d42a2386f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "fexit_sleep.skel.h" + +static int do_sleep(void *skel) +{ + struct fexit_sleep *fexit_skel = skel; + struct timespec ts1 = { .tv_nsec = 1 }; + struct timespec ts2 = { .tv_sec = 10 }; + + fexit_skel->bss->pid = getpid(); + (void)syscall(__NR_nanosleep, &ts1, NULL); + (void)syscall(__NR_nanosleep, &ts2, NULL); + return 0; +} + +#define STACK_SIZE (1024 * 1024) +static char child_stack[STACK_SIZE]; + +void test_fexit_sleep(void) +{ + struct fexit_sleep *fexit_skel = NULL; + int wstatus, duration = 0; + pid_t cpid; + int err, fexit_cnt; + + fexit_skel = fexit_sleep__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) + goto cleanup; + + err = fexit_sleep__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) + goto cleanup; + + cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); + if (CHECK(cpid == -1, "clone", strerror(errno))) + goto cleanup; + + /* wait until first sys_nanosleep ends and second sys_nanosleep starts */ + while (READ_ONCE(fexit_skel->bss->fentry_cnt) != 2); + fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt); + if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt)) + goto cleanup; + + /* close progs and detach them. That will trigger two nop5->jmp5 rewrites + * in the trampolines to skip nanosleep_fexit prog. + * The nanosleep_fentry prog will get detached first. + * The nanosleep_fexit prog will get detached second. + * Detaching will trigger freeing of both progs JITed images. + * There will be two dying bpf_tramp_image-s, but only the initial + * bpf_tramp_image (with both _fentry and _fexit progs will be stuck + * waiting for percpu_ref_kill to confirm). The other one + * will be freed quickly. + */ + close(bpf_program__fd(fexit_skel->progs.nanosleep_fentry)); + close(bpf_program__fd(fexit_skel->progs.nanosleep_fexit)); + fexit_sleep__detach(fexit_skel); + + /* kill the thread to unwind sys_nanosleep stack through the trampoline */ + kill(cpid, 9); + + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + goto cleanup; + if (CHECK(WEXITSTATUS(wstatus) != 0, "exitstatus", "failed")) + goto cleanup; + + /* The bypassed nanosleep_fexit prog shouldn't have executed. + * Unlike progs the maps were not freed and directly accessible. + */ + fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt); + if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt)) + goto cleanup; + +cleanup: + fexit_sleep__destroy(fexit_skel); +} diff --git a/tools/testing/selftests/bpf/progs/fexit_sleep.c b/tools/testing/selftests/bpf/progs/fexit_sleep.c new file mode 100644 index 000000000000..03a672d76353 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fexit_sleep.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include +#include + +char LICENSE[] SEC("license") = "GPL"; + +int pid = 0; +int fentry_cnt = 0; +int fexit_cnt = 0; + +SEC("fentry/__x64_sys_nanosleep") +int BPF_PROG(nanosleep_fentry, const struct pt_regs *regs) +{ + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + + fentry_cnt++; + return 0; +} + +SEC("fexit/__x64_sys_nanosleep") +int BPF_PROG(nanosleep_fexit, const struct pt_regs *regs, int ret) +{ + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + + fexit_cnt++; + return 0; +} From dcc32f4f183ab8479041b23a1525d48233df1d43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Mar 2021 09:55:15 -0700 Subject: [PATCH 123/171] ipv6: weaken the v4mapped source check This reverts commit 6af1799aaf3f1bc8defedddfa00df3192445bbf3. Commit 6af1799aaf3f ("ipv6: drop incoming packets having a v4mapped source address") introduced an input check against v4mapped addresses. Use of such addresses on the wire is indeed questionable and not allowed on public Internet. As the commit pointed out https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02 lists potential issues. Unfortunately there are applications which use v4mapped addresses, and breaking them is a clear regression. For example v4mapped addresses (or any semi-valid addresses, really) may be used for uni-direction event streams or packet export. Since the issue which sparked the addition of the check was with TCP and request_socks in particular push the check down to TCPv6 and DCCP. This restores the ability to receive UDPv6 packets with v4mapped address as the source. Keep using the IPSTATS_MIB_INHDRERRORS statistic to minimize the user-visible changes. Fixes: 6af1799aaf3f ("ipv6: drop incoming packets having a v4mapped source address") Reported-by: Sunyi Shao Signed-off-by: Jakub Kicinski Acked-by: Mat Martineau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 5 +++++ net/ipv6/ip6_input.c | 10 ---------- net/ipv6/tcp_ipv6.c | 5 +++++ net/mptcp/subflow.c | 5 +++++ 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1f73603913f5..2be5c69824f9 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -319,6 +319,11 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) return 0; /* discard, don't send a reset here */ + if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { + __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); + return 0; + } + if (dccp_bad_service_code(sk, service)) { dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; goto drop; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index e9d2a4a409aa..80256717868e 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -245,16 +245,6 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, if (ipv6_addr_is_multicast(&hdr->saddr)) goto err; - /* While RFC4291 is not explicit about v4mapped addresses - * in IPv6 headers, it seems clear linux dual-stack - * model can not deal properly with these. - * Security models could be fooled by ::ffff:127.0.0.1 for example. - * - * https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02 - */ - if (ipv6_addr_v4mapped(&hdr->saddr)) - goto err; - skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bd44ded7e50c..d0f007741e8e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1175,6 +1175,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; + if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { + __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); + return 0; + } + return tcp_conn_request(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, skb); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3d47d670e665..d17d39ccdf34 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -477,6 +477,11 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; + if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { + __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); + return 0; + } + return tcp_conn_request(&mptcp_subflow_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); From 804741ac7b9f2fdebe3740cb0579cb8d94d49e60 Mon Sep 17 00:00:00 2001 From: Mian Yousaf Kaukab Date: Thu, 18 Mar 2021 09:50:26 +0100 Subject: [PATCH 124/171] netsec: restore phy power state after controller reset Since commit 8e850f25b581 ("net: socionext: Stop PHY before resetting netsec") netsec_netdev_init() power downs phy before resetting the controller. However, the state is not restored once the reset is complete. As a result it is not possible to bring up network on a platform with Broadcom BCM5482 phy. Fix the issue by restoring phy power state after controller reset is complete. Fixes: 8e850f25b581 ("net: socionext: Stop PHY before resetting netsec") Cc: stable@vger.kernel.org Signed-off-by: Mian Yousaf Kaukab Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/netsec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 3c53051bdacf..200785e703c8 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1715,14 +1715,17 @@ static int netsec_netdev_init(struct net_device *ndev) goto err1; /* set phy power down */ - data = netsec_phy_read(priv->mii_bus, priv->phy_addr, MII_BMCR) | - BMCR_PDOWN; - netsec_phy_write(priv->mii_bus, priv->phy_addr, MII_BMCR, data); + data = netsec_phy_read(priv->mii_bus, priv->phy_addr, MII_BMCR); + netsec_phy_write(priv->mii_bus, priv->phy_addr, MII_BMCR, + data | BMCR_PDOWN); ret = netsec_reset_hardware(priv, true); if (ret) goto err2; + /* Restore phy power state */ + netsec_phy_write(priv->mii_bus, priv->phy_addr, MII_BMCR, data); + spin_lock_init(&priv->desc_ring[NETSEC_RING_TX].lock); spin_lock_init(&priv->desc_ring[NETSEC_RING_RX].lock); From f41b2d67d767f34bcd29fab83efaddb7f1e54579 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Thu, 18 Mar 2021 19:45:42 +0530 Subject: [PATCH 125/171] octeontx2-pf: Do not modify number of rules In the ETHTOOL_GRXCLSRLALL ioctl ethtool uses below structure to read number of rules from the driver. struct ethtool_rxnfc { __u32 cmd; __u32 flow_type; __u64 data; struct ethtool_rx_flow_spec fs; union { __u32 rule_cnt; __u32 rss_context; }; __u32 rule_locs[0]; }; Driver must not modify rule_cnt member. But currently driver modifies it by modifying rss_context. Hence fix it by using a local variable. Fixes: 81a4362016e7 ("octeontx2-pf: Add RSS multi group support") Signed-off-by: Subbaraya Sundeep Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c index 0dbbf38e0597..dc1778420978 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c @@ -257,17 +257,19 @@ int otx2_get_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc, int otx2_get_all_flows(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc, u32 *rule_locs) { + u32 rule_cnt = nfc->rule_cnt; u32 location = 0; int idx = 0; int err = 0; nfc->data = pfvf->flow_cfg->ntuple_max_flows; - while ((!err || err == -ENOENT) && idx < nfc->rule_cnt) { + while ((!err || err == -ENOENT) && idx < rule_cnt) { err = otx2_get_flow(pfvf, nfc, location); if (!err) rule_locs[idx++] = location; location++; } + nfc->rule_cnt = rule_cnt; return err; } From f7884097141b615b6ce89c16f456a53902b4eec3 Mon Sep 17 00:00:00 2001 From: Rakesh Babu Date: Thu, 18 Mar 2021 19:45:43 +0530 Subject: [PATCH 126/171] octeontx2-af: Formatting debugfs entry rsrc_alloc. With the existing rsrc_alloc's format, there is misalignment for the pcifunc entries whose VF's index is a double digit. This patch fixes this. pcifunc NPA NIX0 NIX1 SSO GROUP SSOWS TIM CPT0 CPT1 REE0 REE1 PF0:VF0 8 5 PF0:VF1 9 3 PF0:VF10 18 10 PF0:VF11 19 8 PF0:VF12 20 11 PF0:VF13 21 9 PF0:VF14 22 12 PF0:VF15 23 10 PF1 0 0 Fixes: 23205e6d06d4 ("octeontx2-af: Dump current resource provisioning status") Signed-off-by: Rakesh Babu Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- .../marvell/octeontx2/af/rvu_debugfs.c | 46 ++++++++++++------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index aa2ca8780b9c..dc946953af06 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -234,12 +234,14 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { - int index, off = 0, flag = 0, go_back = 0, off_prev; + int index, off = 0, flag = 0, go_back = 0, len = 0; struct rvu *rvu = filp->private_data; int lf, pf, vf, pcifunc; struct rvu_block block; int bytes_not_copied; + int lf_str_size = 12; int buf_size = 2048; + char *lfs; char *buf; /* don't allow partial reads */ @@ -249,12 +251,18 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) return -ENOSPC; - off += scnprintf(&buf[off], buf_size - 1 - off, "\npcifunc\t\t"); + + lfs = kzalloc(lf_str_size, GFP_KERNEL); + if (!lfs) + return -ENOMEM; + off += scnprintf(&buf[off], buf_size - 1 - off, "%-*s", lf_str_size, + "pcifunc"); for (index = 0; index < BLK_COUNT; index++) - if (strlen(rvu->hw->block[index].name)) - off += scnprintf(&buf[off], buf_size - 1 - off, - "%*s\t", (index - 1) * 2, - rvu->hw->block[index].name); + if (strlen(rvu->hw->block[index].name)) { + off += scnprintf(&buf[off], buf_size - 1 - off, + "%-*s", lf_str_size, + rvu->hw->block[index].name); + } off += scnprintf(&buf[off], buf_size - 1 - off, "\n"); for (pf = 0; pf < rvu->hw->total_pfs; pf++) { for (vf = 0; vf <= rvu->hw->total_vfs; vf++) { @@ -263,14 +271,15 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, continue; if (vf) { + sprintf(lfs, "PF%d:VF%d", pf, vf - 1); go_back = scnprintf(&buf[off], buf_size - 1 - off, - "PF%d:VF%d\t\t", pf, - vf - 1); + "%-*s", lf_str_size, lfs); } else { + sprintf(lfs, "PF%d", pf); go_back = scnprintf(&buf[off], buf_size - 1 - off, - "PF%d\t\t", pf); + "%-*s", lf_str_size, lfs); } off += go_back; @@ -278,20 +287,22 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, block = rvu->hw->block[index]; if (!strlen(block.name)) continue; - off_prev = off; + len = 0; + lfs[len] = '\0'; for (lf = 0; lf < block.lf.max; lf++) { if (block.fn_map[lf] != pcifunc) continue; flag = 1; - off += scnprintf(&buf[off], buf_size - 1 - - off, "%3d,", lf); + len += sprintf(&lfs[len], "%d,", lf); } - if (flag && off_prev != off) - off--; - else - go_back++; + + if (flag) + len--; + lfs[len] = '\0'; off += scnprintf(&buf[off], buf_size - 1 - off, - "\t"); + "%-*s", lf_str_size, lfs); + if (!strlen(lfs)) + go_back += lf_str_size; } if (!flag) off -= go_back; @@ -303,6 +314,7 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, } bytes_not_copied = copy_to_user(buffer, buf, off); + kfree(lfs); kfree(buf); if (bytes_not_copied) From ce86c2a531e2f2995ee55ea527c1f39ba1d95f73 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Thu, 18 Mar 2021 19:45:44 +0530 Subject: [PATCH 127/171] octeontx2-af: Remove TOS field from MKEX TX The MKEX profile describes what packet fields need to be extracted from the input packet and how to place those packet fields in the output key for MCAM matching. The MKEX profile can be in a way where higher layer packet fields can overwrite lower layer packet fields in output MCAM Key. Hence MKEX profile is always ensured that there are no overlaps between any of the layers. But the commit 42006910b5ea ("octeontx2-af: cleanup KPU config data") introduced TX TOS field which overlaps with DMAC in MCAM key. This led to AF driver returning error when TX rule is installed with DMAC as match criteria since DMAC gets overwritten and cannot be supported. This patch fixes the issue by removing TOS field from MKEX TX profile. Fixes: 42006910b5ea ("octeontx2-af: cleanup KPU config data") Signed-off-by: Subbaraya Sundeep Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h index b192692b4fc4..5c372d2c24a1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h @@ -13499,8 +13499,6 @@ static struct npc_mcam_kex npc_mkex_default = { [NPC_LT_LC_IP] = { /* SIP+DIP: 8 bytes, KW2[63:0] */ KEX_LD_CFG(0x07, 0xc, 0x1, 0x0, 0x10), - /* TOS: 1 byte, KW1[63:56] */ - KEX_LD_CFG(0x0, 0x1, 0x1, 0x0, 0xf), }, /* Layer C: IPv6 */ [NPC_LT_LC_IP6] = { From 297887872973555cb9fb83fdd5a2748d6cd8fc1d Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Thu, 18 Mar 2021 19:45:45 +0530 Subject: [PATCH 128/171] octeontx2-af: Return correct CGX RX fifo size CGX receive buffer size is a constant value and cannot be read from CGX0 block always since CGX0 may not enabled everytime. Hence return CGX receive buffer size from first enabled CGX block instead of CGX0. Fixes: 6e54e1c5399a ("octeontx2-af: cn10K: MTU configuration") Signed-off-by: Subbaraya Sundeep Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- .../net/ethernet/marvell/octeontx2/af/rvu.h | 1 + .../ethernet/marvell/octeontx2/af/rvu_cgx.c | 18 ++++++++++++++++-- .../marvell/octeontx2/af/rvu_debugfs.c | 9 +++++---- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index fa6e46e36ae4..76f399229ddb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -678,6 +678,7 @@ void npc_read_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, u8 *intf, u8 *ena); bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature); u32 rvu_cgx_get_fifolen(struct rvu *rvu); +void *rvu_first_cgx_pdata(struct rvu *rvu); /* CPT APIs */ int rvu_cpt_lf_teardown(struct rvu *rvu, u16 pcifunc, int lf, int slot); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index e668e482383a..6e2bf4fcd29c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -89,6 +89,21 @@ void *rvu_cgx_pdata(u8 cgx_id, struct rvu *rvu) return rvu->cgx_idmap[cgx_id]; } +/* Return first enabled CGX instance if none are enabled then return NULL */ +void *rvu_first_cgx_pdata(struct rvu *rvu) +{ + int first_enabled_cgx = 0; + void *cgxd = NULL; + + for (; first_enabled_cgx < rvu->cgx_cnt_max; first_enabled_cgx++) { + cgxd = rvu_cgx_pdata(first_enabled_cgx, rvu); + if (cgxd) + break; + } + + return cgxd; +} + /* Based on P2X connectivity find mapped NIX block for a PF */ static void rvu_map_cgx_nix_block(struct rvu *rvu, int pf, int cgx_id, int lmac_id) @@ -711,10 +726,9 @@ int rvu_mbox_handler_cgx_features_get(struct rvu *rvu, u32 rvu_cgx_get_fifolen(struct rvu *rvu) { struct mac_ops *mac_ops; - int rvu_def_cgx_id = 0; u32 fifo_len; - mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu)); + mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu)); fifo_len = mac_ops ? mac_ops->fifo_len : 0; return fifo_len; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index dc946953af06..b4c53b19f535 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -331,7 +331,6 @@ static int rvu_dbg_rvu_pf_cgx_map_display(struct seq_file *filp, void *unused) struct rvu *rvu = filp->private; struct pci_dev *pdev = NULL; struct mac_ops *mac_ops; - int rvu_def_cgx_id = 0; char cgx[10], lmac[10]; struct rvu_pfvf *pfvf; int pf, domain, blkid; @@ -339,7 +338,10 @@ static int rvu_dbg_rvu_pf_cgx_map_display(struct seq_file *filp, void *unused) u16 pcifunc; domain = 2; - mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu)); + mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu)); + /* There can be no CGX devices at all */ + if (!mac_ops) + return 0; seq_printf(filp, "PCI dev\t\tRVU PF Func\tNIX block\t%s\tLMAC\n", mac_ops->name); for (pf = 0; pf < rvu->hw->total_pfs; pf++) { @@ -1830,7 +1832,6 @@ static void rvu_dbg_cgx_init(struct rvu *rvu) { struct mac_ops *mac_ops; unsigned long lmac_bmap; - int rvu_def_cgx_id = 0; int i, lmac_id; char dname[20]; void *cgx; @@ -1838,7 +1839,7 @@ static void rvu_dbg_cgx_init(struct rvu *rvu) if (!cgx_get_cgxcnt_max()) return; - mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu)); + mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu)); if (!mac_ops) return; From ae2619dd4fccdad9876aa5f900bd85484179c50f Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Thu, 18 Mar 2021 19:45:46 +0530 Subject: [PATCH 129/171] octeontx2-af: Fix irq free in rvu teardown Current devlink code try to free already freed irqs as the irq_allocate flag is not cleared after free leading to kernel crash while removing rvu driver. The patch fixes the irq free sequence and clears the irq_allocate flag on free. Fixes: 7304ac4567bc ("octeontx2-af: Add mailbox IRQ and msg handlers") Signed-off-by: Geetha sowjanya Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index d9a1a71c7ccc..ab24a5e8ee8a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -2462,8 +2462,10 @@ static void rvu_unregister_interrupts(struct rvu *rvu) INTR_MASK(rvu->hw->total_pfs) & ~1ULL); for (irq = 0; irq < rvu->num_vec; irq++) { - if (rvu->irq_allocated[irq]) + if (rvu->irq_allocated[irq]) { free_irq(pci_irq_vector(rvu->pdev, irq), rvu); + rvu->irq_allocated[irq] = false; + } } pci_free_irq_vectors(rvu->pdev); @@ -2975,8 +2977,8 @@ static void rvu_remove(struct pci_dev *pdev) struct rvu *rvu = pci_get_drvdata(pdev); rvu_dbg_exit(rvu); - rvu_unregister_interrupts(rvu); rvu_unregister_dl(rvu); + rvu_unregister_interrupts(rvu); rvu_flr_wq_destroy(rvu); rvu_cgx_exit(rvu); rvu_fwdata_exit(rvu); From f12098ce9b43e1a6fcaa524acbd90f9118a74c0a Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Thu, 18 Mar 2021 19:45:47 +0530 Subject: [PATCH 130/171] octeontx2-pf: Clear RSS enable flag on interace down RSS configuration can not be get/set when interface is in down state as they required mbox communication. RSS enable flag status is used for set/get configuration. Current code do not clear the RSS enable flag on interface down which lead to mbox error while trying to set/get RSS configuration. Fixes: 85069e95e531 ("octeontx2-pf: Receive side scaling support") Signed-off-by: Geetha sowjanya Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 53ab1814d74b..2fd3d235d292 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1672,6 +1672,7 @@ int otx2_stop(struct net_device *netdev) struct otx2_nic *pf = netdev_priv(netdev); struct otx2_cq_poll *cq_poll = NULL; struct otx2_qset *qset = &pf->qset; + struct otx2_rss_info *rss; int qidx, vec, wrk; netif_carrier_off(netdev); @@ -1684,6 +1685,10 @@ int otx2_stop(struct net_device *netdev) /* First stop packet Rx/Tx */ otx2_rxtx_enable(pf, false); + /* Clear RSS enable flag */ + rss = &pf->hw.rss_info; + rss->enable = false; + /* Cleanup Queue IRQ */ vec = pci_irq_vector(pf->pdev, pf->hw.nix_msixoff + NIX_LF_QINT_VEC_START); From 64451b98306bf1334a62bcd020ec92bdb4cb68db Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 18 Mar 2021 19:45:48 +0530 Subject: [PATCH 131/171] octeontx2-af: fix infinite loop in unmapping NPC counter unmapping npc counter works in a way by traversing all mcam entries to find which mcam rule is associated with counter. But loop cursor variable 'entry' is not incremented before checking next mcam entry which resulting in infinite loop. This in turn hogs the kworker thread forever and no other mbox message is processed by AF driver after that. Fix this by updating entry value before checking next mcam entry. Fixes: a958dd59f9ce ("octeontx2-af: Map or unmap NPC MCAM entry and counter") Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 04bb0803a5c5..0bd49c7080a6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -2490,10 +2490,10 @@ int rvu_mbox_handler_npc_mcam_free_counter(struct rvu *rvu, index = find_next_bit(mcam->bmap, mcam->bmap_entries, entry); if (index >= mcam->bmap_entries) break; + entry = index + 1; if (mcam->entry2cntr_map[index] != req->cntr) continue; - entry = index + 1; npc_unmap_mcam_entry_and_cntr(rvu, mcam, blkaddr, index, req->cntr); } From 8c16cb0304cd582e83584b81813a3404e9c7db47 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Thu, 18 Mar 2021 19:45:49 +0530 Subject: [PATCH 132/171] octeontx2-af: Fix uninitialized variable warning Initialize l4_key_offset variable to fix uninitialized variable compiler warning. Fixes: b9b7421a01d8 ("octeontx2-af: Support ESP/AH RSS hashing") Signed-off-by: Subbaraya Sundeep Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index d3000194e2d3..3d068b7d46bd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -2629,7 +2629,7 @@ static int set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg) struct nix_rx_flowkey_alg *field; struct nix_rx_flowkey_alg tmp; u32 key_type, valid_key; - int l4_key_offset; + int l4_key_offset = 0; if (!alg) return -EINVAL; From 600cc3c9c62defd920da07bc585eb739247bb732 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Mar 2021 17:25:08 +0000 Subject: [PATCH 133/171] net: marvell: Remove reference to CONFIG_MV64X60 Commit 92c8c16f3457 ("powerpc/embedded6xx: Remove C2K board support") removed last selector of CONFIG_MV64X60. As it is not a user selectable config item, all references to it are stale. Remove them. Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/Kconfig | 4 ++-- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig index 7fe15a3286f4..fe0989c0fc25 100644 --- a/drivers/net/ethernet/marvell/Kconfig +++ b/drivers/net/ethernet/marvell/Kconfig @@ -6,7 +6,7 @@ config NET_VENDOR_MARVELL bool "Marvell devices" default y - depends on PCI || CPU_PXA168 || MV64X60 || PPC32 || PLAT_ORION || INET || COMPILE_TEST + depends on PCI || CPU_PXA168 || PPC32 || PLAT_ORION || INET || COMPILE_TEST help If you have a network (Ethernet) card belonging to this class, say Y. @@ -19,7 +19,7 @@ if NET_VENDOR_MARVELL config MV643XX_ETH tristate "Marvell Discovery (643XX) and Orion ethernet support" - depends on MV64X60 || PPC32 || PLAT_ORION || COMPILE_TEST + depends on PPC32 || PLAT_ORION || COMPILE_TEST depends on INET select PHYLIB select MVMDIO diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 90e6111ce534..3bfb659b5c99 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2684,7 +2684,7 @@ static const struct of_device_id mv643xx_eth_shared_ids[] = { MODULE_DEVICE_TABLE(of, mv643xx_eth_shared_ids); #endif -#if defined(CONFIG_OF_IRQ) && !defined(CONFIG_MV64X60) +#ifdef CONFIG_OF_IRQ #define mv643xx_eth_property(_np, _name, _v) \ do { \ u32 tmp; \ From 6c015a2256801597fadcbc11d287774c9c512fa5 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 18 Mar 2021 04:42:53 +0100 Subject: [PATCH 134/171] net: check all name nodes in __dev_alloc_name __dev_alloc_name(), when supplied with a name containing '%d', will search for the first available device number to generate a unique device name. Since commit ff92741270bf8b6e78aa885f166b68c7a67ab13a ("net: introduce name_node struct to be used in hashlist") network devices may have alternate names. __dev_alloc_name() does take these alternate names into account, possibly generating a name that is already taken and failing with -ENFILE as a result. This demonstrates the bug: # rmmod dummy 2>/dev/null # ip link property add dev lo altname dummy0 # modprobe dummy numdummies=1 modprobe: ERROR: could not insert 'dummy': Too many open files in system Instead of creating a device named dummy1, modprobe fails. Fix this by checking all the names in the d->name_node list, not just d->name. Signed-off-by: Jiri Bohac Fixes: ff92741270bf ("net: introduce name_node struct to be used in hashlist") Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index bb568f7cb81b..0f72ff5d34ba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1184,6 +1184,18 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) return -ENOMEM; for_each_netdev(net, d) { + struct netdev_name_node *name_node; + list_for_each_entry(name_node, &d->name_node->list, list) { + if (!sscanf(name_node->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, name_node->name, IFNAMSIZ)) + set_bit(i, inuse); + } if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) From c79a707072fe3fea0e3c92edee6ca85c1e53c29f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 18 Mar 2021 16:57:49 +0100 Subject: [PATCH 135/171] net: cdc-phonet: fix data-interface release on probe failure Set the disconnected flag before releasing the data interface in case netdev registration fails to avoid having the disconnect callback try to deregister the never registered netdev (and trigger a WARN_ON()). Fixes: 87cf65601e17 ("USB host CDC Phonet network interface driver") Signed-off-by: Johan Hovold Signed-off-by: David S. Miller --- drivers/net/usb/cdc-phonet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/cdc-phonet.c b/drivers/net/usb/cdc-phonet.c index 02e6bbb17b15..8d1f69dad603 100644 --- a/drivers/net/usb/cdc-phonet.c +++ b/drivers/net/usb/cdc-phonet.c @@ -387,6 +387,8 @@ static int usbpn_probe(struct usb_interface *intf, const struct usb_device_id *i err = register_netdev(dev); if (err) { + /* Set disconnected flag so that disconnect() returns early. */ + pnd->disconnected = 1; usb_driver_release_interface(&usbpn_driver, data_intf); goto out; } From 896ea5dab25ef12f7d0988b8f0b053a287faf889 Mon Sep 17 00:00:00 2001 From: Tom Seewald Date: Sun, 21 Feb 2021 22:00:04 -0600 Subject: [PATCH 136/171] e1000e: Fix duplicate include guard The include guard "_E1000_HW_H_" is used by header files in three different drivers (e1000/e1000_hw.h, e1000e/hw.h, and igb/e1000_hw.h). Using the same include guard macro in more than one header file may cause unexpected behavior from the compiler. Fix the duplicate include guard in the e1000e driver by renaming it. Fixes: bc7f75fa9788 ("[E1000E]: New pci-express e1000 driver (currently for ICH9 devices only)") Signed-off-by: Tom Seewald Tested-by: Dvora Fuxbrumer Acked-by: Sasha Neftin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/hw.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index 69a2329ea463..db79c4e6413e 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 1999 - 2018 Intel Corporation. */ -#ifndef _E1000_HW_H_ -#define _E1000_HW_H_ +#ifndef _E1000E_HW_H_ +#define _E1000E_HW_H_ #include "regs.h" #include "defines.h" @@ -714,4 +714,4 @@ struct e1000_hw { #include "80003es2lan.h" #include "ich8lan.h" -#endif +#endif /* _E1000E_HW_H_ */ From a75519a84855bca029ce7d8a27de9409d9b84956 Mon Sep 17 00:00:00 2001 From: Tom Seewald Date: Sun, 21 Feb 2021 22:00:05 -0600 Subject: [PATCH 137/171] igb: Fix duplicate include guard The include guard "_E1000_HW_H_" is used by two separate header files in two different drivers (e1000/e1000_hw.h and igb/e1000_hw.h). Using the same include guard macro in more than one header file may cause unexpected behavior from the compiler. Fix this by renaming the duplicate guard in the igb driver. Fixes: 9d5c824399de ("igb: PCI-Express 82575 Gigabit Ethernet driver") Signed-off-by: Tom Seewald Reviewed-by: Jesse Brandeburg Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/e1000_hw.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/e1000_hw.h b/drivers/net/ethernet/intel/igb/e1000_hw.h index 5d87957b2627..44111f65afc7 100644 --- a/drivers/net/ethernet/intel/igb/e1000_hw.h +++ b/drivers/net/ethernet/intel/igb/e1000_hw.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 2007 - 2018 Intel Corporation. */ -#ifndef _E1000_HW_H_ -#define _E1000_HW_H_ +#ifndef _E1000_IGB_HW_H_ +#define _E1000_IGB_HW_H_ #include #include @@ -551,4 +551,4 @@ s32 igb_write_pcie_cap_reg(struct e1000_hw *hw, u32 reg, u16 *value); void igb_read_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value); void igb_write_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value); -#endif /* _E1000_HW_H_ */ +#endif /* _E1000_IGB_HW_H_ */ From f0a03a026857d6c7766eb7d5835edbf5523ca15c Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Wed, 3 Mar 2021 12:51:03 -0800 Subject: [PATCH 138/171] igb: check timestamp validity Add a couple of checks to make sure timestamping is on and that the timestamp value from DMA is valid. This avoids any functional issues that could come from a misinterpreted time stamp. One of the functions changed doesn't need a return value added because there was no value in checking from the calling locations. While here, fix a couple of reverse christmas tree issues next to the code being changed. Fixes: f56e7bba22fa ("igb: Pull timestamp from fragment before adding it to skb") Fixes: 9cbc948b5a20 ("igb: add XDP support") Signed-off-by: Jesse Brandeburg Tested-by: Dave Switzer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb.h | 4 +-- drivers/net/ethernet/intel/igb/igb_main.c | 11 ++++---- drivers/net/ethernet/intel/igb/igb_ptp.c | 31 ++++++++++++++++++----- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index aaa954aae574..7bda8c5edea5 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -748,8 +748,8 @@ void igb_ptp_suspend(struct igb_adapter *adapter); void igb_ptp_rx_hang(struct igb_adapter *adapter); void igb_ptp_tx_hang(struct igb_adapter *adapter); void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb); -void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, - struct sk_buff *skb); +int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, + struct sk_buff *skb); int igb_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr); int igb_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr); void igb_set_flag_queue_pairs(struct igb_adapter *, const u32); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index d3b37761f637..a45cd2b416c8 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8302,9 +8302,10 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring, return NULL; if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) { - igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb); - xdp->data += IGB_TS_HDR_LEN; - size -= IGB_TS_HDR_LEN; + if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb)) { + xdp->data += IGB_TS_HDR_LEN; + size -= IGB_TS_HDR_LEN; + } } /* Determine available headroom for copy */ @@ -8365,8 +8366,8 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring, /* pull timestamp out of packet data */ if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { - igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb); - __skb_pull(skb, IGB_TS_HDR_LEN); + if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb)) + __skb_pull(skb, IGB_TS_HDR_LEN); } /* update buffer offset */ diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 7cc5428c3b3d..86a576201f5f 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -856,6 +856,9 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter) dev_kfree_skb_any(skb); } +#define IGB_RET_PTP_DISABLED 1 +#define IGB_RET_PTP_INVALID 2 + /** * igb_ptp_rx_pktstamp - retrieve Rx per packet timestamp * @q_vector: Pointer to interrupt specific structure @@ -864,19 +867,29 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter) * * This function is meant to retrieve a timestamp from the first buffer of an * incoming frame. The value is stored in little endian format starting on - * byte 8. + * byte 8 + * + * Returns: 0 if success, nonzero if failure **/ -void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, - struct sk_buff *skb) +int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, + struct sk_buff *skb) { - __le64 *regval = (__le64 *)va; struct igb_adapter *adapter = q_vector->adapter; + __le64 *regval = (__le64 *)va; int adjust = 0; + if (!(adapter->ptp_flags & IGB_PTP_ENABLED)) + return IGB_RET_PTP_DISABLED; + /* The timestamp is recorded in little endian format. * DWORD: 0 1 2 3 * Field: Reserved Reserved SYSTIML SYSTIMH */ + + /* check reserved dwords are zero, be/le doesn't matter for zero */ + if (regval[0]) + return IGB_RET_PTP_INVALID; + igb_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb), le64_to_cpu(regval[1])); @@ -896,6 +909,8 @@ void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, } skb_hwtstamps(skb)->hwtstamp = ktime_sub_ns(skb_hwtstamps(skb)->hwtstamp, adjust); + + return 0; } /** @@ -906,13 +921,15 @@ void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, * This function is meant to retrieve a timestamp from the internal registers * of the adapter and store it in the skb. **/ -void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, - struct sk_buff *skb) +void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb) { struct igb_adapter *adapter = q_vector->adapter; struct e1000_hw *hw = &adapter->hw; - u64 regval; int adjust = 0; + u64 regval; + + if (!(adapter->ptp_flags & IGB_PTP_ENABLED)) + return; /* If this bit is set, then the RX registers contain the time stamp. No * other packet will be time stamped until we read these registers, so From 8ff0b1f08ea73e5c08f5addd23481e76a60e741c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 19 Mar 2021 11:52:41 +0800 Subject: [PATCH 139/171] sctp: move sk_route_caps check and set into sctp_outq_flush_transports The sk's sk_route_caps is set in sctp_packet_config, and later it only needs to change when traversing the transport_list in a loop, as the dst might be changed in the tx path. So move sk_route_caps check and set into sctp_outq_flush_transports from sctp_packet_transmit. This also fixes a dst leak reported by Chen Yi: https://bugzilla.kernel.org/show_bug.cgi?id=212227 As calling sk_setup_caps() in sctp_packet_transmit may also set the sk_route_caps for the ctrl sock in a netns. When the netns is being deleted, the ctrl sock's releasing is later than dst dev's deleting, which will cause this dev's deleting to hang and dmesg error occurs: unregister_netdevice: waiting for xxx to become free. Usage count = 1 Reported-by: Chen Yi Fixes: bcd623d8e9fa ("sctp: call sk_setup_caps in sctp_packet_transmit instead") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/output.c | 7 ------- net/sctp/outqueue.c | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/sctp/output.c b/net/sctp/output.c index 6614c9fdc51e..a6aa17df09ef 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -584,13 +584,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) goto out; } - rcu_read_lock(); - if (__sk_dst_get(sk) != tp->dst) { - dst_hold(tp->dst); - sk_setup_caps(sk, tp->dst); - } - rcu_read_unlock(); - /* pack up chunks */ pkt_count = sctp_packet_pack(packet, head, gso, gfp); if (!pkt_count) { diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 3fd06a27105d..5cb1aa5f067b 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1135,6 +1135,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx) { + struct sock *sk = ctx->asoc->base.sk; struct list_head *ltransport; struct sctp_packet *packet; struct sctp_transport *t; @@ -1144,6 +1145,12 @@ static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx) t = list_entry(ltransport, struct sctp_transport, send_ready); packet = &t->packet; if (!sctp_packet_empty(packet)) { + rcu_read_lock(); + if (t->dst && __sk_dst_get(sk) != t->dst) { + dst_hold(t->dst); + sk_setup_caps(sk, t->dst); + } + rcu_read_unlock(); error = sctp_packet_transmit(packet, ctx->gfp); if (error < 0) ctx->q->asoc->base.sk->sk_err = -error; From 8a2dc6af67a0c9f65a22ea40fc79974ee8f368c7 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 19 Mar 2021 10:16:23 +0530 Subject: [PATCH 140/171] sch_red: Fix a typo s/recalcultion/recalculation/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: David S. Miller --- include/net/red.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/red.h b/include/net/red.h index 9e6647c4ccd1..0b39eff1d50a 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -295,7 +295,7 @@ static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms int shift; /* - * The problem: ideally, average length queue recalcultion should + * The problem: ideally, average length queue recalculation should * be done over constant clock intervals. This is too expensive, so * that the calculation is driven by outgoing packets. * When the queue is idle we have to model this clock by hand. From f91a50d8b51b5c8ef1cfb08115a005bba4250507 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 19 Mar 2021 15:37:21 +0800 Subject: [PATCH 141/171] r8152: limit the RX buffer size of RTL8153A for USB 2.0 If the USB host controller is EHCI, the throughput is reduced from 300Mb/s to 60Mb/s, when the rx buffer size is modified from 16K to 32K. According to the EHCI spec, the maximum size of the qTD is 20K. Therefore, when the driver uses more than 20K buffer, the latency time of EHCI would be increased. And, it let the RTL8153A get worse throughput. However, the driver uses alloc_pages() for rx buffer, so I limit the rx buffer to 16K rather than 20K. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=205923 Fixes: ec5791c202ac ("r8152: separate the rx buffer size") Reported-by: Robert Davies Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 90f1c0200042..20fb5638ac65 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -6553,7 +6553,10 @@ static int rtl_ops_init(struct r8152 *tp) ops->in_nway = rtl8153_in_nway; ops->hw_phy_cfg = r8153_hw_phy_cfg; ops->autosuspend_en = rtl8153_runtime_enable; - tp->rx_buf_sz = 32 * 1024; + if (tp->udev->speed < USB_SPEED_SUPER) + tp->rx_buf_sz = 16 * 1024; + else + tp->rx_buf_sz = 32 * 1024; tp->eee_en = true; tp->eee_adv = MDIO_EEE_1000T | MDIO_EEE_100TX; break; From 014dfa26ce1c647af09bf506285ef67e0e3f0a6b Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Fri, 19 Mar 2021 13:44:22 +0000 Subject: [PATCH 142/171] net: stmmac: dwmac-sun8i: Provide TX and RX fifo sizes MTU cannot be changed on dwmac-sun8i. (ip link set eth0 mtu xxx returning EINVAL) This is due to tx_fifo_size being 0, since this value is used to compute valid MTU range. Like dwmac-sunxi (with commit 806fd188ce2a ("net: stmmac: dwmac-sunxi: Provide TX and RX fifo sizes")) dwmac-sun8i need to have tx and rx fifo sizes set. I have used values from datasheets. After this patch, setting a non-default MTU (like 1000) value works and network is still useable. Tested-on: sun8i-h3-orangepi-pc Tested-on: sun8i-r40-bananapi-m2-ultra Tested-on: sun50i-a64-bananapi-m64 Tested-on: sun50i-h5-nanopi-neo-plus2 Tested-on: sun50i-h6-pine-h64 Fixes: 9f93ac8d408 ("net-next: stmmac: Add dwmac-sun8i") Reported-by: Belisko Marek Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 6b75cf2603ff..e62efd166ec8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1214,6 +1214,8 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) plat_dat->init = sun8i_dwmac_init; plat_dat->exit = sun8i_dwmac_exit; plat_dat->setup = sun8i_dwmac_setup; + plat_dat->tx_fifo_size = 4096; + plat_dat->rx_fifo_size = 16384; ret = sun8i_dwmac_set_syscon(&pdev->dev, plat_dat); if (ret) From 1f935e8e72ec28dddb2dc0650b3b6626a293d94b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 19 Mar 2021 13:05:41 +0000 Subject: [PATCH 143/171] selinux: vsock: Set SID for socket returned by accept() For AF_VSOCK, accept() currently returns sockets that are unlabelled. Other socket families derive the child's SID from the SID of the parent and the SID of the incoming packet. This is typically done as the connected socket is placed in the queue that accept() removes from. Reuse the existing 'security_sk_clone' hook to copy the SID from the parent (server) socket to the child. There is no packet SID in this case. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: David Brazdil Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5546710d8ac1..bc7fb9bf3351 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -755,6 +755,7 @@ static struct sock *__vsock_create(struct net *net, vsk->buffer_size = psk->buffer_size; vsk->buffer_min_size = psk->buffer_min_size; vsk->buffer_max_size = psk->buffer_max_size; + security_sk_clone(parent, sk); } else { vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); From ef2ef02cd9c2484f7ba29227d5fd5c78d7ea0393 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 19 Mar 2021 11:33:02 -0700 Subject: [PATCH 144/171] mptcp: Change mailing list address The mailing list for MPTCP maintenance has moved to the kernel.org-supported mptcp@lists.linux.dev address. Complete, combined archives for both lists are now hosted at https://lore.kernel.org/mptcp Cc: Matthieu Baerts Signed-off-by: Mat Martineau Acked-by: Matthieu Baerts Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0d80d3dda3d0..8e43e0417335 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12539,7 +12539,7 @@ NETWORKING [MPTCP] M: Mat Martineau M: Matthieu Baerts L: netdev@vger.kernel.org -L: mptcp@lists.01.org +L: mptcp@lists.linux.dev S: Maintained W: https://github.com/multipath-tcp/mptcp_net-next/wiki B: https://github.com/multipath-tcp/mptcp_net-next/issues From 5aa3c334a449bab24519c4967f5ac2b3304c8dcf Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 19 Mar 2021 22:33:14 +0800 Subject: [PATCH 145/171] selftests: forwarding: vxlan_bridge_1d: Fix vxlan ecn decapsulate value The ECN bit defines ECT(1) = 1, ECT(0) = 2. So inner 0x02 + outer 0x01 should be inner ECT(0) + outer ECT(1). Based on the description of __INET_ECN_decapsulate, the final decapsulate value should be ECT(1). So fix the test expect value to 0x01. Before the fix: TEST: VXLAN: ECN decap: 01/02->0x02 [FAIL] Expected to capture 10 packets, got 0. After the fix: TEST: VXLAN: ECN decap: 01/02->0x01 [ OK ] Fixes: a0b61f3d8ebf ("selftests: forwarding: vxlan_bridge_1d: Add an ECN decap test") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index ce6bea9675c0..0ccb1dda099a 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -658,7 +658,7 @@ test_ecn_decap() # In accordance with INET_ECN_decapsulate() __test_ecn_decap 00 00 0x00 __test_ecn_decap 01 01 0x01 - __test_ecn_decap 02 01 0x02 + __test_ecn_decap 02 01 0x01 __test_ecn_decap 01 03 0x03 __test_ecn_decap 02 03 0x03 test_ecn_decap_error From 901ee1d750f29a335423eeb9463c3ca461ca18c2 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 19 Mar 2021 12:25:54 +0100 Subject: [PATCH 146/171] libbpf: Fix BTF dump of pointer-to-array-of-struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vmlinux.h generated from BTF is invalid when building drivers/phy/ti/phy-gmii-sel.c with clang: vmlinux.h:61702:27: error: array type has incomplete element type ‘struct reg_field’ 61702 | const struct reg_field (*regfields)[3]; | ^~~~~~~~~ bpftool generates a forward declaration for this struct regfield, which compilers aren't happy about. Here's a simplified reproducer: struct inner { int val; }; struct outer { struct inner (*ptr_to_array)[2]; } A; After build with clang -> bpftool btf dump c -> clang/gcc: ./def-clang.h:11:23: error: array has incomplete element type 'struct inner' struct inner (*ptr_to_array)[2]; Member ptr_to_array of struct outer is a pointer to an array of struct inner. In the DWARF generated by clang, struct outer appears before struct inner, so when converting BTF of struct outer into C, bpftool issues a forward declaration to struct inner. With GCC the DWARF info is reversed so struct inner gets fully defined. That forward declaration is not sufficient when compilers handle an array of the struct, even when it's only used through a pointer. Note that we can trigger the same issue with an intermediate typedef: struct inner { int val; }; typedef struct inner inner2_t[2]; struct outer { inner2_t *ptr_to_array; } A; Becomes: struct inner; typedef struct inner inner2_t[2]; And causes: ./def-clang.h:10:30: error: array has incomplete element type 'struct inner' typedef struct inner inner2_t[2]; To fix this, clear through_ptr whenever we encounter an intermediate array, to make the inner struct part of a strong link and force full declaration. Fixes: 351131b51c7a ("libbpf: add btf_dump API for BTF-to-C conversion") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210319112554.794552-2-jean-philippe@linaro.org --- tools/lib/bpf/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 2f9d685bd522..0911aea4cdbe 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -462,7 +462,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) return err; case BTF_KIND_ARRAY: - return btf_dump_order_type(d, btf_array(t)->type, through_ptr); + return btf_dump_order_type(d, btf_array(t)->type, false); case BTF_KIND_STRUCT: case BTF_KIND_UNION: { From f118aac651d87c1811d2abd940f73c45c16b29d7 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 19 Mar 2021 12:25:55 +0100 Subject: [PATCH 147/171] selftests/bpf: Add selftest for pointer-to-array-of-struct BTF dump Bpftool used to issue forward declarations for a struct used as part of a pointer to array, which is invalid. Add a test to check that the struct is fully defined in this case: @@ -134,9 +134,9 @@ }; }; -struct struct_in_array {}; +struct struct_in_array; -struct struct_in_array_typed {}; +struct struct_in_array_typed; typedef struct struct_in_array_typed struct_in_array_t[2]; @@ -189,3 +189,7 @@ struct struct_with_embedded_stuff _14; }; +struct struct_in_array {}; + +struct struct_in_array_typed {}; + ... #13/1 btf_dump: syntax:FAIL Suggested-by: Andrii Nakryiko Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210319112554.794552-3-jean-philippe@linaro.org --- .../selftests/bpf/progs/btf_dump_test_case_syntax.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index 31975c96e2c9..3ac0c9afc35a 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -174,6 +174,12 @@ struct struct_in_struct { }; }; +struct struct_in_array {}; + +struct struct_in_array_typed {}; + +typedef struct struct_in_array_typed struct_in_array_t[2]; + struct struct_with_embedded_stuff { int a; struct { @@ -203,6 +209,8 @@ struct struct_with_embedded_stuff { } r[5]; struct struct_in_struct s[10]; int t[11]; + struct struct_in_array (*u)[2]; + struct_in_array_t *v; }; struct root_struct { From f60a85cad677c4f9bb4cadd764f1d106c38c7cf8 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 17 Mar 2021 11:09:15 +0800 Subject: [PATCH 148/171] bpf: Fix umd memory leak in copy_process() The syzbot reported a memleak as follows: BUG: memory leak unreferenced object 0xffff888101b41d00 (size 120): comm "kworker/u4:0", pid 8, jiffies 4294944270 (age 12.780s) backtrace: [] alloc_pid+0x66/0x560 [] copy_process+0x1465/0x25e0 [] kernel_clone+0xf3/0x670 [] kernel_thread+0x61/0x80 [] call_usermodehelper_exec_work [] call_usermodehelper_exec_work+0xc4/0x120 [] process_one_work+0x2c9/0x600 [] worker_thread+0x59/0x5d0 [] kthread+0x178/0x1b0 [] ret_from_fork+0x1f/0x30 unreferenced object 0xffff888110ef5c00 (size 232): comm "kworker/u4:0", pid 8414, jiffies 4294944270 (age 12.780s) backtrace: [] kmem_cache_zalloc [] __alloc_file+0x1f/0xf0 [] alloc_empty_file+0x69/0x120 [] alloc_file+0x33/0x1b0 [] alloc_file_pseudo+0xb2/0x140 [] create_pipe_files+0x138/0x2e0 [] umd_setup+0x33/0x220 [] call_usermodehelper_exec_async+0xb4/0x1b0 [] ret_from_fork+0x1f/0x30 After the UMD process exits, the pipe_to_umh/pipe_from_umh and tgid need to be released. Fixes: d71fa5c9763c ("bpf: Add kernel module with user mode driver that populates bpffs.") Reported-by: syzbot+44908bb56d2bfe56b28e@syzkaller.appspotmail.com Signed-off-by: Zqiang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210317030915.2865-1-qiang.zhang@windriver.com --- include/linux/usermode_driver.h | 1 + kernel/bpf/preload/bpf_preload_kern.c | 19 +++++++++++++++---- kernel/usermode_driver.c | 21 +++++++++++++++------ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h index 073a9e0ec07d..ad970416260d 100644 --- a/include/linux/usermode_driver.h +++ b/include/linux/usermode_driver.h @@ -14,5 +14,6 @@ struct umd_info { int umd_load_blob(struct umd_info *info, const void *data, size_t len); int umd_unload_blob(struct umd_info *info); int fork_usermode_driver(struct umd_info *info); +void umd_cleanup_helper(struct umd_info *info); #endif /* __LINUX_USERMODE_DRIVER_H__ */ diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 79c5772465f1..53736e52c1df 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -60,9 +60,12 @@ static int finish(void) &magic, sizeof(magic), &pos); if (n != sizeof(magic)) return -EPIPE; + tgid = umd_ops.info.tgid; - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_ops.info.tgid = NULL; + if (tgid) { + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); + umd_cleanup_helper(&umd_ops.info); + } return 0; } @@ -80,10 +83,18 @@ static int __init load_umd(void) static void __exit fini_umd(void) { + struct pid *tgid; + bpf_preload_ops = NULL; + /* kill UMD in case it's still there due to earlier error */ - kill_pid(umd_ops.info.tgid, SIGKILL, 1); - umd_ops.info.tgid = NULL; + tgid = umd_ops.info.tgid; + if (tgid) { + kill_pid(tgid, SIGKILL, 1); + + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); + umd_cleanup_helper(&umd_ops.info); + } umd_unload_blob(&umd_ops.info); } late_initcall(load_umd); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 0b35212ffc3d..bb7bb3b478ab 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -139,14 +139,23 @@ static void umd_cleanup(struct subprocess_info *info) struct umd_info *umd_info = info->data; /* cleanup if umh_setup() was successful but exec failed */ - if (info->retval) { - fput(umd_info->pipe_to_umh); - fput(umd_info->pipe_from_umh); - put_pid(umd_info->tgid); - umd_info->tgid = NULL; - } + if (info->retval) + umd_cleanup_helper(umd_info); } +/** + * umd_cleanup_helper - release the resources which were allocated in umd_setup + * @info: information about usermode driver + */ +void umd_cleanup_helper(struct umd_info *info) +{ + fput(info->pipe_to_umh); + fput(info->pipe_from_umh); + put_pid(info->tgid); + info->tgid = NULL; +} +EXPORT_SYMBOL_GPL(umd_cleanup_helper); + /** * fork_usermode_driver - fork a usermode driver * @info: information about usermode driver (shouldn't be NULL) From b9082970478009b778aa9b22d5561eef35b53b63 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 19 Mar 2021 17:00:01 -0700 Subject: [PATCH 149/171] bpf: Use NOP_ATOMIC5 instead of emit_nops(&prog, 5) for BPF_TRAMP_F_CALL_ORIG __bpf_arch_text_poke does rewrite only for atomic nop5, emit_nops(xxx, 5) emits non-atomic one which breaks fentry/fexit with k8 atomics: P6_NOP5 == P6_NOP5_ATOMIC (0f1f440000 == 0f1f440000) K8_NOP5 != K8_NOP5_ATOMIC (6666906690 != 6666666690) Can be reproduced by doing "ideal_nops = k8_nops" in "arch_init_ideal_nops() and running fexit_bpf2bpf selftest. Fixes: e21aa341785c ("bpf: Fix fexit trampoline.") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210320000001.915366-1-sdf@google.com --- arch/x86/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 72b5a57e9e31..b35fc8023884 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2012,7 +2012,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); im->ip_after_call = prog; - emit_nops(&prog, 5); + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { From b5f020f82a8e41201c6ede20fa00389d6980b223 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 19 Mar 2021 11:06:19 +0100 Subject: [PATCH 150/171] can: isotp: tx-path: zero initialize outgoing CAN frames Commit d4eb538e1f48 ("can: isotp: TX-path: ensure that CAN frame flags are initialized") ensured the TX flags to be properly set for outgoing CAN frames. In fact the root cause of the issue results from a missing initialization of outgoing CAN frames created by isotp. This is no problem on the CAN bus as the CAN driver only picks the correctly defined content from the struct can(fd)_frame. But when the outgoing frames are monitored (e.g. with candump) we potentially leak some bytes in the unused content of struct can(fd)_frame. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Cc: Marc Kleine-Budde Link: https://lore.kernel.org/r/20210319100619.10858-1-socketcan@hartkopp.net Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index 430976485d95..15ea1234d457 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -196,7 +196,7 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; - skb_put(nskb, so->ll.mtu); + skb_put_zero(nskb, so->ll.mtu); /* create & send flow control reply */ ncf->can_id = so->txid; @@ -779,7 +779,7 @@ isotp_tx_burst: can_skb_prv(skb)->skbcnt = 0; cf = (struct canfd_frame *)skb->data; - skb_put(skb, so->ll.mtu); + skb_put_zero(skb, so->ll.mtu); /* create consecutive frame */ isotp_fill_dataframe(cf, so, ae, 0); @@ -895,7 +895,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) so->tx.idx = 0; cf = (struct canfd_frame *)skb->data; - skb_put(skb, so->ll.mtu); + skb_put_zero(skb, so->ll.mtu); /* check for single frame transmission depending on TX_DL */ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { From 5d7047ed6b7214fbabc16d8712a822e256b1aa44 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Sat, 20 Mar 2021 20:21:54 +0100 Subject: [PATCH 151/171] can: peak_usb: Revert "can: peak_usb: add forgotten supported devices" In commit 6417f03132a6 ("module: remove never implemented MODULE_SUPPORTED_DEVICE") the MODULE_SUPPORTED_DEVICE macro was removed from the kerne entirely. Shortly before this patch was applied mainline the commit 59ec7b89ed3e ("can: peak_usb: add forgotten supported devices") was added to net/master. As this would result in a merge conflict, let's revert this patch. Fixes: 59ec7b89ed3e ("can: peak_usb: add forgotten supported devices") Link: https://lore.kernel.org/r/20210320192649.341832-1-mkl@pengutronix.de Suggested-by: Leon Romanovsky Cc: Stephane Grosjean Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index f1d018218c93..f347ecc79aef 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -18,8 +18,6 @@ MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB FD adapter"); MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB Pro FD adapter"); -MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-Chip USB"); -MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB X6 adapter"); #define PCAN_USBPROFD_CHANNEL_COUNT 2 #define PCAN_USBFD_CHANNEL_COUNT 1 From b4afd4b90a7cfe54c7cd9db49e3c36d552325eac Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 20 Mar 2021 09:17:28 -0500 Subject: [PATCH 152/171] net: ipa: fix init header command validation We use ipa_cmd_header_valid() to ensure certain values we will program into hardware are within range, well in advance of when we actually program them. This way we avoid having to check for errors when we actually program the hardware. Unfortunately the dev_err() call for a bad offset value does not supply the arguments to match the format specifiers properly. Fix this. There was also supposed to be a check to ensure the size to be programmed fits in the field that holds it. Add this missing check. Rearrange the way we ensure the header table fits in overall IPA memory range. Finally, update ipa_cmd_table_valid() so the format of messages printed for errors matches what's done in ipa_cmd_header_valid(). Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_cmd.c | 50 ++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/drivers/net/ipa/ipa_cmd.c b/drivers/net/ipa/ipa_cmd.c index 35e35852c25c..d73b03a80ef8 100644 --- a/drivers/net/ipa/ipa_cmd.c +++ b/drivers/net/ipa/ipa_cmd.c @@ -175,21 +175,23 @@ bool ipa_cmd_table_valid(struct ipa *ipa, const struct ipa_mem *mem, : field_max(IP_FLTRT_FLAGS_NHASH_ADDR_FMASK); if (mem->offset > offset_max || ipa->mem_offset > offset_max - mem->offset) { - dev_err(dev, "IPv%c %s%s table region offset too large " - "(0x%04x + 0x%04x > 0x%04x)\n", - ipv6 ? '6' : '4', hashed ? "hashed " : "", - route ? "route" : "filter", - ipa->mem_offset, mem->offset, offset_max); + dev_err(dev, "IPv%c %s%s table region offset too large\n", + ipv6 ? '6' : '4', hashed ? "hashed " : "", + route ? "route" : "filter"); + dev_err(dev, " (0x%04x + 0x%04x > 0x%04x)\n", + ipa->mem_offset, mem->offset, offset_max); + return false; } if (mem->offset > ipa->mem_size || mem->size > ipa->mem_size - mem->offset) { - dev_err(dev, "IPv%c %s%s table region out of range " - "(0x%04x + 0x%04x > 0x%04x)\n", - ipv6 ? '6' : '4', hashed ? "hashed " : "", - route ? "route" : "filter", - mem->offset, mem->size, ipa->mem_size); + dev_err(dev, "IPv%c %s%s table region out of range\n", + ipv6 ? '6' : '4', hashed ? "hashed " : "", + route ? "route" : "filter"); + dev_err(dev, " (0x%04x + 0x%04x > 0x%04x)\n", + mem->offset, mem->size, ipa->mem_size); + return false; } @@ -205,22 +207,36 @@ static bool ipa_cmd_header_valid(struct ipa *ipa) u32 size_max; u32 size; + /* In ipa_cmd_hdr_init_local_add() we record the offset and size + * of the header table memory area. Make sure the offset and size + * fit in the fields that need to hold them, and that the entire + * range is within the overall IPA memory range. + */ offset_max = field_max(HDR_INIT_LOCAL_FLAGS_HDR_ADDR_FMASK); if (mem->offset > offset_max || ipa->mem_offset > offset_max - mem->offset) { - dev_err(dev, "header table region offset too large " - "(0x%04x + 0x%04x > 0x%04x)\n", - ipa->mem_offset + mem->offset, offset_max); + dev_err(dev, "header table region offset too large\n"); + dev_err(dev, " (0x%04x + 0x%04x > 0x%04x)\n", + ipa->mem_offset, mem->offset, offset_max); + return false; } size_max = field_max(HDR_INIT_LOCAL_FLAGS_TABLE_SIZE_FMASK); size = ipa->mem[IPA_MEM_MODEM_HEADER].size; size += ipa->mem[IPA_MEM_AP_HEADER].size; - if (mem->offset > ipa->mem_size || size > ipa->mem_size - mem->offset) { - dev_err(dev, "header table region out of range " - "(0x%04x + 0x%04x > 0x%04x)\n", - mem->offset, size, ipa->mem_size); + + if (size > size_max) { + dev_err(dev, "header table region size too large\n"); + dev_err(dev, " (0x%04x > 0x%08x)\n", size, size_max); + + return false; + } + if (size > ipa->mem_size || mem->offset > ipa->mem_size - size) { + dev_err(dev, "header table region out of range\n"); + dev_err(dev, " (0x%04x + 0x%04x > 0x%04x)\n", + mem->offset, size, ipa->mem_size); + return false; } From f658b90977d2e79822a558e48116e059a7e75dec Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 20 Mar 2021 21:40:08 +0100 Subject: [PATCH 153/171] r8169: fix DMA being used after buffer free if WoL is enabled IOMMU errors have been reported if WoL is enabled and interface is brought down. It turned out that the network chip triggers DMA transfers after the DMA buffers have been freed. For WoL to work we need to leave rx enabled, therefore simply stop the chip from being a DMA busmaster. Fixes: 567ca57faa62 ("r8169: add rtl8169_up") Tested-by: Paul Blazejowski Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 7aad0ba53372..581a92fc3292 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4646,6 +4646,9 @@ static void rtl8169_down(struct rtl8169_private *tp) rtl8169_update_counters(tp); + pci_clear_master(tp->pci_dev); + rtl_pci_commit(tp); + rtl8169_cleanup(tp, true); rtl_prepare_power_down(tp); @@ -4653,6 +4656,7 @@ static void rtl8169_down(struct rtl8169_private *tp) static void rtl8169_up(struct rtl8169_private *tp) { + pci_set_master(tp->pci_dev); phy_resume(tp->phydev); rtl8169_init_phy(tp); napi_enable(&tp->napi); @@ -5307,8 +5311,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rtl_hw_reset(tp); - pci_set_master(pdev); - rc = rtl_alloc_irq(tp); if (rc < 0) { dev_err(&pdev->dev, "Can't allocate interrupt\n"); From 87d77e59d1ebc31850697341ab15ca013004b81b Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sun, 21 Mar 2021 02:37:03 +0530 Subject: [PATCH 154/171] docs: networking: Fix a typo s/subsytem/subsystem/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: David S. Miller --- Documentation/networking/xfrm_device.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm_device.rst index da1073acda96..01391dfd37d9 100644 --- a/Documentation/networking/xfrm_device.rst +++ b/Documentation/networking/xfrm_device.rst @@ -50,7 +50,7 @@ Callbacks to implement The NIC driver offering ipsec offload will need to implement these callbacks to make the offload available to the network stack's -XFRM subsytem. Additionally, the feature bits NETIF_F_HW_ESP and +XFRM subsystem. Additionally, the feature bits NETIF_F_HW_ESP and NETIF_F_HW_ESP_TX_CSUM will signal the availability of the offload. From 6debc0fd71b947b03c1a39cc100f52b8238259d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sun, 21 Mar 2021 14:00:01 +0100 Subject: [PATCH 155/171] MAINTAINERS: Combine "QLOGIC QLGE 10Gb ETHERNET DRIVER" sections into one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There ended up being two sections with the same title. Combine the two into one section. Signed-off-by: Jonathan Neuschäfer Cc: Manish Chopra Cc: Coiby Xu Signed-off-by: David S. Miller --- MAINTAINERS | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8e43e0417335..897b7c249073 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14710,15 +14710,11 @@ F: drivers/net/ethernet/qlogic/qlcnic/ QLOGIC QLGE 10Gb ETHERNET DRIVER M: Manish Chopra M: GR-Linux-NIC-Dev@marvell.com -L: netdev@vger.kernel.org -S: Supported -F: drivers/staging/qlge/ - -QLOGIC QLGE 10Gb ETHERNET DRIVER M: Coiby Xu L: netdev@vger.kernel.org -S: Maintained +S: Supported F: Documentation/networking/device_drivers/qlogic/qlge.rst +F: drivers/staging/qlge/ QM1D1B0004 MEDIA DRIVER M: Akihiro Tsukada From a50a151e311bd3a793ebe4e5f233db8bfad0b78f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 22 Mar 2021 02:26:37 +0200 Subject: [PATCH 156/171] net: ipconfig: ic_dev can be NULL in ic_close_devs ic_close_dev contains a generalization of the logic to not close a network interface if it's the host port for a DSA switch. This logic is disguised behind an iteration through the lowers of ic_dev in ic_close_dev. When no interface for ipconfig can be found, ic_dev is NULL, and ic_close_dev: - dereferences a NULL pointer when assigning selected_dev - would attempt to search through the lower interfaces of a NULL net_device pointer So we should protect against that case. The "lower_dev" iterator variable was shortened to "lower" in order to keep the 80 character limit. Fixes: f68cbaed67cb ("net: ipconfig: avoid use-after-free in ic_close_devs") Fixes: 46acf7bdbc72 ("Revert "net: ipv4: handle DSA enabled master network devices"") Signed-off-by: Vladimir Oltean Tested-by: Heiko Thiery Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 47db1bfdaaa0..bc2f6ca97152 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -309,7 +309,7 @@ have_carrier: */ static void __init ic_close_devs(void) { - struct net_device *selected_dev = ic_dev->dev; + struct net_device *selected_dev = ic_dev ? ic_dev->dev : NULL; struct ic_device *d, *next; struct net_device *dev; @@ -317,16 +317,18 @@ static void __init ic_close_devs(void) next = ic_first_dev; while ((d = next)) { bool bring_down = (d != ic_dev); - struct net_device *lower_dev; + struct net_device *lower; struct list_head *iter; next = d->next; dev = d->dev; - netdev_for_each_lower_dev(selected_dev, lower_dev, iter) { - if (dev == lower_dev) { - bring_down = false; - break; + if (selected_dev) { + netdev_for_each_lower_dev(selected_dev, lower, iter) { + if (dev == lower) { + bring_down = false; + break; + } } } if (bring_down) { From a07231084da2207629b42244380ae2f1e10bd9b4 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Thu, 18 Mar 2021 20:33:19 -0500 Subject: [PATCH 157/171] net/mlx5: Add back multicast stats for uplink representor The multicast counter got removed from uplink representor due to the cited patch. Fixes: 47c97e6b10a1 ("net/mlx5e: Fix multicast counter not up-to-date in "ip -s"") Signed-off-by: Huy Nguyen Reviewed-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 33b418796e43..c8b8249846a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3846,10 +3846,17 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) } if (mlx5e_is_uplink_rep(priv)) { + struct mlx5e_vport_stats *vstats = &priv->stats.vport; + stats->rx_packets = PPORT_802_3_GET(pstats, a_frames_received_ok); stats->rx_bytes = PPORT_802_3_GET(pstats, a_octets_received_ok); stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok); stats->tx_bytes = PPORT_802_3_GET(pstats, a_octets_transmitted_ok); + + /* vport multicast also counts packets that are dropped due to steering + * or rx out of buffer + */ + stats->multicast = VPORT_COUNTER_GET(vstats, received_eth_multicast.packets); } else { mlx5e_fold_sw_stats64(priv, stats); } From 7d6c86e3ccb5ceea767df5c7a9a17cdfccd3df9a Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Wed, 10 Mar 2021 17:01:46 +0200 Subject: [PATCH 158/171] net/mlx5e: Allow to match on MPLS parameters only for MPLS over UDP Currently, we support hardware offload only for MPLS over UDP. However, rules matching on MPLS parameters are now wrongly offloaded for regular MPLS, without actually taking the parameters into consideration when doing the offload. Fix it by rejecting such unsupported rules. Fixes: 72046a91d134 ("net/mlx5e: Allow to match on mpls parameters") Signed-off-by: Alaa Hleihel Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 0cacf46dc950..3359098c51d4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2296,6 +2296,16 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, *match_level = MLX5_MATCH_L4; } + /* Currenlty supported only for MPLS over UDP */ + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) && + !netif_is_bareudp(filter_dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on MPLS is supported only for MPLS over UDP"); + netdev_err(priv->netdev, + "Matching on MPLS is supported only for MPLS over UDP\n"); + return -EOPNOTSUPP; + } + return 0; } From 96b5b4585843e3c83fb1930e5dfbefd0fb889c55 Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Thu, 4 Mar 2021 21:28:11 +0200 Subject: [PATCH 159/171] net/mlx5e: Offload tuple rewrite for non-CT flows Setting connection tracking OVS flows and then setting non-CT flows that use tuple rewrite action (e.g. mod_tp_dst), causes the latter flows not being offloaded. Fix by using a stricter condition in modify_header_match_supported() to check tuple rewrite support only for flows with CT action. The check is factored out into standalone modify_tuple_supported() function to aid readability. Fixes: 7e36feeb0467 ("net/mlx5e: CT: Don't offload tuple rewrites for established tuples") Signed-off-by: Dima Chumak Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/tc_ct.c | 3 +- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 44 ++++++++++++++----- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index f3f6eb081948..b2cd29847a37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -1181,7 +1181,8 @@ int mlx5_tc_ct_add_no_trk_match(struct mlx5_flow_spec *spec) mlx5e_tc_match_to_reg_get_match(spec, CTSTATE_TO_REG, &ctstate, &ctstate_mask); - if (ctstate_mask) + + if ((ctstate & ctstate_mask) == MLX5_CT_STATE_TRK_BIT) return -EOPNOTSUPP; ctstate_mask |= MLX5_CT_STATE_TRK_BIT; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3359098c51d4..df2a0af854bb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2909,6 +2909,37 @@ static int is_action_keys_supported(const struct flow_action_entry *act, return 0; } +static bool modify_tuple_supported(bool modify_tuple, bool ct_clear, + bool ct_flow, struct netlink_ext_ack *extack, + struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec) +{ + if (!modify_tuple || ct_clear) + return true; + + if (ct_flow) { + NL_SET_ERR_MSG_MOD(extack, + "can't offload tuple modification with non-clear ct()"); + netdev_info(priv->netdev, + "can't offload tuple modification with non-clear ct()"); + return false; + } + + /* Add ct_state=-trk match so it will be offloaded for non ct flows + * (or after clear action), as otherwise, since the tuple is changed, + * we can't restore ct state + */ + if (mlx5_tc_ct_add_no_trk_match(spec)) { + NL_SET_ERR_MSG_MOD(extack, + "can't offload tuple modification with ct matches and no ct(clear) action"); + netdev_info(priv->netdev, + "can't offload tuple modification with ct matches and no ct(clear) action"); + return false; + } + + return true; +} + static bool modify_header_match_supported(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec, struct flow_action *flow_action, @@ -2947,18 +2978,9 @@ static bool modify_header_match_supported(struct mlx5e_priv *priv, return err; } - /* Add ct_state=-trk match so it will be offloaded for non ct flows - * (or after clear action), as otherwise, since the tuple is changed, - * we can't restore ct state - */ - if (!ct_clear && modify_tuple && - mlx5_tc_ct_add_no_trk_match(spec)) { - NL_SET_ERR_MSG_MOD(extack, - "can't offload tuple modify header with ct matches"); - netdev_info(priv->netdev, - "can't offload tuple modify header with ct matches"); + if (!modify_tuple_supported(modify_tuple, ct_clear, ct_flow, extack, + priv, spec)) return false; - } ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol); if (modify_ip_header && ip_proto != IPPROTO_TCP && From 4eacfe72e3e037e3fc019113df32c39a705148c2 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 11 Mar 2021 17:46:35 +0200 Subject: [PATCH 160/171] net/mlx5e: Fix error path for ethtool set-priv-flag Expose error value when failing to comply to command: $ ethtool --set-priv-flags eth2 rx_cqe_compress [on/off] Fixes: be7e87f92b58 ("net/mlx5e: Fail safe cqe compressing/moderation mode setting") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 0e059d5c57ac..f5f2a8fd0046 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1887,6 +1887,7 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev, { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; + int err; if (!MLX5_CAP_GEN(mdev, cqe_compression)) return -EOPNOTSUPP; @@ -1896,7 +1897,10 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev, return -EINVAL; } - mlx5e_modify_rx_cqe_compression_locked(priv, enable); + err = mlx5e_modify_rx_cqe_compression_locked(priv, enable); + if (err) + return err; + priv->channels.params.rx_cqe_compress_def = enable; return 0; From 846d6da1fcdb14105f86b46b4345233550a79d55 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 1 Mar 2021 20:59:59 +0200 Subject: [PATCH 161/171] net/mlx5e: Fix division by 0 in mlx5e_select_queue mlx5e_select_queue compares num_tc_x_num_ch to real_num_tx_queues to determine if HTB and/or PTP offloads are active. If they are, it calculates netdev_pick_tx() % num_tc_x_num_ch to prevent it from selecting HTB and PTP queues for regular traffic. However, before the channels are first activated, num_tc_x_num_ch is zero. If ndo_select_queue gets called at this point, the HTB/PTP check will pass, and mlx5e_select_queue will attempt to take a modulo by num_tc_x_num_ch, which equals to zero. This commit fixes the bug by assigning num_tc_x_num_ch to a non-zero value before registering the netdev. Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Reported-by: Jesper Dangaard Brouer Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c8b8249846a9..158f947a8503 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4979,6 +4979,11 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 priv->max_nch); params->num_tc = 1; + /* Set an initial non-zero value, so that mlx5e_select_queue won't + * divide by zero if called before first activating channels. + */ + priv->num_tc_x_num_ch = params->num_channels * params->num_tc; + /* SQ */ params->log_sq_size = is_kdump_kernel() ? MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE : From 7c1ef1959b6fefe616ef3e7df832bf63dfbab9cf Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 11 Mar 2021 20:19:25 +0200 Subject: [PATCH 162/171] net/mlx5: SF, do not use ecpu bit for vhca state processing Device firmware doesn't handle ecpu bit for vhca state processing events and commands. Instead device firmware refers to the unique function id to distinguish SF of different PCI functions. When ecpu bit is used, firmware returns a syndrome. mlx5_cmd_check:780:(pid 872): MODIFY_VHCA_STATE(0xb0e) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x263211) mlx5_sf_dev_table_create:248:(pid 872): SF DEV table create err = -22 Hence, avoid using ecpu bit. Fixes: 8f0105418668 ("net/mlx5: SF, Add port add delete functionality") Fixes: 90d010b8634b ("net/mlx5: SF, Add auxiliary device support") Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/sf/dev/dev.c | 4 +--- .../ethernet/mellanox/mlx5/core/sf/hw_table.c | 8 +++---- .../mellanox/mlx5/core/sf/vhca_event.c | 22 +++++++++---------- .../mellanox/mlx5/core/sf/vhca_event.h | 7 +++--- 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c index b265f27b2166..90b524c59f3c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -181,15 +181,13 @@ static int mlx5_sf_dev_vhca_arm_all(struct mlx5_sf_dev_table *table) u16 max_functions; u16 function_id; int err = 0; - bool ecpu; int i; max_functions = mlx5_sf_max_functions(dev); function_id = MLX5_CAP_GEN(dev, sf_base_id); - ecpu = mlx5_read_embedded_cpu(dev); /* Arm the vhca context as the vhca event notifier */ for (i = 0; i < max_functions; i++) { - err = mlx5_vhca_event_arm(dev, function_id, ecpu); + err = mlx5_vhca_event_arm(dev, function_id); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c index 0914909806cb..a5a0f60bef66 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c @@ -6,7 +6,7 @@ #include "sf.h" #include "mlx5_ifc_vhca_event.h" #include "vhca_event.h" -#include "ecpf.h" +#include "mlx5_core.h" struct mlx5_sf_hw { u32 usr_sfnum; @@ -18,7 +18,6 @@ struct mlx5_sf_hw_table { struct mlx5_core_dev *dev; struct mlx5_sf_hw *sfs; int max_local_functions; - u8 ecpu: 1; struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. */ struct notifier_block vhca_nb; }; @@ -72,7 +71,7 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum) if (err) goto err; - err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, table->ecpu, usr_sfnum); + err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, usr_sfnum); if (err) goto vhca_err; @@ -118,7 +117,7 @@ void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u16 id) hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id); mutex_lock(&table->table_lock); - err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, table->ecpu, out, sizeof(out)); + err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, out, sizeof(out)); if (err) goto err; state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state); @@ -164,7 +163,6 @@ int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev) table->dev = dev; table->sfs = sfs; table->max_local_functions = max_functions; - table->ecpu = mlx5_read_embedded_cpu(dev); dev->priv.sf_hw_table = table; mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c index f1c2068d4f2a..28b14b05086f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c @@ -19,52 +19,51 @@ struct mlx5_vhca_event_work { struct mlx5_vhca_state_event event; }; -int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, - bool ecpu, u32 *out, u32 outlen) +int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, u32 *out, u32 outlen) { u32 in[MLX5_ST_SZ_DW(query_vhca_state_in)] = {}; MLX5_SET(query_vhca_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_STATE); MLX5_SET(query_vhca_state_in, in, function_id, function_id); - MLX5_SET(query_vhca_state_in, in, embedded_cpu_function, ecpu); + MLX5_SET(query_vhca_state_in, in, embedded_cpu_function, 0); return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); } static int mlx5_cmd_modify_vhca_state(struct mlx5_core_dev *dev, u16 function_id, - bool ecpu, u32 *in, u32 inlen) + u32 *in, u32 inlen) { u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {}; MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE); MLX5_SET(modify_vhca_state_in, in, function_id, function_id); - MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, ecpu); + MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, 0); return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); } -int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, bool ecpu, u32 sw_fn_id) +int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, u32 sw_fn_id) { u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {}; MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE); MLX5_SET(modify_vhca_state_in, in, function_id, function_id); - MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, ecpu); + MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, 0); MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.sw_function_id, 1); MLX5_SET(modify_vhca_state_in, in, vhca_state_context.sw_function_id, sw_fn_id); return mlx5_cmd_exec_inout(dev, modify_vhca_state, in, out); } -int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id, bool ecpu) +int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id) { u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {}; MLX5_SET(modify_vhca_state_in, in, vhca_state_context.arm_change_event, 1); MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.arm_change_event, 1); - return mlx5_cmd_modify_vhca_state(dev, function_id, ecpu, in, sizeof(in)); + return mlx5_cmd_modify_vhca_state(dev, function_id, in, sizeof(in)); } static void @@ -73,7 +72,7 @@ mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event * u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {}; int err; - err = mlx5_cmd_query_vhca_state(dev, event->function_id, event->ecpu, out, sizeof(out)); + err = mlx5_cmd_query_vhca_state(dev, event->function_id, out, sizeof(out)); if (err) return; @@ -82,7 +81,7 @@ mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event * event->new_vhca_state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state); - mlx5_vhca_event_arm(dev, event->function_id, event->ecpu); + mlx5_vhca_event_arm(dev, event->function_id); blocking_notifier_call_chain(&dev->priv.vhca_state_notifier->n_head, 0, event); } @@ -111,7 +110,6 @@ mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, v INIT_WORK(&work->work, &mlx5_vhca_state_work_handler); work->notifier = notifier; work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id); - work->event.ecpu = be16_to_cpu(eqe->data.vhca_state.ec_function); mlx5_events_work_enqueue(notifier->dev, &work->work); return NOTIFY_OK; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h index 1fe1ec6f4d4b..013cdfe90616 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h @@ -10,7 +10,6 @@ struct mlx5_vhca_state_event { u16 function_id; u16 sw_function_id; u8 new_vhca_state; - bool ecpu; }; static inline bool mlx5_vhca_event_supported(const struct mlx5_core_dev *dev) @@ -25,10 +24,10 @@ void mlx5_vhca_event_start(struct mlx5_core_dev *dev); void mlx5_vhca_event_stop(struct mlx5_core_dev *dev); int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb); void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb); -int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, bool ecpu, u32 sw_fn_id); -int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id, bool ecpu); +int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, u32 sw_fn_id); +int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id); int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, - bool ecpu, u32 *out, u32 outlen); + u32 *out, u32 outlen); #else static inline void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap) From 5ee7d4c7fbc9d3119a20b1c77d34003d1f82ac26 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Mar 2021 17:44:29 +0100 Subject: [PATCH 163/171] isdn: capi: fix mismatched prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc-11 complains about a prototype declaration that is different from the function definition: drivers/isdn/capi/kcapi.c:724:44: error: argument 2 of type ‘u8 *’ {aka ‘unsigned char *’} declared as a pointer [-Werror=array-parameter=] 724 | u16 capi20_get_manufacturer(u32 contr, u8 *buf) | ~~~~^~~ In file included from drivers/isdn/capi/kcapi.c:13: drivers/isdn/capi/kcapi.h:62:43: note: previously declared as an array ‘u8[64]’ {aka ‘unsigned char[64]’} 62 | u16 capi20_get_manufacturer(u32 contr, u8 buf[CAPI_MANUFACTURER_LEN]); | ~~~^~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/isdn/capi/kcapi.c:790:38: error: argument 2 of type ‘u8 *’ {aka ‘unsigned char *’} declared as a pointer [-Werror=array-parameter=] 790 | u16 capi20_get_serial(u32 contr, u8 *serial) | ~~~~^~~~~~ In file included from drivers/isdn/capi/kcapi.c:13: drivers/isdn/capi/kcapi.h:64:37: note: previously declared as an array ‘u8[8]’ {aka ‘unsigned char[8]’} 64 | u16 capi20_get_serial(u32 contr, u8 serial[CAPI_SERIAL_LEN]); | ~~~^~~~~~~~~~~~~~~~~~~~~~~ Change the definition to make them match. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/isdn/capi/kcapi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 7168778fbbe1..cb0afe897162 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -721,7 +721,7 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) * Return value: CAPI result code */ -u16 capi20_get_manufacturer(u32 contr, u8 *buf) +u16 capi20_get_manufacturer(u32 contr, u8 buf[CAPI_MANUFACTURER_LEN]) { struct capi_ctr *ctr; u16 ret; @@ -787,7 +787,7 @@ u16 capi20_get_version(u32 contr, struct capi_version *verp) * Return value: CAPI result code */ -u16 capi20_get_serial(u32 contr, u8 *serial) +u16 capi20_get_serial(u32 contr, u8 serial[CAPI_SERIAL_LEN]) { struct capi_ctr *ctr; u16 ret; From e0c755a45f6fb6e81e3a62a94db0400ef0cdc046 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Mon, 22 Mar 2021 15:26:50 -0500 Subject: [PATCH 164/171] net: dsa: don't assign an error value to tag_ops Use a temporary variable to hold the return value from dsa_tag_driver_get() instead of assigning it to dst->tag_ops. Leaving an error value in dst->tag_ops can result in deferencing an invalid pointer when a deferred switch configuration happens later. Fixes: 357f203bb3b5 ("net: dsa: keep a copy of the tagging protocol in the DSA switch tree") Signed-off-by: George McCollister Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 4d4956ed303b..d142eb2b288b 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1066,6 +1066,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) { struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; tag_protocol = dsa_get_tag_protocol(dp, master); @@ -1080,14 +1081,16 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) * nothing to do here. */ } else { - dst->tag_ops = dsa_tag_driver_get(tag_protocol); - if (IS_ERR(dst->tag_ops)) { - if (PTR_ERR(dst->tag_ops) == -ENOPROTOOPT) + tag_ops = dsa_tag_driver_get(tag_protocol); + if (IS_ERR(tag_ops)) { + if (PTR_ERR(tag_ops) == -ENOPROTOOPT) return -EPROBE_DEFER; dev_warn(ds->dev, "No tagger for this switch\n"); dp->master = NULL; - return PTR_ERR(dst->tag_ops); + return PTR_ERR(tag_ops); } + + dst->tag_ops = tag_ops; } dp->master = master; From 8ca1b090e5c9a71abeea1dda8757f4ec3811f06e Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 22 Mar 2021 15:13:22 -0300 Subject: [PATCH 165/171] net/sched: act_ct: clear post_ct if doing ct_clear Invalid detection works with two distinct moments: act_ct tries to find a conntrack entry and set post_ct true, indicating that that was attempted. Then, when flow dissector tries to dissect CT info and no entry is there, it knows that it was tried and no entry was found, and synthesizes/sets key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; mimicing what OVS does. OVS has this a bit more streamlined, as it recomputes the key after trying to find a conntrack entry for it. Issue here is, when we have 'tc action ct clear', it didn't clear post_ct, causing a subsequent match on 'ct_state -trk' to fail, due to the above. The fix, thus, is to clear it. Reproducer rules: tc filter add dev enp130s0f0np0_0 ingress prio 1 chain 0 \ protocol ip flower ip_proto tcp ct_state -trk \ action ct zone 1 pipe \ action goto chain 2 tc filter add dev enp130s0f0np0_0 ingress prio 1 chain 2 \ protocol ip flower \ action ct clear pipe \ action goto chain 4 tc filter add dev enp130s0f0np0_0 ingress prio 1 chain 4 \ protocol ip flower ct_state -trk \ action mirred egress redirect dev enp130s0f1np1_0 With the fix, the 3rd rule matches, like it does with OVS kernel datapath. Fixes: 7baf2429a1a9 ("net/sched: cls_flower add CT_FLAGS_INVALID flag support") Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: wenxu Signed-off-by: David S. Miller --- net/sched/act_ct.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index f0a0aa125b00..16e888a9601d 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -945,13 +945,14 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&c->tcf_tm); if (clear) { + qdisc_skb_cb(skb)->post_ct = false; ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_conntrack_put(&ct->ct_general); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } - goto out; + goto out_clear; } family = tcf_ct_skb_nf_family(skb); @@ -1030,8 +1031,9 @@ out_push: skb_push_rcsum(skb, nh_ofs); out: - tcf_action_update_bstats(&c->common, skb); qdisc_skb_cb(skb)->post_ct = true; +out_clear: + tcf_action_update_bstats(&c->common, skb); if (defrag) qdisc_skb_cb(skb)->pkt_len = skb->len; return retval; From 6ab4c3117aec4e08007d9e971fa4133e1de1082d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 22 Mar 2021 20:21:08 +0200 Subject: [PATCH 166/171] net: bridge: don't notify switchdev for local FDB addresses As explained in this discussion: https://lore.kernel.org/netdev/20210117193009.io3nungdwuzmo5f7@skbuf/ the switchdev notifiers for FDB entries managed to have a zero-day bug. The bridge would not say that this entry is local: ip link add br0 type bridge ip link set swp0 master br0 bridge fdb add dev swp0 00:01:02:03:04:05 master local and the switchdev driver would be more than happy to offload it as a normal static FDB entry. This is despite the fact that 'local' and non-'local' entries have completely opposite directions: a local entry is locally terminated and not forwarded, whereas a static entry is forwarded and not locally terminated. So, for example, DSA would install this entry on swp0 instead of installing it on the CPU port as it should. There is an even sadder part, which is that the 'local' flag is implicit if 'static' is not specified, meaning that this command produces the same result of adding a 'local' entry: bridge fdb add dev swp0 00:01:02:03:04:05 master I've updated the man pages for 'bridge', and after reading it now, it should be pretty clear to any user that the commands above were broken and should have never resulted in the 00:01:02:03:04:05 address being forwarded (this behavior is coherent with non-switchdev interfaces): https://patchwork.kernel.org/project/netdevbpf/cover/20210211104502.2081443-1-olteanv@gmail.com/ If you're a user reading this and this is what you want, just use: bridge fdb add dev swp0 00:01:02:03:04:05 master static Because switchdev should have given drivers the means from day one to classify FDB entries as local/non-local, but didn't, it means that all drivers are currently broken. So we can just as well omit the switchdev notifications for local FDB entries, which is exactly what this patch does to close the bug in stable trees. For further development work where drivers might want to trap the local FDB entries to the host, we can add a 'bool is_local' to br_switchdev_fdb_call_notifiers(), and selectively make drivers act upon that bit, while all the others ignore those entries if the 'is_local' bit is set. Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about FDB add/del") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/bridge/br_switchdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index b89503832fcc..1e24d9a2c9a7 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -128,6 +128,8 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) { if (!fdb->dst) return; + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) + return; switch (type) { case RTM_DELNEIGH: From f51d7bf1dbe5522c51c93fe8faa5f4abbdf339cd Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 23 Mar 2021 16:02:29 +0800 Subject: [PATCH 167/171] ptp_qoriq: fix overflow in ptp_qoriq_adjfine() u64 calcalation Current calculation for diff of TMR_ADD register value may have 64-bit overflow in this code line, when long type scaled_ppm is large. adj *= scaled_ppm; This patch is to resolve it by using mul_u64_u64_div_u64(). Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/ptp/ptp_qoriq.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index beb5f74944cd..08f4cf0ad9e3 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -189,15 +189,16 @@ int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) tmr_add = ptp_qoriq->tmr_add; adj = tmr_add; - /* calculate diff as adj*(scaled_ppm/65536)/1000000 - * and round() to the nearest integer + /* + * Calculate diff and round() to the nearest integer + * + * diff = adj * (ppb / 1000000000) + * = adj * scaled_ppm / 65536000000 */ - adj *= scaled_ppm; - diff = div_u64(adj, 8000000); - diff = (diff >> 13) + ((diff >> 12) & 1); + diff = mul_u64_u64_div_u64(adj, scaled_ppm, 32768000000); + diff = DIV64_U64_ROUND_UP(diff, 2); tmr_add = neg_adj ? tmr_add - diff : tmr_add + diff; - ptp_qoriq->write(®s->ctrl_regs->tmr_add, tmr_add); return 0; From 9e0a537d06fc36861e4f78d0a7df1fe2b3592714 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 23 Mar 2021 12:32:45 +0000 Subject: [PATCH 168/171] octeontx2-af: Fix memory leak of object buf Currently the error return path when lfs fails to allocate is not free'ing the memory allocated to buf. Fix this by adding the missing kfree. Addresses-Coverity: ("Resource leak") Fixes: f7884097141b ("octeontx2-af: Formatting debugfs entry rsrc_alloc.") Signed-off-by: Colin Ian King Acked-by: Sunil Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index b4c53b19f535..de3968d2e5ce 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -253,8 +253,10 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, return -ENOSPC; lfs = kzalloc(lf_str_size, GFP_KERNEL); - if (!lfs) + if (!lfs) { + kfree(buf); return -ENOMEM; + } off += scnprintf(&buf[off], buf_size - 1 - off, "%-*s", lf_str_size, "pcifunc"); for (index = 0; index < BLK_COUNT; index++) From 6f235a69e59484e382dc31952025b0308efedc17 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Mar 2021 22:52:50 +0100 Subject: [PATCH 169/171] ch_ktls: fix enum-conversion warning gcc points out an incorrect enum assignment: drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c: In function 'chcr_ktls_cpl_set_tcb_rpl': drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c:684:22: warning: implicit conversion from 'enum ' to 'enum ch_ktls_open_state' [-Wenum-conversion] This appears harmless, and should apparently use 'CH_KTLS_OPEN_SUCCESS' instead of 'false', with the same value '0'. Fixes: efca3878a5fb ("ch_ktls: Issue if connection offload fails") Reviewed-by: Andrew Lunn Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 169e10c91378..1115b8f9ea4e 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -722,7 +722,7 @@ static int chcr_ktls_cpl_set_tcb_rpl(struct adapter *adap, unsigned char *input) kvfree(tx_info); return 0; } - tx_info->open_state = false; + tx_info->open_state = CH_KTLS_OPEN_SUCCESS; spin_unlock(&tx_info->lock); complete(&tx_info->completion); From bf45947864764548697e7515fe693e10f173f312 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 24 Mar 2021 16:42:54 -0700 Subject: [PATCH 170/171] math: Export mul_u64_u64_div_u64 Fixes: f51d7bf1dbe5 ("ptp_qoriq: fix overflow in ptp_qoriq_adjfine() u64 calcalation") Signed-off-by: David S. Miller --- lib/math/div64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/math/div64.c b/lib/math/div64.c index 064d68a5391a..46866394fc84 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -232,4 +232,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) return res + div64_u64(a * b, c); } +EXPORT_SYMBOL(mul_u64_u64_div_u64); #endif From e43accba9b071dcd106b5e7643b1b106a158cbb1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 24 Mar 2021 21:43:32 +0200 Subject: [PATCH 171/171] psample: Fix user API breakage Cited commit added a new attribute before the existing group reference count attribute, thereby changing its value and breaking existing applications on new kernels. Before: # psample -l libpsample ERROR psample_group_foreach: failed to recv message: Operation not supported After: # psample -l Group Num Refcount Group Seq 1 1 0 Fix by restoring the value of the old attribute and remove the misleading comments from the enumerator to avoid future bugs. Cc: stable@vger.kernel.org Fixes: d8bed686ab96 ("net: psample: Add tunnel support") Signed-off-by: Ido Schimmel Reported-by: Adiel Bidani Reviewed-by: Jiri Pirko Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- include/uapi/linux/psample.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h index aea26ab1431c..bff5032c98df 100644 --- a/include/uapi/linux/psample.h +++ b/include/uapi/linux/psample.h @@ -3,7 +3,6 @@ #define __UAPI_PSAMPLE_H enum { - /* sampled packet metadata */ PSAMPLE_ATTR_IIFINDEX, PSAMPLE_ATTR_OIFINDEX, PSAMPLE_ATTR_ORIGSIZE, @@ -11,10 +10,8 @@ enum { PSAMPLE_ATTR_GROUP_SEQ, PSAMPLE_ATTR_SAMPLE_RATE, PSAMPLE_ATTR_DATA, - PSAMPLE_ATTR_TUNNEL, - - /* commands attributes */ PSAMPLE_ATTR_GROUP_REFCOUNT, + PSAMPLE_ATTR_TUNNEL, __PSAMPLE_ATTR_MAX };