From 2c82c7e724ff51cab78e1afd5c2aaa31994fe41e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Apr 2019 14:53:11 +0200 Subject: [PATCH 01/89] netfilter: nf_tables: fix oops during rule dump We can oops in nf_tables_fill_rule_info(). Its not possible to fetch previous element in rcu-protected lists when deletions are not prevented somehow: list_del_rcu poisons the ->prev pointer value. Before rcu-conversion this was safe as dump operations did hold nfnetlink mutex. Pass previous rule as argument, obtained by keeping a pointer to the previous rule during traversal. Fixes: d9adf22a291883 ("netfilter: nf_tables: use call_rcu in netlink dumps") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 28241e82fd15..4b5159936034 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2270,13 +2270,13 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule) + const struct nft_rule *rule, + const struct nft_rule *prule) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; const struct nft_expr *expr, *next; struct nlattr *list; - const struct nft_rule *prule; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); @@ -2296,8 +2296,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, NFTA_RULE_PAD)) goto nla_put_failure; - if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { - prule = list_prev_entry(rule, list); + if (event != NFT_MSG_DELRULE && prule) { if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(prule->handle), NFTA_RULE_PAD)) @@ -2344,7 +2343,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, ctx->family, ctx->table, - ctx->chain, rule); + ctx->chain, rule, NULL); if (err < 0) { kfree_skb(skb); goto err; @@ -2369,12 +2368,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, const struct nft_chain *chain) { struct net *net = sock_net(skb->sk); + const struct nft_rule *rule, *prule; unsigned int s_idx = cb->args[0]; - const struct nft_rule *rule; + prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, rule)) - goto cont; + goto cont_skip; if (*idx < s_idx) goto cont; if (*idx > s_idx) { @@ -2386,11 +2386,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule) < 0) + table, chain, rule, prule) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: + prule = rule; +cont_skip: (*idx)++; } return 0; @@ -2546,7 +2548,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule); + family, table, chain, rule, NULL); if (err < 0) goto err; From 946c0d8e6ed43dae6527e878d0077c1e11015db0 Mon Sep 17 00:00:00 2001 From: Jagdish Motwani Date: Mon, 13 May 2019 23:47:40 +0530 Subject: [PATCH 02/89] netfilter: nf_queue: fix reinject verdict handling This patch fixes netfilter hook traversal when there are more than 1 hooks returning NF_QUEUE verdict. When the first queue reinjects the packet, 'nf_reinject' starts traversing hooks with a proper hook_index. However, if it again receives a NF_QUEUE verdict (by some other netfilter hook), it queues the packet with a wrong hook_index. So, when the second queue reinjects the packet, it re-executes hooks in between. Fixes: 960632ece694 ("netfilter: convert hook list to an array") Signed-off-by: Jagdish Motwani Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_queue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 9dc1d6e04946..b5b2be55ca82 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -255,6 +255,7 @@ static unsigned int nf_iterate(struct sk_buff *skb, repeat: verdict = nf_hook_entry_hookfn(hook, skb, state); if (verdict != NF_ACCEPT) { + *index = i; if (verdict != NF_REPEAT) return verdict; goto repeat; From e633508a95289489d28faacb68b32c3e7e68ef6f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 15 May 2019 20:15:32 +0200 Subject: [PATCH 03/89] netfilter: nft_fib: Fix existence check support NFTA_FIB_F_PRESENT flag was not always honored since eval functions did not call nft_fib_store_result in all cases. Given that in all callsites there is a struct net_device pointer available which holds the interface data to be stored in destination register, simplify nft_fib_store_result() to just accept that pointer instead of the nft_pktinfo pointer and interface index. This also allows to drop the index to interface lookup previously needed to get the name associated with given index. Fixes: 055c4b34b94f6 ("netfilter: nft_fib: Support existence check") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 2 +- net/ipv4/netfilter/nft_fib_ipv4.c | 23 +++-------------------- net/ipv6/netfilter/nft_fib_ipv6.c | 16 ++-------------- net/netfilter/nft_fib.c | 6 +++--- 4 files changed, 9 insertions(+), 38 deletions(-) diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index a88f92737308..e4c4d8eaca8c 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -34,5 +34,5 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_fib_store_result(void *reg, const struct nft_fib *priv, - const struct nft_pktinfo *pkt, int index); + const struct net_device *dev); #endif diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 94eb25bc8d7e..c8888e52591f 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -58,11 +58,6 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, } EXPORT_SYMBOL_GPL(nft_fib4_eval_type); -static int get_ifindex(const struct net_device *dev) -{ - return dev ? dev->ifindex : 0; -} - void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -94,8 +89,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, pkt, - nft_in(pkt)->ifindex); + nft_fib_store_result(dest, priv, nft_in(pkt)); return; } @@ -108,8 +102,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (ipv4_is_zeronet(iph->saddr)) { if (ipv4_is_lbcast(iph->daddr) || ipv4_is_local_multicast(iph->daddr)) { - nft_fib_store_result(dest, priv, pkt, - get_ifindex(pkt->skb->dev)); + nft_fib_store_result(dest, priv, pkt->skb->dev); return; } } @@ -150,17 +143,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, found = oif; } - switch (priv->result) { - case NFT_FIB_RESULT_OIF: - *dest = found->ifindex; - break; - case NFT_FIB_RESULT_OIFNAME: - strncpy((char *)dest, found->name, IFNAMSIZ); - break; - default: - WARN_ON_ONCE(1); - break; - } + nft_fib_store_result(dest, priv, found); } EXPORT_SYMBOL_GPL(nft_fib4_eval); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 73cdc0bc63f7..ec068b0cffca 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -169,8 +169,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, pkt, - nft_in(pkt)->ifindex); + nft_fib_store_result(dest, priv, nft_in(pkt)); return; } @@ -187,18 +186,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (oif && oif != rt->rt6i_idev->dev) goto put_rt_err; - switch (priv->result) { - case NFT_FIB_RESULT_OIF: - *dest = rt->rt6i_idev->dev->ifindex; - break; - case NFT_FIB_RESULT_OIFNAME: - strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ); - break; - default: - WARN_ON_ONCE(1); - break; - } - + nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); put_rt_err: ip6_rt_put(rt); } diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 21df8cccea65..77f00a99dfab 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -135,17 +135,17 @@ int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) EXPORT_SYMBOL_GPL(nft_fib_dump); void nft_fib_store_result(void *reg, const struct nft_fib *priv, - const struct nft_pktinfo *pkt, int index) + const struct net_device *dev) { - struct net_device *dev; u32 *dreg = reg; + int index; switch (priv->result) { case NFT_FIB_RESULT_OIF: + index = dev ? dev->ifindex : 0; *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index; break; case NFT_FIB_RESULT_OIFNAME: - dev = dev_get_by_index_rcu(nft_net(pkt), index); if (priv->flags & NFTA_FIB_F_PRESENT) *dreg = !!dev; else From 719c7d563c17b150877cee03a4b812a424989dfa Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 17 May 2019 22:31:49 +0800 Subject: [PATCH 04/89] ipvs: Fix use-after-free in ip_vs_in BUG: KASAN: use-after-free in ip_vs_in.part.29+0xe8/0xd20 [ip_vs] Read of size 4 at addr ffff8881e9b26e2c by task sshd/5603 CPU: 0 PID: 5603 Comm: sshd Not tainted 4.19.39+ #30 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Call Trace: dump_stack+0x71/0xab print_address_description+0x6a/0x270 kasan_report+0x179/0x2c0 ip_vs_in.part.29+0xe8/0xd20 [ip_vs] ip_vs_in+0xd8/0x170 [ip_vs] nf_hook_slow+0x5f/0xe0 __ip_local_out+0x1d5/0x250 ip_local_out+0x19/0x60 __tcp_transmit_skb+0xba1/0x14f0 tcp_write_xmit+0x41f/0x1ed0 ? _copy_from_iter_full+0xca/0x340 __tcp_push_pending_frames+0x52/0x140 tcp_sendmsg_locked+0x787/0x1600 ? tcp_sendpage+0x60/0x60 ? inet_sk_set_state+0xb0/0xb0 tcp_sendmsg+0x27/0x40 sock_sendmsg+0x6d/0x80 sock_write_iter+0x121/0x1c0 ? sock_sendmsg+0x80/0x80 __vfs_write+0x23e/0x370 vfs_write+0xe7/0x230 ksys_write+0xa1/0x120 ? __ia32_sys_read+0x50/0x50 ? __audit_syscall_exit+0x3ce/0x450 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7ff6f6147c60 Code: 73 01 c3 48 8b 0d 28 12 2d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 5d 73 2d 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 RSP: 002b:00007ffd772ead18 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000034 RCX: 00007ff6f6147c60 RDX: 0000000000000034 RSI: 000055df30a31270 RDI: 0000000000000003 RBP: 000055df30a31270 R08: 0000000000000000 R09: 0000000000000000 R10: 00007ffd772ead70 R11: 0000000000000246 R12: 00007ffd772ead74 R13: 00007ffd772eae20 R14: 00007ffd772eae24 R15: 000055df2f12ddc0 Allocated by task 6052: kasan_kmalloc+0xa0/0xd0 __kmalloc+0x10a/0x220 ops_init+0x97/0x190 register_pernet_operations+0x1ac/0x360 register_pernet_subsys+0x24/0x40 0xffffffffc0ea016d do_one_initcall+0x8b/0x253 do_init_module+0xe3/0x335 load_module+0x2fc0/0x3890 __do_sys_finit_module+0x192/0x1c0 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 6067: __kasan_slab_free+0x130/0x180 kfree+0x90/0x1a0 ops_free_list.part.7+0xa6/0xc0 unregister_pernet_operations+0x18b/0x1f0 unregister_pernet_subsys+0x1d/0x30 ip_vs_cleanup+0x1d/0xd2f [ip_vs] __x64_sys_delete_module+0x20c/0x300 do_syscall_64+0x73/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff8881e9b26600 which belongs to the cache kmalloc-4096 of size 4096 The buggy address is located 2092 bytes inside of 4096-byte region [ffff8881e9b26600, ffff8881e9b27600) The buggy address belongs to the page: page:ffffea0007a6c800 count:1 mapcount:0 mapping:ffff888107c0e600 index:0x0 compound_mapcount: 0 flags: 0x17ffffc0008100(slab|head) raw: 0017ffffc0008100 dead000000000100 dead000000000200 ffff888107c0e600 raw: 0000000000000000 0000000080070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected while unregistering ipvs module, ops_free_list calls __ip_vs_cleanup, then nf_unregister_net_hooks be called to do remove nf hook entries. It need a RCU period to finish, however net->ipvs is set to NULL immediately, which will trigger NULL pointer dereference when a packet is hooked and handled by ip_vs_in where net->ipvs is dereferenced. Another scene is ops_free_list call ops_free to free the net_generic directly while __ip_vs_cleanup finished, then calling ip_vs_in will triggers use-after-free. This patch moves nf_unregister_net_hooks from __ip_vs_cleanup() to __ip_vs_dev_cleanup(), where rcu_barrier() is called by unregister_pernet_device -> unregister_pernet_operations, that will do the needed grace period. Reported-by: Hulk Robot Fixes: efe41606184e ("ipvs: convert to use pernet nf_hook api") Suggested-by: Julian Anastasov Signed-off-by: YueHaibing Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 14457551bcb4..8ebf21149ec3 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2312,7 +2312,6 @@ static void __net_exit __ip_vs_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ ip_vs_conn_net_cleanup(ipvs); ip_vs_app_net_cleanup(ipvs); @@ -2327,6 +2326,7 @@ static void __net_exit __ip_vs_dev_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); EnterFunction(2); + nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); From 82ce6eb1dd13fd12e449b2ee2c2ec051e6f52c43 Mon Sep 17 00:00:00 2001 From: Jeffrin Jose T Date: Wed, 15 May 2019 12:14:04 +0530 Subject: [PATCH 05/89] selftests: netfilter: missing error check when setting up veth interface A test for the basic NAT functionality uses ip command which needs veth device. There is a condition where the kernel support for veth is not compiled into the kernel and the test script breaks. This patch contains code for reasonable error display and correct code exit. Signed-off-by: Jeffrin Jose T Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_nat.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index 21159f5f3362..f102a65aa8b7 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -24,7 +24,11 @@ ip netns add ns0 ip netns add ns1 ip netns add ns2 -ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 +ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi ip link add veth1 netns ns0 type veth peer name eth0 netns ns2 ip -net ns0 link set lo up From 6bac76db1da3cb162c425d58ae421486f8e43955 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 20 May 2019 13:48:10 +0200 Subject: [PATCH 06/89] netfilter: nat: fix udp checksum corruption Due to copy&paste error nf_nat_mangle_udp_packet passes IPPROTO_TCP, resulting in incorrect udp checksum when payload had to be mangled. Fixes: dac3fe72596f9 ("netfilter: nat: remove csum_recalc hook") Reported-by: Marc Haber Tested-by: Marc Haber Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index ccc06f7539d7..53aeb12b70fb 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -170,7 +170,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) return true; - nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_TCP, + nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_UDP, udph, &udph->check, datalen, oldlen); return true; From e75b3e1c9bc5b997d09bdf8eb72ab3dd3c1a7072 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:30 +0200 Subject: [PATCH 07/89] netfilter: nf_flow_table: ignore DF bit setting Its irrelevant if the DF bit is set or not, we must pass packet to stack in either case. If the DF bit is set, we must pass it to stack so the appropriate ICMP error can be generated. If the DF is not set, we must pass it to stack for fragmentation. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 0d603e20b519..bfd44db9f214 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -243,8 +243,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; outdev = rt->dst.dev; - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && - (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; if (skb_try_make_writable(skb, sizeof(*iph))) From 8437a6209f76f85a2db1abb12a9bde2170801617 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:31 +0200 Subject: [PATCH 08/89] netfilter: nft_flow_offload: set liberal tracking mode for tcp Without it, whenever a packet has to be pushed up the stack (e.g. because of mtu mismatch), then conntrack will flag packets as invalid, which in turn breaks NAT. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 69d7a8439c7a..bde63d9c3c4e 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -72,6 +72,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nf_flow_route route; struct flow_offload *flow; enum ip_conntrack_dir dir; + bool is_tcp = false; struct nf_conn *ct; int ret; @@ -84,6 +85,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: + is_tcp = true; + break; case IPPROTO_UDP: break; default: @@ -108,6 +111,11 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (!flow) goto err_flow_alloc; + if (is_tcp) { + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + } + ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; From 91a9048f238063dde7feea752b9dd386f7e3808b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:32 +0200 Subject: [PATCH 09/89] netfilter: nft_flow_offload: don't offload when sequence numbers need adjustment We can't deal with tcp sequence number rewrite in flow_offload. While at it, simplify helper check, we only need to know if the extension is present, we don't need the helper data. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index bde63d9c3c4e..c97c03c3939a 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -12,7 +12,6 @@ #include #include #include -#include struct nft_flow_offload { struct nft_flowtable *flowtable; @@ -67,7 +66,6 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; - const struct nf_conn_help *help; enum ip_conntrack_info ctinfo; struct nf_flow_route route; struct flow_offload *flow; @@ -93,8 +91,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto out; } - help = nfct_help(ct); - if (help) + if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || + ct->status & IPS_SEQ_ADJUST) goto out; if (!nf_ct_is_confirmed(ct)) From 69aeb538587e087bfc81dd1f465eab3558ff3158 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:33 +0200 Subject: [PATCH 10/89] netfilter: nft_flow_offload: IPCB is only valid for ipv4 family Guard this with a check vs. ipv4, IPCB isn't valid in ipv6 case. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index c97c03c3939a..d70742e95e14 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -48,15 +48,20 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, return 0; } -static bool nft_flow_offload_skip(struct sk_buff *skb) +static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { - struct ip_options *opt = &(IPCB(skb)->opt); - - if (unlikely(opt->optlen)) - return true; if (skb_sec_path(skb)) return true; + if (family == NFPROTO_IPV4) { + const struct ip_options *opt; + + opt = &(IPCB(skb)->opt); + + if (unlikely(opt->optlen)) + return true; + } + return false; } @@ -74,7 +79,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nf_conn *ct; int ret; - if (nft_flow_offload_skip(pkt->skb)) + if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); From 2de03b45236f3af1800755614fd434d347adf046 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 May 2019 13:24:34 +0200 Subject: [PATCH 11/89] selftests: netfilter: add flowtable test script Exercises 3 cases: 1. no pmtu discovery (need to frag) 2. no PMTUd + NAT (don't flag packets as invalid from conntrack) 3. PMTU + NAT (need to send icmp error) The first two cases make sure we handle fragments correctly, i.e. pass them to classic forwarding path. Third case checks we offload everything (in the test case, PMTUd will kick in so all packets should be within link mtu). Nftables rules will filter packets that are supposed to be handled by the fast-path. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/Makefile | 2 +- .../selftests/netfilter/nft_flowtable.sh | 324 ++++++++++++++++++ 2 files changed, 325 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/nft_flowtable.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 3e6d1bcc2894..4144984ebee5 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -2,6 +2,6 @@ # Makefile for netfilter selftests TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ - conntrack_icmp_related.sh + conntrack_icmp_related.sh nft_flowtable.sh include ../lib.mk diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh new file mode 100755 index 000000000000..fe52488a6f72 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -0,0 +1,324 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This tests basic flowtable functionality. +# Creates following topology: +# +# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000) +# Router1 is the one doing flow offloading, Router2 has no special +# purpose other than having a link that is smaller than either Originator +# and responder, i.e. TCPMSS announced values are too large and will still +# result in fragmentation and/or PMTU discovery. + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +ns1in="" +ns2in="" +ns1out="" +ns2out="" + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +which nc > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nc (netcat)" + exit $ksft_skip +fi + +ip netns add nsr1 +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace" + exit $ksft_skip +fi + +ip netns add ns1 +ip netns add ns2 + +ip netns add nsr2 + +cleanup() { + for i in 1 2; do + ip netns del ns$i + ip netns del nsr$i + done + + rm -f "$ns1in" "$ns1out" + rm -f "$ns2in" "$ns2out" + + [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +trap cleanup EXIT + +sysctl -q net.netfilter.nf_log_all_netns=1 + +ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1 +ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2 + +ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2 + +for dev in lo veth0 veth1; do + for i in 1 2; do + ip -net nsr$i link set $dev up + done +done + +ip -net nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net nsr1 addr add dead:1::1/64 dev veth0 + +ip -net nsr2 addr add 10.0.2.1/24 dev veth1 +ip -net nsr2 addr add dead:2::1/64 dev veth1 + +# set different MTUs so we need to push packets coming from ns1 (large MTU) +# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), +# or to do PTMU discovery (send ICMP error back to originator). +# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers +# is NOT the lowest link mtu. + +ip -net nsr1 link set veth0 mtu 9000 +ip -net ns1 link set eth0 mtu 9000 + +ip -net nsr2 link set veth1 mtu 2000 +ip -net ns2 link set eth0 mtu 2000 + +# transfer-net between nsr1 and nsr2. +# these addresses are not used for connections. +ip -net nsr1 addr add 192.168.10.1/24 dev veth1 +ip -net nsr1 addr add fee1:2::1/64 dev veth1 + +ip -net nsr2 addr add 192.168.10.2/24 dev veth0 +ip -net nsr2 addr add fee1:2::2/64 dev veth0 + +for i in 1 2; do + ip netns exec nsr$i sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec nsr$i sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip -net ns$i link set lo up + ip -net ns$i link set eth0 up + ip -net ns$i addr add 10.0.$i.99/24 dev eth0 + ip -net ns$i route add default via 10.0.$i.1 + ip -net ns$i addr add dead:$i::99/64 dev eth0 + ip -net ns$i route add default via dead:$i::1 + ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null + + # don't set ip DF bit for first two tests + ip netns exec ns$i sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null +done + +ip -net nsr1 route add default via 192.168.10.2 +ip -net nsr2 route add default via 192.168.10.1 + +ip netns exec nsr1 nft -f - < /dev/null +if [ $? -ne 0 ];then + echo "ERROR: ns1 cannot reach ns2" 1>&2 + bash + exit 1 +fi + +ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null +if [ $? -ne 0 ];then + echo "ERROR: ns2 cannot reach ns1" 1>&2 + exit 1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: netns routing/connectivity: ns1 can reach ns2" +fi + +ns1in=$(mktemp) +ns1out=$(mktemp) +ns2in=$(mktemp) +ns2out=$(mktemp) + +make_file() +{ + name=$1 + who=$2 + + SIZE=$((RANDOM % (1024 * 8))) + TSIZE=$((SIZE * 1024)) + + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + + SIZE=$((RANDOM % 1024)) + SIZE=$((SIZE + 128)) + TSIZE=$((TSIZE + SIZE)) + dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null +} + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? -ne 0 ] ;then + echo "FAIL: file mismatch for $what" 1>&2 + ls -l "$in" + ls -l "$out" + return 1 + fi + + return 0 +} + +test_tcp_forwarding() +{ + local nsa=$1 + local nsb=$2 + local lret=0 + + ip netns exec $nsb nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" & + lpid=$! + + sleep 1 + ip netns exec $nsa nc -w 4 10.0.2.99 12345 < "$ns1in" > "$ns1out" & + cpid=$! + + sleep 3 + + kill $lpid + kill $cpid + wait + + check_transfer "$ns1in" "$ns2out" "ns1 -> ns2" + if [ $? -ne 0 ];then + lret=1 + fi + + check_transfer "$ns2in" "$ns1out" "ns1 <- ns2" + if [ $? -ne 0 ];then + lret=1 + fi + + return $lret +} + +make_file "$ns1in" "ns1" +make_file "$ns2in" "ns2" + +# First test: +# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. +test_tcp_forwarding ns1 ns2 +if [ $? -eq 0 ] ;then + echo "PASS: flow offloaded for ns1/ns2" +else + echo "FAIL: flow offload for ns1/ns2:" 1>&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# delete default route, i.e. ns2 won't be able to reach ns1 and +# will depend on ns1 being masqueraded in nsr1. +# expect ns1 has nsr1 address. +ip -net ns2 route del default via 10.0.2.1 +ip -net ns2 route del default via dead:2::1 +ip -net ns2 route add 192.168.10.1 via 10.0.2.1 + +# Second test: +# Same, but with NAT enabled. +ip netns exec nsr1 nft -f - <&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# Third test: +# Same as second test, but with PMTU discovery enabled. +handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2) + +ip netns exec nsr1 nft delete rule inet filter forward $handle +if [ $? -ne 0 ] ;then + echo "FAIL: Could not delete large-packet accept rule" + exit 1 +fi + +ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null + +test_tcp_forwarding ns1 ns2 +if [ $? -eq 0 ] ;then + echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery" +else + echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 + ip netns exec nsr1 nft list ruleset +fi + +exit $ret From 7dc2bccab0ee37ac28096b8fcdc390a679a15841 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 21 May 2019 06:40:04 +0000 Subject: [PATCH 12/89] Validate required parameters in inet6_validate_link_af inet6_set_link_af requires that at least one of IFLA_INET6_TOKEN or IFLA_INET6_ADDR_GET_MODE is passed. If none of them is passed, it returns -EINVAL, which may cause do_setlink() to fail in the middle of processing other commands and give the following warning message: A link change request failed with some changes committed already. Interface eth0 may have been left with an inconsistent configuration, please check. Check the presence of at least one of them in inet6_validate_link_af to detect invalid parameters at an early stage, before do_setlink does anything. Also validate the address generation mode at an early stage. Signed-off-by: Maxim Mikityanskiy Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 57 ++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f96d1de79509..b51630ddb728 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5661,18 +5661,6 @@ static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = { [IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) }, }; -static int inet6_validate_link_af(const struct net_device *dev, - const struct nlattr *nla) -{ - struct nlattr *tb[IFLA_INET6_MAX + 1]; - - if (dev && !__in6_dev_get(dev)) - return -EAFNOSUPPORT; - - return nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, - inet6_af_policy, NULL); -} - static int check_addr_gen_mode(int mode) { if (mode != IN6_ADDR_GEN_MODE_EUI64 && @@ -5693,14 +5681,44 @@ static int check_stable_privacy(struct inet6_dev *idev, struct net *net, return 1; } +static int inet6_validate_link_af(const struct net_device *dev, + const struct nlattr *nla) +{ + struct nlattr *tb[IFLA_INET6_MAX + 1]; + struct inet6_dev *idev = NULL; + int err; + + if (dev) { + idev = __in6_dev_get(dev); + if (!idev) + return -EAFNOSUPPORT; + } + + err = nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, + inet6_af_policy, NULL); + if (err) + return err; + + if (!tb[IFLA_INET6_TOKEN] && !tb[IFLA_INET6_ADDR_GEN_MODE]) + return -EINVAL; + + if (tb[IFLA_INET6_ADDR_GEN_MODE]) { + u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); + + if (check_addr_gen_mode(mode) < 0) + return -EINVAL; + if (dev && check_stable_privacy(idev, dev_net(dev), mode) < 0) + return -EINVAL; + } + + return 0; +} + static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) { - int err = -EINVAL; struct inet6_dev *idev = __in6_dev_get(dev); struct nlattr *tb[IFLA_INET6_MAX + 1]; - - if (!idev) - return -EAFNOSUPPORT; + int err; if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0) BUG(); @@ -5714,15 +5732,10 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (tb[IFLA_INET6_ADDR_GEN_MODE]) { u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); - if (check_addr_gen_mode(mode) < 0 || - check_stable_privacy(idev, dev_net(dev), mode) < 0) - return -EINVAL; - idev->cnf.addr_gen_mode = mode; - err = 0; } - return err; + return 0; } static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, From 40a1578d631a8ac1cf0ef797c435114107747859 Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Tue, 21 May 2019 19:52:55 +0300 Subject: [PATCH 13/89] ocelot: Dont allocate another multicast list, use __dev_mc_sync Doing kmalloc in atomic context is always an issue, more so for a list that can grow significantly. Turns out that the driver only uses the duplicated list of multicast mac addresses to keep track of what addresses to delete from h/w before committing the new list from kernel to h/w back again via set_rx_mode, every time this list gets updated by the kernel. Given that the h/w knows how to add and delete mac addresses based on the mac address value alone, __dev_mc_sync should be the much better choice of kernel API for these operations avoiding the considerable overhead of maintaining a duplicated list in the driver. Signed-off-by: Claudiu Manoil Tested-by: Alexandre Belloni Acked-by: Alexandre Belloni Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 43 ++++++------------------------ drivers/net/ethernet/mscc/ocelot.h | 4 --- 2 files changed, 8 insertions(+), 39 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index d715ef4fc92f..02ad11e0b0d8 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -593,45 +593,25 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static void ocelot_mact_mc_reset(struct ocelot_port *port) +static int ocelot_mc_unsync(struct net_device *dev, const unsigned char *addr) { - struct ocelot *ocelot = port->ocelot; - struct netdev_hw_addr *ha, *n; + struct ocelot_port *port = netdev_priv(dev); - /* Free and forget all the MAC addresses stored in the port private mc - * list. These are mc addresses that were previously added by calling - * ocelot_mact_mc_add(). - */ - list_for_each_entry_safe(ha, n, &port->mc, list) { - ocelot_mact_forget(ocelot, ha->addr, port->pvid); - list_del(&ha->list); - kfree(ha); - } + return ocelot_mact_forget(port->ocelot, addr, port->pvid); } -static int ocelot_mact_mc_add(struct ocelot_port *port, - struct netdev_hw_addr *hw_addr) +static int ocelot_mc_sync(struct net_device *dev, const unsigned char *addr) { - struct ocelot *ocelot = port->ocelot; - struct netdev_hw_addr *ha = kzalloc(sizeof(*ha), GFP_ATOMIC); + struct ocelot_port *port = netdev_priv(dev); - if (!ha) - return -ENOMEM; - - memcpy(ha, hw_addr, sizeof(*ha)); - list_add_tail(&ha->list, &port->mc); - - ocelot_mact_learn(ocelot, PGID_CPU, ha->addr, port->pvid, - ENTRYTYPE_LOCKED); - - return 0; + return ocelot_mact_learn(port->ocelot, PGID_CPU, addr, port->pvid, + ENTRYTYPE_LOCKED); } static void ocelot_set_rx_mode(struct net_device *dev) { struct ocelot_port *port = netdev_priv(dev); struct ocelot *ocelot = port->ocelot; - struct netdev_hw_addr *ha; int i; u32 val; @@ -643,13 +623,7 @@ static void ocelot_set_rx_mode(struct net_device *dev) for (i = ocelot->num_phys_ports + 1; i < PGID_CPU; i++) ocelot_write_rix(ocelot, val, ANA_PGID_PGID, i); - /* Handle the device multicast addresses. First remove all the - * previously installed addresses and then add the latest ones to the - * mac table. - */ - ocelot_mact_mc_reset(port); - netdev_for_each_mc_addr(ha, dev) - ocelot_mact_mc_add(port, ha); + __dev_mc_sync(dev, ocelot_mc_sync, ocelot_mc_unsync); } static int ocelot_port_get_phys_port_name(struct net_device *dev, @@ -1657,7 +1631,6 @@ int ocelot_probe_port(struct ocelot *ocelot, u8 port, ocelot_port->regs = regs; ocelot_port->chip_port = port; ocelot_port->phy = phy; - INIT_LIST_HEAD(&ocelot_port->mc); ocelot->ports[port] = ocelot_port; dev->netdev_ops = &ocelot_port_netdev_ops; diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index ba3b3380b4d0..541fe41e60b0 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -441,10 +441,6 @@ struct ocelot_port { struct phy_device *phy; void __iomem *regs; u8 chip_port; - /* Keep a track of the mc addresses added to the mac table, so that they - * can be removed when needed. - */ - struct list_head mc; /* Ingress default VLAN (pvid) */ u16 pvid; From d008b3d2be4b00267e7af5c21269e7af4f65c6e2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 May 2019 12:42:56 +0300 Subject: [PATCH 14/89] mISDN: Fix indenting in dsp_cmx.c We used a script to indent this code back in 2012, but I guess it got confused by the ifdefs and added some extra tabs. This patch removes them. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/isdn/mISDN/dsp_cmx.c | 437 +++++++++++++++++------------------ 1 file changed, 218 insertions(+), 219 deletions(-) diff --git a/drivers/isdn/mISDN/dsp_cmx.c b/drivers/isdn/mISDN/dsp_cmx.c index d4b6f01a3f0e..6d2088fbaf69 100644 --- a/drivers/isdn/mISDN/dsp_cmx.c +++ b/drivers/isdn/mISDN/dsp_cmx.c @@ -1676,9 +1676,9 @@ dsp_cmx_send(void *arg) #ifdef CMX_CONF_DEBUG if (conf->software && members > 1) #else - if (conf->software && members > 2) + if (conf->software && members > 2) #endif - mustmix = 1; + mustmix = 1; } /* transmission required */ @@ -1699,263 +1699,262 @@ dsp_cmx_send(void *arg) #ifdef CMX_CONF_DEBUG if (conf->software && members > 1) { #else - if (conf->software && members > 2) { + if (conf->software && members > 2) { #endif - /* check for hdlc conf */ - member = list_entry(conf->mlist.next, - struct dsp_conf_member, list); - if (member->dsp->hdlc) - continue; - /* mix all data */ - memset(mixbuffer, 0, length * sizeof(s32)); - list_for_each_entry(member, &conf->mlist, list) { - dsp = member->dsp; - /* get range of data to mix */ - c = mixbuffer; - q = dsp->rx_buff; - r = dsp->rx_R; - rr = (r + length) & CMX_BUFF_MASK; - /* add member's data */ - while (r != rr) { - *c++ += dsp_audio_law_to_s32[q[r]]; - r = (r + 1) & CMX_BUFF_MASK; - } - } - - /* process each member */ - list_for_each_entry(member, &conf->mlist, list) { - /* transmission */ - dsp_cmx_send_member(member->dsp, length, - mixbuffer, members); + /* check for hdlc conf */ + member = list_entry(conf->mlist.next, + struct dsp_conf_member, list); + if (member->dsp->hdlc) + continue; + /* mix all data */ + memset(mixbuffer, 0, length * sizeof(s32)); + list_for_each_entry(member, &conf->mlist, list) { + dsp = member->dsp; + /* get range of data to mix */ + c = mixbuffer; + q = dsp->rx_buff; + r = dsp->rx_R; + rr = (r + length) & CMX_BUFF_MASK; + /* add member's data */ + while (r != rr) { + *c++ += dsp_audio_law_to_s32[q[r]]; + r = (r + 1) & CMX_BUFF_MASK; } } + + /* process each member */ + list_for_each_entry(member, &conf->mlist, list) { + /* transmission */ + dsp_cmx_send_member(member->dsp, length, + mixbuffer, members); + } + } + } + + /* delete rx-data, increment buffers, change pointers */ + list_for_each_entry(dsp, &dsp_ilist, list) { + if (dsp->hdlc) + continue; + p = dsp->rx_buff; + q = dsp->tx_buff; + r = dsp->rx_R; + /* move receive pointer when receiving */ + if (!dsp->rx_is_off) { + rr = (r + length) & CMX_BUFF_MASK; + /* delete rx-data */ + while (r != rr) { + p[r] = dsp_silence; + r = (r + 1) & CMX_BUFF_MASK; + } + /* increment rx-buffer pointer */ + dsp->rx_R = r; /* write incremented read pointer */ } - /* delete rx-data, increment buffers, change pointers */ - list_for_each_entry(dsp, &dsp_ilist, list) { - if (dsp->hdlc) - continue; - p = dsp->rx_buff; - q = dsp->tx_buff; - r = dsp->rx_R; - /* move receive pointer when receiving */ - if (!dsp->rx_is_off) { - rr = (r + length) & CMX_BUFF_MASK; + /* check current rx_delay */ + delay = (dsp->rx_W-dsp->rx_R) & CMX_BUFF_MASK; + if (delay >= CMX_BUFF_HALF) + delay = 0; /* will be the delay before next write */ + /* check for lower delay */ + if (delay < dsp->rx_delay[0]) + dsp->rx_delay[0] = delay; + /* check current tx_delay */ + delay = (dsp->tx_W-dsp->tx_R) & CMX_BUFF_MASK; + if (delay >= CMX_BUFF_HALF) + delay = 0; /* will be the delay before next write */ + /* check for lower delay */ + if (delay < dsp->tx_delay[0]) + dsp->tx_delay[0] = delay; + if (jittercheck) { + /* find the lowest of all rx_delays */ + delay = dsp->rx_delay[0]; + i = 1; + while (i < MAX_SECONDS_JITTER_CHECK) { + if (delay > dsp->rx_delay[i]) + delay = dsp->rx_delay[i]; + i++; + } + /* + * remove rx_delay only if we have delay AND we + * have not preset cmx_delay AND + * the delay is greater dsp_poll + */ + if (delay > dsp_poll && !dsp->cmx_delay) { + if (dsp_debug & DEBUG_DSP_CLOCK) + printk(KERN_DEBUG + "%s lowest rx_delay of %d bytes for" + " dsp %s are now removed.\n", + __func__, delay, + dsp->name); + r = dsp->rx_R; + rr = (r + delay - (dsp_poll >> 1)) + & CMX_BUFF_MASK; /* delete rx-data */ while (r != rr) { p[r] = dsp_silence; r = (r + 1) & CMX_BUFF_MASK; } /* increment rx-buffer pointer */ - dsp->rx_R = r; /* write incremented read pointer */ + dsp->rx_R = r; + /* write incremented read pointer */ } - - /* check current rx_delay */ - delay = (dsp->rx_W-dsp->rx_R) & CMX_BUFF_MASK; - if (delay >= CMX_BUFF_HALF) - delay = 0; /* will be the delay before next write */ - /* check for lower delay */ - if (delay < dsp->rx_delay[0]) - dsp->rx_delay[0] = delay; - /* check current tx_delay */ - delay = (dsp->tx_W-dsp->tx_R) & CMX_BUFF_MASK; - if (delay >= CMX_BUFF_HALF) - delay = 0; /* will be the delay before next write */ - /* check for lower delay */ - if (delay < dsp->tx_delay[0]) - dsp->tx_delay[0] = delay; - if (jittercheck) { - /* find the lowest of all rx_delays */ - delay = dsp->rx_delay[0]; - i = 1; - while (i < MAX_SECONDS_JITTER_CHECK) { - if (delay > dsp->rx_delay[i]) - delay = dsp->rx_delay[i]; - i++; - } - /* - * remove rx_delay only if we have delay AND we - * have not preset cmx_delay AND - * the delay is greater dsp_poll - */ - if (delay > dsp_poll && !dsp->cmx_delay) { - if (dsp_debug & DEBUG_DSP_CLOCK) - printk(KERN_DEBUG - "%s lowest rx_delay of %d bytes for" - " dsp %s are now removed.\n", - __func__, delay, - dsp->name); - r = dsp->rx_R; - rr = (r + delay - (dsp_poll >> 1)) - & CMX_BUFF_MASK; - /* delete rx-data */ - while (r != rr) { - p[r] = dsp_silence; - r = (r + 1) & CMX_BUFF_MASK; - } - /* increment rx-buffer pointer */ - dsp->rx_R = r; - /* write incremented read pointer */ - } - /* find the lowest of all tx_delays */ - delay = dsp->tx_delay[0]; - i = 1; - while (i < MAX_SECONDS_JITTER_CHECK) { - if (delay > dsp->tx_delay[i]) - delay = dsp->tx_delay[i]; - i++; - } - /* - * remove delay only if we have delay AND we - * have enabled tx_dejitter - */ - if (delay > dsp_poll && dsp->tx_dejitter) { - if (dsp_debug & DEBUG_DSP_CLOCK) - printk(KERN_DEBUG - "%s lowest tx_delay of %d bytes for" - " dsp %s are now removed.\n", - __func__, delay, - dsp->name); - r = dsp->tx_R; - rr = (r + delay - (dsp_poll >> 1)) - & CMX_BUFF_MASK; - /* delete tx-data */ - while (r != rr) { - q[r] = dsp_silence; - r = (r + 1) & CMX_BUFF_MASK; - } - /* increment rx-buffer pointer */ - dsp->tx_R = r; - /* write incremented read pointer */ - } - /* scroll up delays */ - i = MAX_SECONDS_JITTER_CHECK - 1; - while (i) { - dsp->rx_delay[i] = dsp->rx_delay[i - 1]; - dsp->tx_delay[i] = dsp->tx_delay[i - 1]; - i--; - } - dsp->tx_delay[0] = CMX_BUFF_HALF; /* (infinite) delay */ - dsp->rx_delay[0] = CMX_BUFF_HALF; /* (infinite) delay */ + /* find the lowest of all tx_delays */ + delay = dsp->tx_delay[0]; + i = 1; + while (i < MAX_SECONDS_JITTER_CHECK) { + if (delay > dsp->tx_delay[i]) + delay = dsp->tx_delay[i]; + i++; } + /* + * remove delay only if we have delay AND we + * have enabled tx_dejitter + */ + if (delay > dsp_poll && dsp->tx_dejitter) { + if (dsp_debug & DEBUG_DSP_CLOCK) + printk(KERN_DEBUG + "%s lowest tx_delay of %d bytes for" + " dsp %s are now removed.\n", + __func__, delay, + dsp->name); + r = dsp->tx_R; + rr = (r + delay - (dsp_poll >> 1)) + & CMX_BUFF_MASK; + /* delete tx-data */ + while (r != rr) { + q[r] = dsp_silence; + r = (r + 1) & CMX_BUFF_MASK; + } + /* increment rx-buffer pointer */ + dsp->tx_R = r; + /* write incremented read pointer */ + } + /* scroll up delays */ + i = MAX_SECONDS_JITTER_CHECK - 1; + while (i) { + dsp->rx_delay[i] = dsp->rx_delay[i - 1]; + dsp->tx_delay[i] = dsp->tx_delay[i - 1]; + i--; + } + dsp->tx_delay[0] = CMX_BUFF_HALF; /* (infinite) delay */ + dsp->rx_delay[0] = CMX_BUFF_HALF; /* (infinite) delay */ } - - /* if next event would be in the past ... */ - if ((s32)(dsp_spl_jiffies + dsp_tics-jiffies) <= 0) - dsp_spl_jiffies = jiffies + 1; - else - dsp_spl_jiffies += dsp_tics; - - dsp_spl_tl.expires = dsp_spl_jiffies; - add_timer(&dsp_spl_tl); - - /* unlock */ - spin_unlock_irqrestore(&dsp_lock, flags); } + /* if next event would be in the past ... */ + if ((s32)(dsp_spl_jiffies + dsp_tics-jiffies) <= 0) + dsp_spl_jiffies = jiffies + 1; + else + dsp_spl_jiffies += dsp_tics; + + dsp_spl_tl.expires = dsp_spl_jiffies; + add_timer(&dsp_spl_tl); + + /* unlock */ + spin_unlock_irqrestore(&dsp_lock, flags); +} + /* * audio data is transmitted from upper layer to the dsp */ - void - dsp_cmx_transmit(struct dsp *dsp, struct sk_buff *skb) - { - u_int w, ww; - u8 *d, *p; - int space; /* todo: , l = skb->len; */ +void +dsp_cmx_transmit(struct dsp *dsp, struct sk_buff *skb) +{ + u_int w, ww; + u8 *d, *p; + int space; /* todo: , l = skb->len; */ #ifdef CMX_TX_DEBUG - char debugbuf[256] = ""; + char debugbuf[256] = ""; #endif - /* check if there is enough space, and then copy */ - w = dsp->tx_W; - ww = dsp->tx_R; - p = dsp->tx_buff; - d = skb->data; - space = (ww - w - 1) & CMX_BUFF_MASK; - /* write-pointer should not overrun nor reach read pointer */ - if (space < skb->len) { - /* write to the space we have left */ - ww = (ww - 1) & CMX_BUFF_MASK; /* end one byte prior tx_R */ - if (dsp_debug & DEBUG_DSP_CLOCK) - printk(KERN_DEBUG "%s: TX overflow space=%d skb->len=" - "%d, w=0x%04x, ww=0x%04x\n", __func__, space, - skb->len, w, ww); - } else - /* write until all byte are copied */ - ww = (w + skb->len) & CMX_BUFF_MASK; - dsp->tx_W = ww; - + /* check if there is enough space, and then copy */ + w = dsp->tx_W; + ww = dsp->tx_R; + p = dsp->tx_buff; + d = skb->data; + space = (ww - w - 1) & CMX_BUFF_MASK; + /* write-pointer should not overrun nor reach read pointer */ + if (space < skb->len) { + /* write to the space we have left */ + ww = (ww - 1) & CMX_BUFF_MASK; /* end one byte prior tx_R */ + if (dsp_debug & DEBUG_DSP_CLOCK) + printk(KERN_DEBUG "%s: TX overflow space=%d skb->len=" + "%d, w=0x%04x, ww=0x%04x\n", __func__, space, + skb->len, w, ww); + } else + /* write until all byte are copied */ + ww = (w + skb->len) & CMX_BUFF_MASK; + dsp->tx_W = ww; /* show current buffer */ #ifdef CMX_DEBUG - printk(KERN_DEBUG - "cmx_transmit(dsp=%lx) %d bytes to 0x%x-0x%x. %s\n", - (u_long)dsp, (ww - w) & CMX_BUFF_MASK, w, ww, dsp->name); + printk(KERN_DEBUG + "cmx_transmit(dsp=%lx) %d bytes to 0x%x-0x%x. %s\n", + (u_long)dsp, (ww - w) & CMX_BUFF_MASK, w, ww, dsp->name); #endif - /* copy transmit data to tx-buffer */ + /* copy transmit data to tx-buffer */ #ifdef CMX_TX_DEBUG - sprintf(debugbuf, "TX getting (%04x-%04x)%p: ", w, ww, p); + sprintf(debugbuf, "TX getting (%04x-%04x)%p: ", w, ww, p); #endif - while (w != ww) { + while (w != ww) { #ifdef CMX_TX_DEBUG - if (strlen(debugbuf) < 48) - sprintf(debugbuf + strlen(debugbuf), " %02x", *d); + if (strlen(debugbuf) < 48) + sprintf(debugbuf + strlen(debugbuf), " %02x", *d); #endif - p[w] = *d++; - w = (w + 1) & CMX_BUFF_MASK; - } -#ifdef CMX_TX_DEBUG - printk(KERN_DEBUG "%s\n", debugbuf); -#endif - + p[w] = *d++; + w = (w + 1) & CMX_BUFF_MASK; } +#ifdef CMX_TX_DEBUG + printk(KERN_DEBUG "%s\n", debugbuf); +#endif + +} /* * hdlc data is received from card and sent to all members. */ - void - dsp_cmx_hdlc(struct dsp *dsp, struct sk_buff *skb) - { - struct sk_buff *nskb = NULL; - struct dsp_conf_member *member; - struct mISDNhead *hh; +void +dsp_cmx_hdlc(struct dsp *dsp, struct sk_buff *skb) +{ + struct sk_buff *nskb = NULL; + struct dsp_conf_member *member; + struct mISDNhead *hh; - /* not if not active */ - if (!dsp->b_active) - return; + /* not if not active */ + if (!dsp->b_active) + return; - /* check if we have sompen */ - if (skb->len < 1) - return; + /* check if we have sompen */ + if (skb->len < 1) + return; - /* no conf */ - if (!dsp->conf) { - /* in case of software echo */ - if (dsp->echo.software) { - nskb = skb_clone(skb, GFP_ATOMIC); - if (nskb) { - hh = mISDN_HEAD_P(nskb); - hh->prim = PH_DATA_REQ; - hh->id = 0; - skb_queue_tail(&dsp->sendq, nskb); - schedule_work(&dsp->workq); - } + /* no conf */ + if (!dsp->conf) { + /* in case of software echo */ + if (dsp->echo.software) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + hh = mISDN_HEAD_P(nskb); + hh->prim = PH_DATA_REQ; + hh->id = 0; + skb_queue_tail(&dsp->sendq, nskb); + schedule_work(&dsp->workq); } - return; } - /* in case of hardware conference */ - if (dsp->conf->hardware) - return; - list_for_each_entry(member, &dsp->conf->mlist, list) { - if (dsp->echo.software || member->dsp != dsp) { - nskb = skb_clone(skb, GFP_ATOMIC); - if (nskb) { - hh = mISDN_HEAD_P(nskb); - hh->prim = PH_DATA_REQ; - hh->id = 0; - skb_queue_tail(&member->dsp->sendq, nskb); - schedule_work(&member->dsp->workq); - } + return; + } + /* in case of hardware conference */ + if (dsp->conf->hardware) + return; + list_for_each_entry(member, &dsp->conf->mlist, list) { + if (dsp->echo.software || member->dsp != dsp) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + hh = mISDN_HEAD_P(nskb); + hh->prim = PH_DATA_REQ; + hh->id = 0; + skb_queue_tail(&member->dsp->sendq, nskb); + schedule_work(&member->dsp->workq); } } } +} From b0d8d4363e523e952254619ae24dd0dfd7ea1181 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 18:57:12 -0700 Subject: [PATCH 15/89] Documentation: net: move device drivers docs to a submenu Some of the device drivers have really long document titles making the networking table of contents hard to look through. Place vendor drivers under a submenu. Signed-off-by: Jakub Kicinski Acked-by: Dave Watson Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- .../networking/device_drivers/index.rst | 30 +++++++++++++++++++ Documentation/networking/index.rst | 14 +-------- 2 files changed, 31 insertions(+), 13 deletions(-) create mode 100644 Documentation/networking/device_drivers/index.rst diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst new file mode 100644 index 000000000000..75fa537763a4 --- /dev/null +++ b/Documentation/networking/device_drivers/index.rst @@ -0,0 +1,30 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +Vendor Device Drivers +===================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + freescale/dpaa2/index + intel/e100 + intel/e1000 + intel/e1000e + intel/fm10k + intel/igb + intel/igbvf + intel/ixgb + intel/ixgbe + intel/ixgbevf + intel/i40e + intel/iavf + intel/ice + +.. only:: subproject + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index f390fe3cfdfb..7a2bfad6a762 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -11,19 +11,7 @@ Contents: batman-adv can can_ucan_protocol - device_drivers/freescale/dpaa2/index - device_drivers/intel/e100 - device_drivers/intel/e1000 - device_drivers/intel/e1000e - device_drivers/intel/fm10k - device_drivers/intel/igb - device_drivers/intel/igbvf - device_drivers/intel/ixgb - device_drivers/intel/ixgbe - device_drivers/intel/ixgbevf - device_drivers/intel/i40e - device_drivers/intel/iavf - device_drivers/intel/ice + device_drivers/index dsa/index devlink-info-versions ieee802154 From f3c0f3c6c2013e6caa7ab9c3c6a9fb12f6832c43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 18:57:13 -0700 Subject: [PATCH 16/89] Documentation: tls: RSTify the ktls documentation Convert the TLS doc to RST. Use C code blocks for the code samples, and mark hyperlinks. Signed-off-by: Jakub Kicinski Acked-by: Dave Watson Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/{tls.txt => tls.rst} | 42 +++++++++++++------ 2 files changed, 30 insertions(+), 13 deletions(-) rename Documentation/networking/{tls.txt => tls.rst} (88%) diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7a2bfad6a762..f0f97eef091c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -28,6 +28,7 @@ Contents: checksum-offloads segmentation-offloads scaling + tls .. only:: subproject diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.rst similarity index 88% rename from Documentation/networking/tls.txt rename to Documentation/networking/tls.rst index 58b5ef75f1b7..482bd73f18a2 100644 --- a/Documentation/networking/tls.txt +++ b/Documentation/networking/tls.rst @@ -1,3 +1,7 @@ +========== +Kernel TLS +========== + Overview ======== @@ -12,6 +16,8 @@ Creating a TLS connection First create a new TCP socket and set the TLS ULP. +.. code-block:: c + sock = socket(AF_INET, SOCK_STREAM, 0); setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls")); @@ -21,6 +27,8 @@ handshake is complete, we have all the parameters required to move the data-path to the kernel. There is a separate socket option for moving the transmit and the receive into the kernel. +.. code-block:: c + /* From linux/tls.h */ struct tls_crypto_info { unsigned short version; @@ -58,6 +66,8 @@ After setting the TLS_TX socket option all application data sent over this socket is encrypted using TLS and the parameters provided in the socket option. For example, we can send an encrypted hello world record as follows: +.. code-block:: c + const char *msg = "hello world\n"; send(sock, msg, strlen(msg)); @@ -67,6 +77,8 @@ to the encrypted kernel send buffer if possible. The sendfile system call will send the file's data over TLS records of maximum length (2^14). +.. code-block:: c + file = open(filename, O_RDONLY); fstat(file, &stat); sendfile(sock, file, &offset, stat.st_size); @@ -89,6 +101,8 @@ After setting the TLS_RX socket option, all recv family socket calls are decrypted using TLS parameters provided. A full TLS record must be received before decryption can happen. +.. code-block:: c + char buffer[16384]; recv(sock, buffer, 16384); @@ -97,12 +111,12 @@ large enough, and no additional allocations occur. If the userspace buffer is too small, data is decrypted in the kernel and copied to userspace. -EINVAL is returned if the TLS version in the received message does not +``EINVAL`` is returned if the TLS version in the received message does not match the version passed in setsockopt. -EMSGSIZE is returned if the received message is too big. +``EMSGSIZE`` is returned if the received message is too big. -EBADMSG is returned if decryption failed for any other reason. +``EBADMSG`` is returned if decryption failed for any other reason. Send TLS control messages ------------------------- @@ -113,9 +127,11 @@ These messages can be sent over the socket by providing the TLS record type via a CMSG. For example the following function sends @data of @length bytes using a record of type @record_type. -/* send TLS control message using record_type */ +.. code-block:: c + + /* send TLS control message using record_type */ static int klts_send_ctrl_message(int sock, unsigned char record_type, - void *data, size_t length) + void *data, size_t length) { struct msghdr msg = {0}; int cmsg_len = sizeof(record_type); @@ -151,6 +167,8 @@ type passed via cmsg. If no cmsg buffer is provided, an error is returned if a control message is received. Data messages may be received without a cmsg buffer set. +.. code-block:: c + char buffer[16384]; char cmsg[CMSG_SPACE(sizeof(unsigned char))]; struct msghdr msg = {0}; @@ -186,12 +204,10 @@ Integrating in to userspace TLS library At a high level, the kernel TLS ULP is a replacement for the record layer of a userspace TLS library. -A patchset to OpenSSL to use ktls as the record layer is here: +A patchset to OpenSSL to use ktls as the record layer is +`here `_. -https://github.com/Mellanox/openssl/commits/tls_rx2 - -An example of calling send directly after a handshake using -gnutls. Since it doesn't implement a full record layer, control -messages are not supported: - -https://github.com/ktls/af_ktls-tool/commits/RX +`An example `_ +of calling send directly after a handshake using gnutls. +Since it doesn't implement a full record layer, control +messages are not supported. From f42c104f2ec94a9255a835cd4cd1bd76279d4d06 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 18:57:14 -0700 Subject: [PATCH 17/89] Documentation: add TLS offload documentation Describe existing kernel TLS offload (added back in Linux 4.19) - the mechanism, the expected behavior and the notable corner cases. This documentation is mostly targeting hardware vendors who want to implement offload, to ensure consistency between implementations. v2: - add emphasis around TLS_SW/TLS_HW/TLS_HW_RECORD; - remove mentions of ongoing work (Boris); - split the flow of data in SW vs. HW cases in TX overview (Boris); - call out which fields are updated by the device and which are filled by the stack (Boris); - move error handling into it's own section (Boris); - add more words about fallback (Boris); - note that checksum validation is required (Alexei); - note that drivers shouldn't pay attention to the TLS device features. Signed-off-by: Jakub Kicinski Acked-by: Dave Watson Acked-by: Alexei Starovoitov Acked-by: Boris Pismenny Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + .../networking/tls-offload-layers.svg | 1 + .../networking/tls-offload-reorder-bad.svg | 1 + .../networking/tls-offload-reorder-good.svg | 1 + Documentation/networking/tls-offload.rst | 482 ++++++++++++++++++ Documentation/networking/tls.rst | 2 + 6 files changed, 488 insertions(+) create mode 100644 Documentation/networking/tls-offload-layers.svg create mode 100644 Documentation/networking/tls-offload-reorder-bad.svg create mode 100644 Documentation/networking/tls-offload-reorder-good.svg create mode 100644 Documentation/networking/tls-offload.rst diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index f0f97eef091c..a46fca264bee 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -29,6 +29,7 @@ Contents: segmentation-offloads scaling tls + tls-offload .. only:: subproject diff --git a/Documentation/networking/tls-offload-layers.svg b/Documentation/networking/tls-offload-layers.svg new file mode 100644 index 000000000000..cf72f05dbb21 --- /dev/null +++ b/Documentation/networking/tls-offload-layers.svg @@ -0,0 +1 @@ + diff --git a/Documentation/networking/tls-offload-reorder-bad.svg b/Documentation/networking/tls-offload-reorder-bad.svg new file mode 100644 index 000000000000..d107aaf0f71e --- /dev/null +++ b/Documentation/networking/tls-offload-reorder-bad.svg @@ -0,0 +1 @@ + diff --git a/Documentation/networking/tls-offload-reorder-good.svg b/Documentation/networking/tls-offload-reorder-good.svg new file mode 100644 index 000000000000..10e17d91f70c --- /dev/null +++ b/Documentation/networking/tls-offload-reorder-good.svg @@ -0,0 +1 @@ + diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst new file mode 100644 index 000000000000..cb85af559dff --- /dev/null +++ b/Documentation/networking/tls-offload.rst @@ -0,0 +1,482 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================== +Kernel TLS offload +================== + +Kernel TLS operation +==================== + +Linux kernel provides TLS connection offload infrastructure. Once a TCP +connection is in ``ESTABLISHED`` state user space can enable the TLS Upper +Layer Protocol (ULP) and install the cryptographic connection state. +For details regarding the user-facing interface refer to the TLS +documentation in :ref:`Documentation/networking/tls.rst `. + +``ktls`` can operate in three modes: + + * Software crypto mode (``TLS_SW``) - CPU handles the cryptography. + In most basic cases only crypto operations synchronous with the CPU + can be used, but depending on calling context CPU may utilize + asynchronous crypto accelerators. The use of accelerators introduces extra + latency on socket reads (decryption only starts when a read syscall + is made) and additional I/O load on the system. + * Packet-based NIC offload mode (``TLS_HW``) - the NIC handles crypto + on a packet by packet basis, provided the packets arrive in order. + This mode integrates best with the kernel stack and is described in detail + in the remaining part of this document + (``ethtool`` flags ``tls-hw-tx-offload`` and ``tls-hw-rx-offload``). + * Full TCP NIC offload mode (``TLS_HW_RECORD``) - mode of operation where + NIC driver and firmware replace the kernel networking stack + with its own TCP handling, it is not usable in production environments + making use of the Linux networking stack for example any firewalling + abilities or QoS and packet scheduling (``ethtool`` flag ``tls-hw-record``). + +The operation mode is selected automatically based on device configuration, +offload opt-in or opt-out on per-connection basis is not currently supported. + +TX +-- + +At a high level user write requests are turned into a scatter list, the TLS ULP +intercepts them, inserts record framing, performs encryption (in ``TLS_SW`` +mode) and then hands the modified scatter list to the TCP layer. From this +point on the TCP stack proceeds as normal. + +In ``TLS_HW`` mode the encryption is not performed in the TLS ULP. +Instead packets reach a device driver, the driver will mark the packets +for crypto offload based on the socket the packet is attached to, +and send them to the device for encryption and transmission. + +RX +-- + +On the receive side if the device handled decryption and authentication +successfully, the driver will set the decrypted bit in the associated +:c:type:`struct sk_buff `. The packets reach the TCP stack and +are handled normally. ``ktls`` is informed when data is queued to the socket +and the ``strparser`` mechanism is used to delineate the records. Upon read +request, records are retrieved from the socket and passed to decryption routine. +If device decrypted all the segments of the record the decryption is skipped, +otherwise software path handles decryption. + +.. kernel-figure:: tls-offload-layers.svg + :alt: TLS offload layers + :align: center + :figwidth: 28em + + Layers of Kernel TLS stack + +Device configuration +==================== + +During driver initialization device sets the ``NETIF_F_HW_TLS_RX`` and +``NETIF_F_HW_TLS_TX`` features and installs its +:c:type:`struct tlsdev_ops ` +pointer in the :c:member:`tlsdev_ops` member of the +:c:type:`struct net_device `. + +When TLS cryptographic connection state is installed on a ``ktls`` socket +(note that it is done twice, once for RX and once for TX direction, +and the two are completely independent), the kernel checks if the underlying +network device is offload-capable and attempts the offload. In case offload +fails the connection is handled entirely in software using the same mechanism +as if the offload was never tried. + +Offload request is performed via the :c:member:`tls_dev_add` callback of +:c:type:`struct tlsdev_ops `: + +.. code-block:: c + + int (*tls_dev_add)(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn); + +``direction`` indicates whether the cryptographic information is for +the received or transmitted packets. Driver uses the ``sk`` parameter +to retrieve the connection 5-tuple and socket family (IPv4 vs IPv6). +Cryptographic information in ``crypto_info`` includes the key, iv, salt +as well as TLS record sequence number. ``start_offload_tcp_sn`` indicates +which TCP sequence number corresponds to the beginning of the record with +sequence number from ``crypto_info``. The driver can add its state +at the end of kernel structures (see :c:member:`driver_state` members +in ``include/net/tls.h``) to avoid additional allocations and pointer +dereferences. + +TX +-- + +After TX state is installed, the stack guarantees that the first segment +of the stream will start exactly at the ``start_offload_tcp_sn`` sequence +number, simplifying TCP sequence number matching. + +TX offload being fully initialized does not imply that all segments passing +through the driver and which belong to the offloaded socket will be after +the expected sequence number and will have kernel record information. +In particular, already encrypted data may have been queued to the socket +before installing the connection state in the kernel. + +RX +-- + +In RX direction local networking stack has little control over the segmentation, +so the initial records' TCP sequence number may be anywhere inside the segment. + +Normal operation +================ + +At the minimum the device maintains the following state for each connection, in +each direction: + + * crypto secrets (key, iv, salt) + * crypto processing state (partial blocks, partial authentication tag, etc.) + * record metadata (sequence number, processing offset and length) + * expected TCP sequence number + +There are no guarantees on record length or record segmentation. In particular +segments may start at any point of a record and contain any number of records. +Assuming segments are received in order, the device should be able to perform +crypto operations and authentication regardless of segmentation. For this +to be possible device has to keep small amount of segment-to-segment state. +This includes at least: + + * partial headers (if a segment carried only a part of the TLS header) + * partial data block + * partial authentication tag (all data had been seen but part of the + authentication tag has to be written or read from the subsequent segment) + +Record reassembly is not necessary for TLS offload. If the packets arrive +in order the device should be able to handle them separately and make +forward progress. + +TX +-- + +The kernel stack performs record framing reserving space for the authentication +tag and populating all other TLS header and tailer fields. + +Both the device and the driver maintain expected TCP sequence numbers +due to the possibility of retransmissions and the lack of software fallback +once the packet reaches the device. +For segments passed in order, the driver marks the packets with +a connection identifier (note that a 5-tuple lookup is insufficient to identify +packets requiring HW offload, see the :ref:`5tuple_problems` section) +and hands them to the device. The device identifies the packet as requiring +TLS handling and confirms the sequence number matches its expectation. +The device performs encryption and authentication of the record data. +It replaces the authentication tag and TCP checksum with correct values. + +RX +-- + +Before a packet is DMAed to the host (but after NIC's embedded switching +and packet transformation functions) the device validates the Layer 4 +checksum and performs a 5-tuple lookup to find any TLS connection the packet +may belong to (technically a 4-tuple +lookup is sufficient - IP addresses and TCP port numbers, as the protocol +is always TCP). If connection is matched device confirms if the TCP sequence +number is the expected one and proceeds to TLS handling (record delineation, +decryption, authentication for each record in the packet). The device leaves +the record framing unmodified, the stack takes care of record decapsulation. +Device indicates successful handling of TLS offload in the per-packet context +(descriptor) passed to the host. + +Upon reception of a TLS offloaded packet, the driver sets +the :c:member:`decrypted` mark in :c:type:`struct sk_buff ` +corresponding to the segment. Networking stack makes sure decrypted +and non-decrypted segments do not get coalesced (e.g. by GRO or socket layer) +and takes care of partial decryption. + +Resync handling +=============== + +In presence of packet drops or network packet reordering, the device may lose +synchronization with the TLS stream, and require a resync with the kernel's +TCP stack. + +Note that resync is only attempted for connections which were successfully +added to the device table and are in TLS_HW mode. For example, +if the table was full when cryptographic state was installed in the kernel, +such connection will never get offloaded. Therefore the resync request +does not carry any cryptographic connection state. + +TX +-- + +Segments transmitted from an offloaded socket can get out of sync +in similar ways to the receive side-retransmissions - local drops +are possible, though network reorders are not. + +Whenever an out of order segment is transmitted the driver provides +the device with enough information to perform cryptographic operations. +This means most likely that the part of the record preceding the current +segment has to be passed to the device as part of the packet context, +together with its TCP sequence number and TLS record number. The device +can then initialize its crypto state, process and discard the preceding +data (to be able to insert the authentication tag) and move onto handling +the actual packet. + +In this mode depending on the implementation the driver can either ask +for a continuation with the crypto state and the new sequence number +(next expected segment is the one after the out of order one), or continue +with the previous stream state - assuming that the out of order segment +was just a retransmission. The former is simpler, and does not require +retransmission detection therefore it is the recommended method until +such time it is proven inefficient. + +RX +-- + +A small amount of RX reorder events may not require a full resynchronization. +In particular the device should not lose synchronization +when record boundary can be recovered: + +.. kernel-figure:: tls-offload-reorder-good.svg + :alt: reorder of non-header segment + :align: center + + Reorder of non-header segment + +Green segments are successfully decrypted, blue ones are passed +as received on wire, red stripes mark start of new records. + +In above case segment 1 is received and decrypted successfully. +Segment 2 was dropped so 3 arrives out of order. The device knows +the next record starts inside 3, based on record length in segment 1. +Segment 3 is passed untouched, because due to lack of data from segment 2 +the remainder of the previous record inside segment 3 cannot be handled. +The device can, however, collect the authentication algorithm's state +and partial block from the new record in segment 3 and when 4 and 5 +arrive continue decryption. Finally when 2 arrives it's completely outside +of expected window of the device so it's passed as is without special +handling. ``ktls`` software fallback handles the decryption of record +spanning segments 1, 2 and 3. The device did not get out of sync, +even though two segments did not get decrypted. + +Kernel synchronization may be necessary if the lost segment contained +a record header and arrived after the next record header has already passed: + +.. kernel-figure:: tls-offload-reorder-bad.svg + :alt: reorder of header segment + :align: center + + Reorder of segment with a TLS header + +In this example segment 2 gets dropped, and it contains a record header. +Device can only detect that segment 4 also contains a TLS header +if it knows the length of the previous record from segment 2. In this case +the device will lose synchronization with the stream. + +When the device gets out of sync and the stream reaches TCP sequence +numbers more than a max size record past the expected TCP sequence number, +the device starts scanning for a known header pattern. For example +for TLS 1.2 and TLS 1.3 subsequent bytes of value ``0x03 0x03`` occur +in the SSL/TLS version field of the header. Once pattern is matched +the device continues attempting parsing headers at expected locations +(based on the length fields at guessed locations). +Whenever the expected location does not contain a valid header the scan +is restarted. + +When the header is matched the device sends a confirmation request +to the kernel, asking if the guessed location is correct (if a TLS record +really starts there), and which record sequence number the given header had. +The kernel confirms the guessed location was correct and tells the device +the record sequence number. Meanwhile, the device had been parsing +and counting all records since the just-confirmed one, it adds the number +of records it had seen to the record number provided by the kernel. +At this point the device is in sync and can resume decryption at next +segment boundary. + +In a pathological case the device may latch onto a sequence of matching +headers and never hear back from the kernel (there is no negative +confirmation from the kernel). The implementation may choose to periodically +restart scan. Given how unlikely falsely-matching stream is, however, +periodic restart is not deemed necessary. + +Special care has to be taken if the confirmation request is passed +asynchronously to the packet stream and record may get processed +by the kernel before the confirmation request. + +Error handling +============== + +TX +-- + +Packets may be redirected or rerouted by the stack to a different +device than the selected TLS offload device. The stack will handle +such condition using the :c:func:`sk_validate_xmit_skb` helper +(TLS offload code installs :c:func:`tls_validate_xmit_skb` at this hook). +Offload maintains information about all records until the data is +fully acknowledged, so if skbs reach the wrong device they can be handled +by software fallback. + +Any device TLS offload handling error on the transmission side must result +in the packet being dropped. For example if a packet got out of order +due to a bug in the stack or the device, reached the device and can't +be encrypted such packet must be dropped. + +RX +-- + +If the device encounters any problems with TLS offload on the receive +side it should pass the packet to the host's networking stack as it was +received on the wire. + +For example authentication failure for any record in the segment should +result in passing the unmodified packet to the software fallback. This means +packets should not be modified "in place". Splitting segments to handle partial +decryption is not advised. In other words either all records in the packet +had been handled successfully and authenticated or the packet has to be passed +to the host's stack as it was on the wire (recovering original packet in the +driver if device provides precise error is sufficient). + +The Linux networking stack does not provide a way of reporting per-packet +decryption and authentication errors, packets with errors must simply not +have the :c:member:`decrypted` mark set. + +A packet should also not be handled by the TLS offload if it contains +incorrect checksums. + +Performance metrics +=================== + +TLS offload can be characterized by the following basic metrics: + + * max connection count + * connection installation rate + * connection installation latency + * total cryptographic performance + +Note that each TCP connection requires a TLS session in both directions, +the performance may be reported treating each direction separately. + +Max connection count +-------------------- + +The number of connections device can support can be exposed via +``devlink resource`` API. + +Total cryptographic performance +------------------------------- + +Offload performance may depend on segment and record size. + +Overload of the cryptographic subsystem of the device should not have +significant performance impact on non-offloaded streams. + +Statistics +========== + +Following minimum set of TLS-related statistics should be reported +by the driver: + + * ``rx_tls_decrypted`` - number of successfully decrypted TLS segments + * ``tx_tls_encrypted`` - number of in-order TLS segments passed to device + for encryption + * ``tx_tls_ooo`` - number of TX packets which were part of a TLS stream + but did not arrive in the expected order + * ``tx_tls_drop_no_sync_data`` - number of TX packets dropped because + they arrived out of order and associated record could not be found + (see also :ref:`pre_tls_data`) + +Notable corner cases, exceptions and additional requirements +============================================================ + +.. _5tuple_problems: + +5-tuple matching limitations +---------------------------- + +The device can only recognize received packets based on the 5-tuple +of the socket. Current ``ktls`` implementation will not offload sockets +routed through software interfaces such as those used for tunneling +or virtual networking. However, many packet transformations performed +by the networking stack (most notably any BPF logic) do not require +any intermediate software device, therefore a 5-tuple match may +consistently miss at the device level. In such cases the device +should still be able to perform TX offload (encryption) and should +fallback cleanly to software decryption (RX). + +Out of order +------------ + +Introducing extra processing in NICs should not cause packets to be +transmitted or received out of order, for example pure ACK packets +should not be reordered with respect to data segments. + +Ingress reorder +--------------- + +A device is permitted to perform packet reordering for consecutive +TCP segments (i.e. placing packets in the correct order) but any form +of additional buffering is disallowed. + +Coexistence with standard networking offload features +----------------------------------------------------- + +Offloaded ``ktls`` sockets should support standard TCP stack features +transparently. Enabling device TLS offload should not cause any difference +in packets as seen on the wire. + +Transport layer transparency +---------------------------- + +The device should not modify any packet headers for the purpose +of the simplifying TLS offload. + +The device should not depend on any packet headers beyond what is strictly +necessary for TLS offload. + +Segment drops +------------- + +Dropping packets is acceptable only in the event of catastrophic +system errors and should never be used as an error handling mechanism +in cases arising from normal operation. In other words, reliance +on TCP retransmissions to handle corner cases is not acceptable. + +TLS device features +------------------- + +Drivers should ignore the changes to TLS the device feature flags. +These flags will be acted upon accordingly by the core ``ktls`` code. +TLS device feature flags only control adding of new TLS connection +offloads, old connections will remain active after flags are cleared. + +Known bugs +========== + +skb_orphan() leaks clear text +----------------------------- + +Currently drivers depend on the :c:member:`sk` member of +:c:type:`struct sk_buff ` to identify segments requiring +encryption. Any operation which removes or does not preserve the socket +association such as :c:func:`skb_orphan` or :c:func:`skb_clone` +will cause the driver to miss the packets and lead to clear text leaks. + +Redirects leak clear text +------------------------- + +In the RX direction, if segment has already been decrypted by the device +and it gets redirected or mirrored - clear text will be transmitted out. + +.. _pre_tls_data: + +Transmission of pre-TLS data +---------------------------- + +User can enqueue some already encrypted and framed records before enabling +``ktls`` on the socket. Those records have to get sent as they are. This is +perfectly easy to handle in the software case - such data will be waiting +in the TCP layer, TLS ULP won't see it. In the offloaded case when pre-queued +segment reaches transmission point it appears to be out of order (before the +expected TCP sequence number) and the stack does not have a record information +associated. + +All segments without record information cannot, however, be assumed to be +pre-queued data, because a race condition exists between TCP stack queuing +a retransmission, the driver seeing the retransmission and TCP ACK arriving +for the retransmitted data. diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst index 482bd73f18a2..5bcbf75e2025 100644 --- a/Documentation/networking/tls.rst +++ b/Documentation/networking/tls.rst @@ -1,3 +1,5 @@ +.. _kernel_tls: + ========== Kernel TLS ========== From 38030d7cb77963ba84cdbe034806e2b81245339f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 19:02:00 -0700 Subject: [PATCH 18/89] net/tls: avoid NULL-deref on resync during device removal When netdev with active kTLS sockets in unregistered notifier callback walks the offloaded sockets and cleans up offload state. RX data may still be processed, however, and if resync was requested prior to device removal we would hit a NULL pointer dereference on ctx->netdev use. Make sure resync is under the device offload lock and NULL-check the netdev pointer. This should be safe, because the pointer is set to NULL either in the netdev notifier (under said lock) or when socket is completely dead and no resync can happen. The other access to ctx->netdev in tls_validate_xmit_skb() does not dereference the pointer, it just checks it against other device pointer, so it should be pretty safe (perhaps we can add a READ_ONCE/WRITE_ONCE there, if paranoid). Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- net/tls/tls_device.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ca54a7c7ec81..aa33e4accc32 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -553,8 +553,8 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct net_device *netdev = tls_ctx->netdev; struct tls_offload_context_rx *rx_ctx; + struct net_device *netdev; u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -568,10 +568,15 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) is_req_pending = resync_req; if (unlikely(is_req_pending) && req_seq == seq && - atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) - netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, - seq + TLS_HEADER_SIZE - 1, - rcd_sn); + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) { + seq += TLS_HEADER_SIZE - 1; + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (netdev) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, + rcd_sn); + up_read(&device_offload_lock); + } } static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) From 3686637e507b48525fcea6fb91e1988bdbc14530 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 19:02:01 -0700 Subject: [PATCH 19/89] net/tls: fix state removal with feature flags off TLS offload drivers shouldn't (and currently don't) block the TLS offload feature changes based on whether there are active offloaded connections or not. This seems to be a good idea, because we want the admin to be able to disable the TLS offload at any time, and there is no clean way of disabling it for active connections (TX side is quite problematic). So if features are cleared existing connections will stay offloaded until they close, and new connections will not attempt offload to a given device. However, the offload state removal handling is currently broken if feature flags get cleared while there are active TLS offloads. RX side will completely bail from cleanup, even on normal remove path, leaving device state dangling, potentially causing issues when the 5-tuple is reused. It will also fail to release the netdev reference. Remove the RX-side warning message, in next release cycle it should be printed when features are disabled, rather than when connection dies, but for that we need a more efficient method of finding connection of a given netdev (a'la BPF offload code). Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- net/tls/tls_device.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index aa33e4accc32..07650446e892 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -939,12 +939,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk) if (!netdev) goto out; - if (!(netdev->features & NETIF_F_HW_TLS_RX)) { - pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n", - __func__); - goto out; - } - netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); From c3f4a6c39cf269a40d45f813c05fa830318ad875 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 19:02:02 -0700 Subject: [PATCH 20/89] net/tls: don't ignore netdev notifications if no TLS features On device surprise removal path (the notifier) we can't bail just because the features are disabled. They may have been enabled during the lifetime of the device. This bug leads to leaking netdev references and use-after-frees if there are active connections while device features are cleared. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- net/tls/tls_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 07650446e892..b95c408fd771 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -997,7 +997,8 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) + if (!dev->tlsdev_ops && + !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { From c1e85c6ce57ef1eb73966152993a341c8123a8ea Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 22 May 2019 08:24:43 +0000 Subject: [PATCH 21/89] net: macb: save/restore the remaining registers and features SAMA5D2 SoC has a suspend mode where SoC's power is cut off. Due to this the registers content is lost after a suspend/resume cycle. The current suspend/resume implementation covers some of these registers. However there are few which were not treated (e.g. SCRT2 and USRIO). Apart from this, netdev features are not restored. Treat these issues. Signed-off-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb.h | 7 ++ drivers/net/ethernet/cadence/macb_main.c | 111 +++++++++++++++++------ 2 files changed, 91 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index acc66a7e7b95..00ee5e8e0ff0 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -1080,6 +1080,11 @@ struct macb_ptp_info { struct ifreq *ifr, int cmd); }; +struct macb_pm_data { + u32 scrt2; + u32 usrio; +}; + struct macb_config { u32 caps; unsigned int dma_burst_length; @@ -1220,6 +1225,8 @@ struct macb { int tx_bd_rd_prefetch; u32 rx_intr_mask; + + struct macb_pm_data pm_data; }; #ifdef CONFIG_MACB_USE_HWSTAMP diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index bebd9b1aeb64..f825e3960540 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -2849,10 +2849,14 @@ static int macb_get_ts_info(struct net_device *netdev, static void gem_enable_flow_filters(struct macb *bp, bool enable) { + struct net_device *netdev = bp->dev; struct ethtool_rx_fs_item *item; u32 t2_scr; int num_t2_scr; + if (!(netdev->features & NETIF_F_NTUPLE)) + return; + num_t2_scr = GEM_BFEXT(T2SCR, gem_readl(bp, DCFG8)); list_for_each_entry(item, &bp->rx_fs_list.list, list) { @@ -3012,8 +3016,7 @@ static int gem_add_flow_filter(struct net_device *netdev, gem_prog_cmp_regs(bp, fs); bp->rx_fs_list.count++; /* enable filtering if NTUPLE on */ - if (netdev->features & NETIF_F_NTUPLE) - gem_enable_flow_filters(bp, 1); + gem_enable_flow_filters(bp, 1); spin_unlock_irqrestore(&bp->rx_fs_lock, flags); return 0; @@ -3201,6 +3204,50 @@ static int macb_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) } } +static inline void macb_set_txcsum_feature(struct macb *bp, + netdev_features_t features) +{ + u32 val; + + if (!macb_is_gem(bp)) + return; + + val = gem_readl(bp, DMACFG); + if (features & NETIF_F_HW_CSUM) + val |= GEM_BIT(TXCOEN); + else + val &= ~GEM_BIT(TXCOEN); + + gem_writel(bp, DMACFG, val); +} + +static inline void macb_set_rxcsum_feature(struct macb *bp, + netdev_features_t features) +{ + struct net_device *netdev = bp->dev; + u32 val; + + if (!macb_is_gem(bp)) + return; + + val = gem_readl(bp, NCFGR); + if ((features & NETIF_F_RXCSUM) && !(netdev->flags & IFF_PROMISC)) + val |= GEM_BIT(RXCOEN); + else + val &= ~GEM_BIT(RXCOEN); + + gem_writel(bp, NCFGR, val); +} + +static inline void macb_set_rxflow_feature(struct macb *bp, + netdev_features_t features) +{ + if (!macb_is_gem(bp)) + return; + + gem_enable_flow_filters(bp, !!(features & NETIF_F_NTUPLE)); +} + static int macb_set_features(struct net_device *netdev, netdev_features_t features) { @@ -3208,39 +3255,35 @@ static int macb_set_features(struct net_device *netdev, netdev_features_t changed = features ^ netdev->features; /* TX checksum offload */ - if ((changed & NETIF_F_HW_CSUM) && macb_is_gem(bp)) { - u32 dmacfg; - - dmacfg = gem_readl(bp, DMACFG); - if (features & NETIF_F_HW_CSUM) - dmacfg |= GEM_BIT(TXCOEN); - else - dmacfg &= ~GEM_BIT(TXCOEN); - gem_writel(bp, DMACFG, dmacfg); - } + if (changed & NETIF_F_HW_CSUM) + macb_set_txcsum_feature(bp, features); /* RX checksum offload */ - if ((changed & NETIF_F_RXCSUM) && macb_is_gem(bp)) { - u32 netcfg; - - netcfg = gem_readl(bp, NCFGR); - if (features & NETIF_F_RXCSUM && - !(netdev->flags & IFF_PROMISC)) - netcfg |= GEM_BIT(RXCOEN); - else - netcfg &= ~GEM_BIT(RXCOEN); - gem_writel(bp, NCFGR, netcfg); - } + if (changed & NETIF_F_RXCSUM) + macb_set_rxcsum_feature(bp, features); /* RX Flow Filters */ - if ((changed & NETIF_F_NTUPLE) && macb_is_gem(bp)) { - bool turn_on = features & NETIF_F_NTUPLE; + if (changed & NETIF_F_NTUPLE) + macb_set_rxflow_feature(bp, features); - gem_enable_flow_filters(bp, turn_on); - } return 0; } +static void macb_restore_features(struct macb *bp) +{ + struct net_device *netdev = bp->dev; + netdev_features_t features = netdev->features; + + /* TX checksum offload */ + macb_set_txcsum_feature(bp, features); + + /* RX checksum offload */ + macb_set_rxcsum_feature(bp, features); + + /* RX Flow Filters */ + macb_set_rxflow_feature(bp, features); +} + static const struct net_device_ops macb_netdev_ops = { .ndo_open = macb_open, .ndo_stop = macb_close, @@ -4273,6 +4316,12 @@ static int __maybe_unused macb_suspend(struct device *dev) spin_lock_irqsave(&bp->lock, flags); macb_reset_hw(bp); spin_unlock_irqrestore(&bp->lock, flags); + + if (!(bp->caps & MACB_CAPS_USRIO_DISABLED)) + bp->pm_data.usrio = macb_or_gem_readl(bp, USRIO); + + if (netdev->hw_features & NETIF_F_NTUPLE) + bp->pm_data.scrt2 = gem_readl_n(bp, ETHT, SCRT2_ETHT); } netif_carrier_off(netdev); @@ -4301,6 +4350,13 @@ static int __maybe_unused macb_resume(struct device *dev) disable_irq_wake(bp->queues[0].irq); } else { macb_writel(bp, NCR, MACB_BIT(MPE)); + + if (netdev->hw_features & NETIF_F_NTUPLE) + gem_writel_n(bp, ETHT, SCRT2_ETHT, bp->pm_data.scrt2); + + if (!(bp->caps & MACB_CAPS_USRIO_DISABLED)) + macb_or_gem_writel(bp, USRIO, bp->pm_data.usrio); + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) napi_enable(&queue->napi); @@ -4312,6 +4368,7 @@ static int __maybe_unused macb_resume(struct device *dev) bp->macbgem_ops.mog_init_rings(bp); macb_init_hw(bp); macb_set_rx_mode(netdev); + macb_restore_features(bp); netif_device_attach(netdev); if (bp->ptp_info) bp->ptp_info->ptp_init(netdev); From ccfb62f27beb295103e9392462b20a6ed807d0ea Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 22 May 2019 11:45:13 +0300 Subject: [PATCH 22/89] mISDN: make sure device name is NUL terminated The user can change the device_name with the IMSETDEVNAME ioctl, but we need to ensure that the user's name is NUL terminated. Otherwise it could result in a buffer overflow when we copy the name back to the user with IMGETDEVINFO ioctl. I also changed two strcpy() calls which handle the name to strscpy(). Hopefully, there aren't any other ways to create a too long name, but it's nice to do this as a kernel hardening measure. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/isdn/mISDN/socket.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index a14e35d40538..84e1d4c2db66 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -393,7 +393,7 @@ data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; - strcpy(di.name, dev_name(&dev->dev)); + strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else @@ -676,7 +676,7 @@ base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; - strcpy(di.name, dev_name(&dev->dev)); + strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else @@ -690,6 +690,7 @@ base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = -EFAULT; break; } + dn.name[sizeof(dn.name) - 1] = '\0'; dev = get_mdevice(dn.id); if (dev) err = device_rename(&dev->dev, dn.name); From 49ce881c0d4c4a7a35358d9dccd5f26d0e56fc61 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 22 May 2019 10:05:09 +0000 Subject: [PATCH 23/89] net: stmmac: fix reset gpio free missing Commit 984203ceff27 ("net: stmmac: mdio: remove reset gpio free") removed the reset gpio free, when the driver is unbinded or rmmod, we miss the gpio free. This patch uses managed API to request the reset gpio, so that the gpio could be freed properly. Fixes: 984203ceff27 ("net: stmmac: mdio: remove reset gpio free") Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index bdd351597b55..093a223fe408 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -267,7 +267,8 @@ int stmmac_mdio_reset(struct mii_bus *bus) of_property_read_u32_array(np, "snps,reset-delays-us", data->delays, 3); - if (gpio_request(data->reset_gpio, "mdio-reset")) + if (devm_gpio_request(priv->device, data->reset_gpio, + "mdio-reset")) return 0; } From 31680ac265802397937d75461a2809a067b9fb93 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 15:12:18 -0700 Subject: [PATCH 24/89] ipv6: Fix redirect with VRF IPv6 redirect is broken for VRF. __ip6_route_redirect walks the FIB entries looking for an exact match on ifindex. With VRF the flowi6_oif is updated by l3mdev_update_flow to the l3mdev index and the FLOWI_FLAG_SKIP_NH_OIF set in the flags to tell the lookup to skip the device match. For redirects the device match is requires so use that flag to know when the oif needs to be reset to the skb device index. Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7a014ca877ed..848e944f07df 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2512,6 +2512,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_info *rt; struct fib6_node *fn; + /* l3mdev_update_flow overrides oif if the device is enslaved; in + * this case we must match on the real ingress device, so reset it + */ + if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) + fl6->flowi6_oif = skb->dev->ifindex; + /* Get the "current" route for this destination and * check if the redirect has come from appropriate router. * From 296d5b54163964b7ae536b8b57dfbd21d4e868e1 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 22 May 2019 19:12:54 -0400 Subject: [PATCH 25/89] bnxt_en: Fix aggregation buffer leak under OOM condition. For every RX packet, the driver replenishes all buffers used for that packet and puts them back into the RX ring and RX aggregation ring. In one code path where the RX packet has one RX buffer and one or more aggregation buffers, we missed recycling the aggregation buffer(s) if we are unable to allocate a new SKB buffer. This leads to the aggregation ring slowly running out of buffers over time. Fix it by properly recycling the aggregation buffers. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Reported-by: Rakesh Hemnani Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 8314c00d7537..21f682610e6a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1642,6 +1642,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, skb = bnxt_copy_skb(bnapi, data_ptr, len, dma_addr); bnxt_reuse_rx_data(rxr, cons, data); if (!skb) { + if (agg_bufs) + bnxt_reuse_rx_agg_bufs(cpr, cp_cons, agg_bufs); rc = -ENOMEM; goto next_rx; } From 1b3f0b75c39f534278a895c117282014e9d0ae1f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 22 May 2019 19:12:55 -0400 Subject: [PATCH 26/89] bnxt_en: Fix possible BUG() condition when calling pci_disable_msix(). When making configuration changes, the driver calls bnxt_close_nic() and then bnxt_open_nic() for the changes to take effect. A parameter irq_re_init is passed to the call sequence to indicate if IRQ should be re-initialized. This irq_re_init parameter needs to be included in the bnxt_reserve_rings() call. bnxt_reserve_rings() can only call pci_disable_msix() if the irq_re_init parameter is true, otherwise it may hit BUG() because some IRQs may not have been freed yet. Fixes: 41e8d7983752 ("bnxt_en: Modify the ring reservation functions for 57500 series chips.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 13 +++++++------ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 21f682610e6a..cfcc33c52685 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7618,22 +7618,23 @@ static void bnxt_clear_int_mode(struct bnxt *bp) bp->flags &= ~BNXT_FLAG_USING_MSIX; } -int bnxt_reserve_rings(struct bnxt *bp) +int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init) { int tcs = netdev_get_num_tc(bp->dev); - bool reinit_irq = false; + bool irq_cleared = false; int rc; if (!bnxt_need_reserve_rings(bp)) return 0; - if (BNXT_NEW_RM(bp) && (bnxt_get_num_msix(bp) != bp->total_irqs)) { + if (irq_re_init && BNXT_NEW_RM(bp) && + bnxt_get_num_msix(bp) != bp->total_irqs) { bnxt_ulp_irq_stop(bp); bnxt_clear_int_mode(bp); - reinit_irq = true; + irq_cleared = true; } rc = __bnxt_reserve_rings(bp); - if (reinit_irq) { + if (irq_cleared) { if (!rc) rc = bnxt_init_int_mode(bp); bnxt_ulp_irq_restart(bp, rc); @@ -8532,7 +8533,7 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) return rc; } } - rc = bnxt_reserve_rings(bp); + rc = bnxt_reserve_rings(bp, irq_re_init); if (rc) return rc; if ((bp->flags & BNXT_FLAG_RFS) && diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index eca36dd6b751..acc73f301783 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1790,7 +1790,7 @@ unsigned int bnxt_get_avail_stat_ctxs_for_en(struct bnxt *bp); unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp); unsigned int bnxt_get_avail_cp_rings_for_en(struct bnxt *bp); int bnxt_get_avail_msix(struct bnxt *bp, int num); -int bnxt_reserve_rings(struct bnxt *bp); +int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init); void bnxt_tx_disable(struct bnxt *bp); void bnxt_tx_enable(struct bnxt *bp); int bnxt_hwrm_set_pause(struct bnxt *); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index b1263821a6e9..a6c7baf38036 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -831,7 +831,7 @@ static int bnxt_set_channels(struct net_device *dev, */ } } else { - rc = bnxt_reserve_rings(bp); + rc = bnxt_reserve_rings(bp, true); } return rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index cf475873ce81..bfa342a98d08 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -147,7 +147,7 @@ static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id, bnxt_close_nic(bp, true, false); rc = bnxt_open_nic(bp, true, false); } else { - rc = bnxt_reserve_rings(bp); + rc = bnxt_reserve_rings(bp, true); } } if (rc) { From d629522e1d66561f38e5c8d4f52bb6d254ec0707 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 22 May 2019 19:12:56 -0400 Subject: [PATCH 27/89] bnxt_en: Reduce memory usage when running in kdump kernel. Skip RDMA context memory allocations, reduce to 1 ring, and disable TPA when running in the kdump kernel. Without this patch, the driver fails to initialize with memory allocation errors when running in a typical kdump kernel. Fixes: cf6daed098d1 ("bnxt_en: Increase context memory allocations on 57500 chips for RDMA.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index cfcc33c52685..79812daa4127 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6379,7 +6379,7 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp) if (!ctx || (ctx->flags & BNXT_CTX_FLAG_INITED)) return 0; - if (bp->flags & BNXT_FLAG_ROCE_CAP) { + if ((bp->flags & BNXT_FLAG_ROCE_CAP) && !is_kdump_kernel()) { pg_lvl = 2; extra_qps = 65536; extra_srqs = 8192; @@ -10437,7 +10437,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh) if (sh) bp->flags |= BNXT_FLAG_SHARED_RINGS; - dflt_rings = netif_get_num_default_rss_queues(); + dflt_rings = is_kdump_kernel() ? 1 : netif_get_num_default_rss_queues(); /* Reduce default rings on multi-port cards so that total default * rings do not exceed CPU count. */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index acc73f301783..be438d82f939 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -1369,7 +1370,8 @@ struct bnxt { #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0) #define BNXT_RX_PAGE_MODE(bp) ((bp)->flags & BNXT_FLAG_RX_PAGE_MODE) #define BNXT_SUPPORTS_TPA(bp) (!BNXT_CHIP_TYPE_NITRO_A0(bp) && \ - !(bp->flags & BNXT_FLAG_CHIP_P5)) + !(bp->flags & BNXT_FLAG_CHIP_P5) && \ + !is_kdump_kernel()) /* Chip class phase 5 */ #define BNXT_CHIP_P5(bp) \ From 2e9217d1e8b72dde2c7e3e2338cc1830f68cb58d Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Wed, 22 May 2019 19:12:57 -0400 Subject: [PATCH 28/89] bnxt_en: Device serial number is supported only for PFs. Don't read DSN on VFs that do not have the PCI capability. Fixes: 03213a996531 ("bnxt: move bp->switch_id initialization to PF probe") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 79812daa4127..f758b2e0591f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -10725,11 +10725,12 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) goto init_err_pci_clean; } - /* Read the adapter's DSN to use as the eswitch switch_id */ - rc = bnxt_pcie_dsn_get(bp, bp->switch_id); - if (rc) - goto init_err_pci_clean; - + if (BNXT_PF(bp)) { + /* Read the adapter's DSN to use as the eswitch switch_id */ + rc = bnxt_pcie_dsn_get(bp, bp->switch_id); + if (rc) + goto init_err_pci_clean; + } bnxt_hwrm_func_qcfg(bp); bnxt_hwrm_vnic_qcaps(bp); bnxt_hwrm_port_led_qcaps(bp); From 3580d04aa674383c42de7b635d28e52a1e5bc72c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 May 2019 16:51:22 -0700 Subject: [PATCH 29/89] ipv4/igmp: fix another memory leak in igmpv3_del_delrec() syzbot reported memory leaks [1] that I have back tracked to a missing cleanup from igmpv3_del_delrec() when (im->sfmode != MCAST_INCLUDE) Add ip_sf_list_clear_all() and kfree_pmc() helpers to explicitely handle the cleanups before freeing. [1] BUG: memory leak unreferenced object 0xffff888123e32b00 (size 64): comm "softirq", pid 0, jiffies 4294942968 (age 8.010s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 e0 00 00 01 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000006105011b>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<000000006105011b>] slab_post_alloc_hook mm/slab.h:439 [inline] [<000000006105011b>] slab_alloc mm/slab.c:3326 [inline] [<000000006105011b>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553 [<000000004bba8073>] kmalloc include/linux/slab.h:547 [inline] [<000000004bba8073>] kzalloc include/linux/slab.h:742 [inline] [<000000004bba8073>] ip_mc_add1_src net/ipv4/igmp.c:1961 [inline] [<000000004bba8073>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2085 [<00000000a46a65a0>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2475 [<000000005956ca89>] do_ip_setsockopt.isra.0+0x1795/0x1930 net/ipv4/ip_sockglue.c:957 [<00000000848e2d2f>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1246 [<00000000b9db185c>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616 [<000000003028e438>] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3130 [<0000000015b65589>] __sys_setsockopt+0x98/0x120 net/socket.c:2078 [<00000000ac198ef0>] __do_sys_setsockopt net/socket.c:2089 [inline] [<00000000ac198ef0>] __se_sys_setsockopt net/socket.c:2086 [inline] [<00000000ac198ef0>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086 [<000000000a770437>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301 [<00000000d3adb93b>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 9c8bb163ae78 ("igmp, mld: Fix memory leak in igmpv3/mld_del_delrec()") Signed-off-by: Eric Dumazet Cc: Hangbin Liu Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6c2febc39dca..1a8d36dd49d4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -633,6 +633,24 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) } } +static void ip_sf_list_clear_all(struct ip_sf_list *psf) +{ + struct ip_sf_list *next; + + while (psf) { + next = psf->sf_next; + kfree(psf); + psf = next; + } +} + +static void kfree_pmc(struct ip_mc_list *pmc) +{ + ip_sf_list_clear_all(pmc->sources); + ip_sf_list_clear_all(pmc->tomb); + kfree(pmc); +} + static void igmpv3_send_cr(struct in_device *in_dev) { struct ip_mc_list *pmc, *pmc_prev, *pmc_next; @@ -669,7 +687,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) else in_dev->mc_tomb = pmc_next; in_dev_put(pmc->interface); - kfree(pmc); + kfree_pmc(pmc); } else pmc_prev = pmc; } @@ -1215,14 +1233,18 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { im->tomb = pmc->tomb; + pmc->tomb = NULL; + im->sources = pmc->sources; + pmc->sources = NULL; + for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } else { im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } in_dev_put(pmc->interface); - kfree(pmc); + kfree_pmc(pmc); } spin_unlock_bh(&im->lock); } @@ -1243,21 +1265,18 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) nextpmc = pmc->next; ip_mc_clear_src(pmc); in_dev_put(pmc->interface); - kfree(pmc); + kfree_pmc(pmc); } /* clear dead sources, too */ rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { - struct ip_sf_list *psf, *psf_next; + struct ip_sf_list *psf; spin_lock_bh(&pmc->lock); psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); - for (; psf; psf = psf_next) { - psf_next = psf->sf_next; - kfree(psf); - } + ip_sf_list_clear_all(psf); } rcu_read_unlock(); } @@ -2123,7 +2142,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, static void ip_mc_clear_src(struct ip_mc_list *pmc) { - struct ip_sf_list *psf, *nextpsf, *tomb, *sources; + struct ip_sf_list *tomb, *sources; spin_lock_bh(&pmc->lock); tomb = pmc->tomb; @@ -2135,14 +2154,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) pmc->sfcount[MCAST_EXCLUDE] = 1; spin_unlock_bh(&pmc->lock); - for (psf = tomb; psf; psf = nextpsf) { - nextpsf = psf->sf_next; - kfree(psf); - } - for (psf = sources; psf; psf = nextpsf) { - nextpsf = psf->sf_next; - kfree(psf); - } + ip_sf_list_clear_all(tomb); + ip_sf_list_clear_all(sources); } /* Join a multicast group From 903869bd10e6719b9df6718e785be7ec725df59f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 May 2019 18:35:16 -0700 Subject: [PATCH 30/89] ipv4/igmp: fix build error if !CONFIG_IP_MULTICAST ip_sf_list_clear_all() needs to be defined even if !CONFIG_IP_MULTICAST Fixes: 3580d04aa674 ("ipv4/igmp: fix another memory leak in igmpv3_del_delrec()") Signed-off-by: Eric Dumazet Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1a8d36dd49d4..eb03153dfe12 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -188,6 +188,17 @@ static void ip_ma_put(struct ip_mc_list *im) pmc != NULL; \ pmc = rtnl_dereference(pmc->next_rcu)) +static void ip_sf_list_clear_all(struct ip_sf_list *psf) +{ + struct ip_sf_list *next; + + while (psf) { + next = psf->sf_next; + kfree(psf); + psf = next; + } +} + #ifdef CONFIG_IP_MULTICAST /* @@ -633,17 +644,6 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) } } -static void ip_sf_list_clear_all(struct ip_sf_list *psf) -{ - struct ip_sf_list *next; - - while (psf) { - next = psf->sf_next; - kfree(psf); - psf = next; - } -} - static void kfree_pmc(struct ip_mc_list *pmc) { ip_sf_list_clear_all(pmc->sources); From 3f6f7a175ad4cf9a6a3dda72d7f5d122107f4b9e Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 23 May 2019 10:47:24 +0200 Subject: [PATCH 31/89] net: mvpp2: cls: Fix leaked ethtool_rx_flow_rule The flow_rule is only used when configuring the classification tables, and should be free'd once we're done using it. The current code only frees it in the error path. Fixes: 90b509b39ac9 ("net: mvpp2: cls: Add Classification offload support") Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c index d046f7a1dcf5..a57d17ab91f0 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c @@ -1271,6 +1271,9 @@ int mvpp2_ethtool_cls_rule_ins(struct mvpp2_port *port, if (ret) goto clean_eth_rule; + ethtool_rx_flow_rule_destroy(ethtool_rule); + efs->rule.flow = NULL; + memcpy(&efs->rxnfc, info, sizeof(*info)); port->rfs_rules[efs->rule.loc] = efs; port->n_rfs_rules++; From d2daa127ed51ac41217962d2b8f9c00be6e9c0d9 Mon Sep 17 00:00:00 2001 From: Andreas Oetken Date: Thu, 23 May 2019 13:57:14 +0200 Subject: [PATCH 32/89] hsr: fix don't prune the master node from the node_db Don't prune the master node in the hsr_prune_nodes function. Neither time_in[HSR_PT_SLAVE_A] nor time_in[HSR_PT_SLAVE_B] will ever be updated by hsr_register_frame_in for the master port. Thus, the master node will be repeatedly pruned leading to repeated packet loss. This bug never appeared because the hsr_prune_nodes function was only called once. Since commit 5150b45fd355 ("net: hsr: Fix node prune function for forget time expiry") this issue is fixed unveiling the issue described above. Fixes: 5150b45fd355 ("net: hsr: Fix node prune function for forget time expiry") Signed-off-by: Andreas Oetken Tested-by: Murali Karicheri Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 9fa9abd83018..2d7a19750436 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -365,6 +365,14 @@ void hsr_prune_nodes(struct timer_list *t) rcu_read_lock(); list_for_each_entry_rcu(node, &hsr->node_db, mac_list) { + /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] + * nor time_in[HSR_PT_SLAVE_B], will ever be updated for + * the master port. Thus the master node will be repeatedly + * pruned leading to packet loss. + */ + if (hsr_addr_is_self(hsr, node->macaddress_A)) + continue; + /* Shorthand */ time_a = node->time_in[HSR_PT_SLAVE_A]; time_b = node->time_in[HSR_PT_SLAVE_B]; From b5730061d1056abf317caea823b94d6e12b5b4f6 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Thu, 23 May 2019 20:41:44 +0530 Subject: [PATCH 33/89] cxgb4: offload VLAN flows regardless of VLAN ethtype VLAN flows never get offloaded unless ivlan_vld is set in filter spec. It's not compulsory for vlan_ethtype to be set. So, always enable ivlan_vld bit for offloading VLAN flows regardless of vlan_ethtype is set or not. Fixes: ad9af3e09c (cxgb4: add tc flower match support for vlan) Signed-off-by: Raju Rangoju Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index 6e2d80008a79..cfaf8f618d1f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -197,6 +197,9 @@ static void cxgb4_process_flow_match(struct net_device *dev, fs->val.ivlan = vlan_tci; fs->mask.ivlan = vlan_tci_mask; + fs->val.ivlan_vld = 1; + fs->mask.ivlan_vld = 1; + /* Chelsio adapters use ivlan_vld bit to match vlan packets * as 802.1Q. Also, when vlan tag is present in packets, * ethtype match is used then to match on ethtype of inner @@ -207,8 +210,6 @@ static void cxgb4_process_flow_match(struct net_device *dev, * ethtype value with ethtype of inner header. */ if (fs->val.ethtype == ETH_P_8021Q) { - fs->val.ivlan_vld = 1; - fs->mask.ivlan_vld = 1; fs->val.ethtype = 0; fs->mask.ethtype = 0; } From ce8d24f9a5965a58c588f9342689702a1024433c Mon Sep 17 00:00:00 2001 From: Andy Duan Date: Thu, 23 May 2019 01:55:28 +0000 Subject: [PATCH 34/89] net: fec: fix the clk mismatch in failed_reset path Fix the clk mismatch in the error path "failed_reset" because below error path will disable clk_ahb and clk_ipg directly, it should use pm_runtime_put_noidle() instead of pm_runtime_put() to avoid to call runtime resume callback. Reported-by: Baruch Siach Signed-off-by: Fugang Duan Tested-by: Baruch Siach Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index aa7d4e27c5d1..38f10f7dcbc3 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3556,7 +3556,7 @@ failed_init: if (fep->reg_phy) regulator_disable(fep->reg_phy); failed_reset: - pm_runtime_put(&pdev->dev); + pm_runtime_put_noidle(&pdev->dev); pm_runtime_disable(&pdev->dev); failed_regulator: clk_disable_unprepare(fep->clk_ahb); From ab0610efabb4c4f419a531455708caf1dd29357e Mon Sep 17 00:00:00 2001 From: Vishal Kulkarni Date: Thu, 23 May 2019 08:07:21 +0530 Subject: [PATCH 35/89] cxgb4: Revert "cxgb4: Remove SGE_HOST_PAGE_SIZE dependency on page size" This reverts commit 2391b0030e241386d710df10e53e2cfc3c5d4fc1 which has introduced regression. Now SGE's BAR2 Doorbell/GTS Page Size is interpreted correctly in the firmware itself by using actual host page size. Hence previous commit needs to be reverted. Signed-off-by: Vishal Kulkarni Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index f9b70be59792..93feb258067b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -7253,10 +7253,21 @@ int t4_fixup_host_params(struct adapter *adap, unsigned int page_size, unsigned int cache_line_size) { unsigned int page_shift = fls(page_size) - 1; + unsigned int sge_hps = page_shift - 10; unsigned int stat_len = cache_line_size > 64 ? 128 : 64; unsigned int fl_align = cache_line_size < 32 ? 32 : cache_line_size; unsigned int fl_align_log = fls(fl_align) - 1; + t4_write_reg(adap, SGE_HOST_PAGE_SIZE_A, + HOSTPAGESIZEPF0_V(sge_hps) | + HOSTPAGESIZEPF1_V(sge_hps) | + HOSTPAGESIZEPF2_V(sge_hps) | + HOSTPAGESIZEPF3_V(sge_hps) | + HOSTPAGESIZEPF4_V(sge_hps) | + HOSTPAGESIZEPF5_V(sge_hps) | + HOSTPAGESIZEPF6_V(sge_hps) | + HOSTPAGESIZEPF7_V(sge_hps)); + if (is_t4(adap->params.chip)) { t4_set_reg_field(adap, SGE_CONTROL_A, INGPADBOUNDARY_V(INGPADBOUNDARY_M) | From 4097e9d250fb17958c1d9b94538386edd3f20144 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 23 May 2019 09:32:31 +0300 Subject: [PATCH 36/89] net: sched: don't use tc_action->order during action dump Function tcf_action_dump() relies on tc_action->order field when starting nested nla to send action data to userspace. This approach breaks in several cases: - When multiple filters point to same shared action, tc_action->order field is overwritten each time it is attached to filter. This causes filter dump to output action with incorrect attribute for all filters that have the action in different position (different order) from the last set tc_action->order value. - When action data is displayed using tc action API (RTM_GETACTION), action order is overwritten by tca_action_gd() according to its position in resulting array of nl attributes, which will break filter dump for all filters attached to that shared action that expect it to have different order value. Don't rely on tc_action->order when dumping actions. Set nla according to action position in resulting array of actions instead. Signed-off-by: Vlad Buslov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 683fcc00da49..c42ecf4b3c10 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -800,7 +800,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { a = actions[i]; - nest = nla_nest_start_noflag(skb, a->order); + nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_1(skb, a, bind, ref); @@ -1303,7 +1303,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, ret = PTR_ERR(act); goto err; } - act->order = i; attr_size += tcf_action_fill_size(act); actions[i - 1] = act; } From 95baa60a0da80a0143e3ddd4d3725758b4513825 Mon Sep 17 00:00:00 2001 From: Gen Zhang Date: Fri, 24 May 2019 11:19:46 +0800 Subject: [PATCH 37/89] ipv6_sockglue: Fix a missing-check bug in ip6_ra_control() In function ip6_ra_control(), the pointer new_ra is allocated a memory space via kmalloc(). And it is used in the following codes. However, when there is a memory allocation error, kmalloc() fails. Thus null pointer dereference may happen. And it will cause the kernel to crash. Therefore, we should check the return value and handle the error. Signed-off-by: Gen Zhang Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 40f21fef25ff..0a3d035feb61 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -68,6 +68,8 @@ int ip6_ra_control(struct sock *sk, int sel) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + if (sel >= 0 && !new_ra) + return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { From 425aa0e1d01513437668fa3d4a971168bbaa8515 Mon Sep 17 00:00:00 2001 From: Gen Zhang Date: Fri, 24 May 2019 11:24:26 +0800 Subject: [PATCH 38/89] ip_sockglue: Fix missing-check bug in ip_ra_control() In function ip_ra_control(), the pointer new_ra is allocated a memory space via kmalloc(). And it is used in the following codes. However, when there is a memory allocation error, kmalloc() fails. Thus null pointer dereference may happen. And it will cause the kernel to crash. Therefore, we should check the return value and handle the error. Signed-off-by: Gen Zhang Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 82f341e84fae..aa3fd61818c4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -343,6 +343,8 @@ int ip_ra_control(struct sock *sk, unsigned char on, return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + if (on && !new_ra) + return -ENOMEM; mutex_lock(&net->ipv4.ra_mutex); for (rap = &net->ipv4.ra_chain; From 4523a5611526709ec9b4e2574f1bb7818212651e Mon Sep 17 00:00:00 2001 From: Biao Huang Date: Fri, 24 May 2019 14:26:07 +0800 Subject: [PATCH 39/89] net: stmmac: update rx tail pointer register to fix rx dma hang issue. Currently we will not update the receive descriptor tail pointer in stmmac_rx_refill. Rx dma will think no available descriptors and stop once received packets exceed DMA_RX_SIZE, so that the rx only test will fail. Update the receive tail pointer in stmmac_rx_refill to add more descriptors to the rx channel, so packets can be received continually Fixes: 54139cf3bb33 ("net: stmmac: adding multiple buffers for rx") Signed-off-by: Biao Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 2a1052704885..bc93c355f403 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3338,6 +3338,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue) entry = STMMAC_GET_ENTRY(entry, DMA_RX_SIZE); } rx_q->dirty_rx = entry; + stmmac_set_rx_tail_ptr(priv, priv->ioaddr, rx_q->rx_tail_addr, queue); } /** From 5e7f7fc538d894b2d9aa41876b8dcf35f5fe11e6 Mon Sep 17 00:00:00 2001 From: Biao Huang Date: Fri, 24 May 2019 14:26:08 +0800 Subject: [PATCH 40/89] net: stmmac: fix csr_clk can't be zero issue The specific clk_csr value can be zero, and stmmac_clk is necessary for MDC clock which can be set dynamically. So, change the condition from plat->clk_csr to plat->stmmac_clk to fix clk_csr can't be zero issue. Fixes: cd7201f477b9 ("stmmac: MDC clock dynamically based on the csr clock input") Signed-off-by: Biao Huang Acked-by: Alexandre TORGUE Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 +++--- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index bc93c355f403..65e57b9f6887 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4380,10 +4380,10 @@ int stmmac_dvr_probe(struct device *device, * set the MDC clock dynamically according to the csr actual * clock input. */ - if (!priv->plat->clk_csr) - stmmac_clk_csr_set(priv); - else + if (priv->plat->clk_csr >= 0) priv->clk_csr = priv->plat->clk_csr; + else + stmmac_clk_csr_set(priv); stmmac_check_pcs_mode(priv); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3031f2bf15d6..f45bfbef97d0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -408,7 +408,10 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) /* Default to phy auto-detection */ plat->phy_addr = -1; - /* Get clk_csr from device tree */ + /* Default to get clk_csr from stmmac_clk_crs_set(), + * or get clk_csr from device tree. + */ + plat->clk_csr = -1; of_property_read_u32(np, "clk_csr", &plat->clk_csr); /* "snps,phy-addr" is not a standard property. Mark it as deprecated From f4ca7a9260dfe700f2a16f0881825de625067515 Mon Sep 17 00:00:00 2001 From: Biao Huang Date: Fri, 24 May 2019 14:26:09 +0800 Subject: [PATCH 41/89] net: stmmac: dwmac-mediatek: modify csr_clk value to fix mdio read/write fail 1. the frequency of csr clock is 66.5MHz, so the csr_clk value should be 0 other than 5. 2. the csr_clk can be got from device tree, so remove initialization here. Fixes: 9992f37e346b ("stmmac: dwmac-mediatek: add support for mt2712") Signed-off-by: Biao Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index bf2562995fc8..126b66bb73a6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -346,8 +346,6 @@ static int mediatek_dwmac_probe(struct platform_device *pdev) return PTR_ERR(plat_dat); plat_dat->interface = priv_plat->phy_mode; - /* clk_csr_i = 250-300MHz & MDC = clk_csr_i/124 */ - plat_dat->clk_csr = 5; plat_dat->has_gmac4 = 1; plat_dat->has_gmac = 0; plat_dat->pmt = 0; From f4bcf14e3997a422169e5a661fa811868ca221d8 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 24 May 2019 12:05:30 +0200 Subject: [PATCH 42/89] net: ethtool: Document get_rxfh_context and set_rxfh_context ethtool ops ethtool ops get_rxfh_context and set_rxfh_context are used to create, remove and access parameters associated to RSS contexts, in a similar fashion to get_rxfh and set_rxfh. Add a small descritopn of these callbacks in the struct ethtool_ops doc. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/ethtool.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e6ebc9761822..95991e4300bf 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -260,6 +260,15 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * will remain unchanged. * Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. + * @get_rxfh_context: Get the contents of the RX flow hash indirection table, + * hash key, and/or hash function assiciated to the given rss context. + * Returns a negative error code or zero. + * @set_rxfh_context: Create, remove and configure RSS contexts. Allows setting + * the contents of the RX flow hash indirection table, hash key, and/or + * hash function associated to the given context. Arguments which are set + * to %NULL or zero will remain unchanged. + * Returns a negative error code or zero. An error code must be returned + * if at least one unsupported change was requested. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or * zero. From 334031219a84b9994594015aab85ed7754c80176 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 24 May 2019 09:49:28 -0400 Subject: [PATCH 43/89] bonding/802.3ad: fix slave link initialization transition states Once in a while, with just the right timing, 802.3ad slaves will fail to properly initialize, winding up in a weird state, with a partner system mac address of 00:00:00:00:00:00. This started happening after a fix to properly track link_failure_count tracking, where an 802.3ad slave that reported itself as link up in the miimon code, but wasn't able to get a valid speed/duplex, started getting set to BOND_LINK_FAIL instead of BOND_LINK_DOWN. That was the proper thing to do for the general "my link went down" case, but has created a link initialization race that can put the interface in this odd state. The simple fix is to instead set the slave link to BOND_LINK_DOWN again, if the link has never been up (last_link_up == 0), so the link state doesn't bounce from BOND_LINK_DOWN to BOND_LINK_FAIL -- it hasn't failed in this case, it simply hasn't been up yet, and this prevents the unnecessary state change from DOWN to FAIL and getting stuck in an init failure w/o a partner mac. Fixes: ea53abfab960 ("bonding/802.3ad: fix link_failure_count tracking") CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: "David S. Miller" CC: netdev@vger.kernel.org Tested-by: Heesoon Kim Signed-off-by: Jarod Wilson Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 062fa7e3af4c..407f4095a37a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3122,13 +3122,18 @@ static int bond_slave_netdev_event(unsigned long event, case NETDEV_CHANGE: /* For 802.3ad mode only: * Getting invalid Speed/Duplex values here will put slave - * in weird state. So mark it as link-fail for the time - * being and let link-monitoring (miimon) set it right when - * correct speeds/duplex are available. + * in weird state. Mark it as link-fail if the link was + * previously up or link-down if it hasn't yet come up, and + * let link-monitoring (miimon) set it right when correct + * speeds/duplex are available. */ if (bond_update_speed_duplex(slave) && - BOND_MODE(bond) == BOND_MODE_8023AD) - slave->link = BOND_LINK_FAIL; + BOND_MODE(bond) == BOND_MODE_8023AD) { + if (slave->last_link_up) + slave->link = BOND_LINK_FAIL; + else + slave->link = BOND_LINK_DOWN; + } if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_adapter_speed_duplex_changed(slave); From 5a20a093d965560f632b2ec325f8876918f78165 Mon Sep 17 00:00:00 2001 From: Ioana Radulescu Date: Fri, 24 May 2019 18:15:15 +0300 Subject: [PATCH 44/89] dpaa2-eth: Fix potential spectre issue Smatch reports a potential spectre vulnerability in the dpaa2-eth driver, where the value of rxnfc->fs.location (which is provided from user-space) is used as index in an array. Add a call to array_index_nospec() to sanitize the access. Signed-off-by: Ioana Radulescu Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c index 76bd8d2872cc..7b182f4b263c 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c @@ -4,6 +4,7 @@ */ #include +#include #include "dpni.h" /* DPNI_LINK_OPT_* */ #include "dpaa2-eth.h" @@ -648,6 +649,8 @@ static int dpaa2_eth_get_rxnfc(struct net_device *net_dev, case ETHTOOL_GRXCLSRULE: if (rxnfc->fs.location >= max_rules) return -EINVAL; + rxnfc->fs.location = array_index_nospec(rxnfc->fs.location, + max_rules); if (!priv->cls_rules[rxnfc->fs.location].in_use) return -EINVAL; rxnfc->fs = priv->cls_rules[rxnfc->fs.location].fs; From bd8460fa4de46e9d6177af4fe33bf0763a7af4b7 Mon Sep 17 00:00:00 2001 From: Ioana Radulescu Date: Fri, 24 May 2019 18:15:16 +0300 Subject: [PATCH 45/89] dpaa2-eth: Use PTR_ERR_OR_ZERO where appropriate Use PTR_ERR_OR_ZERO instead of PTR_ERR in cases where zero is a valid input. Reported by smatch. Signed-off-by: Ioana Radulescu Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 63b1ecc18c26..7d2390e3df77 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1972,7 +1972,7 @@ alloc_channel(struct dpaa2_eth_priv *priv) channel->dpcon = setup_dpcon(priv); if (IS_ERR_OR_NULL(channel->dpcon)) { - err = PTR_ERR(channel->dpcon); + err = PTR_ERR_OR_ZERO(channel->dpcon); goto err_setup; } @@ -2028,7 +2028,7 @@ static int setup_dpio(struct dpaa2_eth_priv *priv) /* Try to allocate a channel */ channel = alloc_channel(priv); if (IS_ERR_OR_NULL(channel)) { - err = PTR_ERR(channel); + err = PTR_ERR_OR_ZERO(channel); if (err != -EPROBE_DEFER) dev_info(dev, "No affine channel for cpu %d and above\n", i); From 4ca6dee5220fe2377bf12b354ef85978425c9ec7 Mon Sep 17 00:00:00 2001 From: Ioana Radulescu Date: Fri, 24 May 2019 18:15:17 +0300 Subject: [PATCH 46/89] dpaa2-eth: Make constant 64-bit long Function dpaa2_eth_cls_key_size() expects a 64bit argument, but DPAA2_ETH_DIST_ALL is defined as UINT_MAX. Fix this. Signed-off-by: Ioana Radulescu Reported-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 5fb8f5c0dc9f..e180d5a68c98 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -467,7 +467,7 @@ enum dpaa2_eth_rx_dist { #define DPAA2_ETH_DIST_IPPROTO BIT(6) #define DPAA2_ETH_DIST_L4SRC BIT(7) #define DPAA2_ETH_DIST_L4DST BIT(8) -#define DPAA2_ETH_DIST_ALL (~0U) +#define DPAA2_ETH_DIST_ALL (~0ULL) static inline unsigned int dpaa2_eth_needed_headroom(struct dpaa2_eth_priv *priv, From 46a1695960d0600d58da7af33c65f24f3d839674 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 24 May 2019 10:34:30 -0700 Subject: [PATCH 47/89] net/tls: fix lowat calculation if some data came from previous record If some of the data came from the previous record, i.e. from the rx_list it had already been decrypted, so it's not counted towards the "decrypted" variable, but the "copied" variable. Take that into account when checking lowat. When calculating lowat target we need to pass the original len. E.g. if lowat is at 80, len is 100 and we had 30 bytes on rx_list target would currently be incorrectly calculated as 70, even though we only need 50 more bytes to make up the 80. Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Tested-by: David Beckett Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d93f83f77864..fc13234db74a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1712,13 +1712,12 @@ int tls_sw_recvmsg(struct sock *sk, copied = err; } - len = len - copied; - if (len) { - target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - } else { + if (len <= copied) goto recv_end; - } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + len = len - copied; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { bool retain_skb = false; @@ -1853,7 +1852,7 @@ pick_next_record: } /* If we have a new message from strparser, continue now. */ - if (decrypted >= target && !ctx->recv_pkt) + if (decrypted + copied >= target && !ctx->recv_pkt) break; } while (len); From 7718a855cd7ae9fc27a2aa1532ee105d52eb7634 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 24 May 2019 10:34:31 -0700 Subject: [PATCH 48/89] selftests/tls: test for lowat overshoot with multiple records Set SO_RCVLOWAT and test it gets respected when gathering data from multiple records. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 47ddfc154036..01efbcd2258c 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -575,6 +575,25 @@ TEST_F(tls, recv_peek_large_buf_mult_recs) EXPECT_EQ(memcmp(test_str, buf, len), 0); } +TEST_F(tls, recv_lowat) +{ + char send_mem[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + char recv_mem[20]; + int lowat = 8; + + EXPECT_EQ(send(self->fd, send_mem, 10, 0), 10); + EXPECT_EQ(send(self->fd, send_mem, 5, 0), 5); + + memset(recv_mem, 0, 20); + EXPECT_EQ(setsockopt(self->cfd, SOL_SOCKET, SO_RCVLOWAT, + &lowat, sizeof(lowat)), 0); + EXPECT_EQ(recv(self->cfd, recv_mem, 1, MSG_WAITALL), 1); + EXPECT_EQ(recv(self->cfd, recv_mem + 1, 6, MSG_WAITALL), 6); + EXPECT_EQ(recv(self->cfd, recv_mem + 7, 10, 0), 8); + + EXPECT_EQ(memcmp(send_mem, recv_mem, 10), 0); + EXPECT_EQ(memcmp(send_mem, recv_mem + 10, 5), 0); +} TEST_F(tls, pollin) { From 04b25a5411f966c2e586909a8496553b71876fae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 24 May 2019 10:34:32 -0700 Subject: [PATCH 49/89] net/tls: fix no wakeup on partial reads When tls_sw_recvmsg() partially copies a record it pops that record from ctx->recv_pkt and places it on rx_list. Next iteration of tls_sw_recvmsg() reads from rx_list via process_rx_list() before it enters the decryption loop. If there is no more records to be read tls_wait_data() will put the process on the wait queue and got to sleep. This is incorrect, because some data was already copied in process_rx_list(). In case of RPC connections process may never get woken up, because peer also simply blocks in read(). I think this may also fix a similar issue when BPF is at play, because after __tcp_bpf_recvmsg() returns some data we subtract it from len and use continue to restart the loop, but len could have just reached 0, so again we'd sleep unnecessarily. That's added by: commit d3b18ad31f93 ("tls: add bpf support to sk_msg handling") Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Reported-by: David Beckett Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Tested-by: David Beckett Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fc13234db74a..960494f437ac 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1719,7 +1719,7 @@ int tls_sw_recvmsg(struct sock *sk, len = len - copied; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - do { + while (len && (decrypted + copied < target || ctx->recv_pkt)) { bool retain_skb = false; bool zc = false; int to_decrypt; @@ -1850,11 +1850,7 @@ pick_next_record: } else { break; } - - /* If we have a new message from strparser, continue now. */ - if (decrypted + copied >= target && !ctx->recv_pkt) - break; - } while (len); + } recv_end: if (num_async) { From 043556d0917a1a5ea58795fe1656a2bce06d2649 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 24 May 2019 10:34:33 -0700 Subject: [PATCH 50/89] selftests/tls: add test for sleeping even though there is data Add a test which sends 15 bytes of data, and then tries to read 10 byes twice. Previously the second read would sleep indifinitely, since the record was already decrypted and there is only 5 bytes left, not full 10. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 01efbcd2258c..278c86134556 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -442,6 +442,21 @@ TEST_F(tls, multiple_send_single_recv) EXPECT_EQ(memcmp(send_mem, recv_mem + send_len, send_len), 0); } +TEST_F(tls, single_send_multiple_recv_non_align) +{ + const unsigned int total_len = 15; + const unsigned int recv_len = 10; + char recv_mem[recv_len * 2]; + char send_mem[total_len]; + + EXPECT_GE(send(self->fd, send_mem, total_len, 0), 0); + memset(recv_mem, 0, total_len); + + EXPECT_EQ(recv(self->cfd, recv_mem, recv_len, 0), recv_len); + EXPECT_EQ(recv(self->cfd, recv_mem + recv_len, recv_len, 0), 5); + EXPECT_EQ(memcmp(send_mem, recv_mem, total_len), 0); +} + TEST_F(tls, recv_partial) { char const *test_str = "test_read_partial"; From 3e66b7cc50ef921121babc91487e1fb98af1ba6e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 24 May 2019 13:20:19 -0700 Subject: [PATCH 51/89] net: tulip: de4x5: Drop redundant MODULE_DEVICE_TABLE() Building with Clang reports the redundant use of MODULE_DEVICE_TABLE(): drivers/net/ethernet/dec/tulip/de4x5.c:2110:1: error: redefinition of '__mod_eisa__de4x5_eisa_ids_device_table' MODULE_DEVICE_TABLE(eisa, de4x5_eisa_ids); ^ ./include/linux/module.h:229:21: note: expanded from macro 'MODULE_DEVICE_TABLE' extern typeof(name) __mod_##type##__##name##_device_table \ ^ :90:1: note: expanded from here __mod_eisa__de4x5_eisa_ids_device_table ^ drivers/net/ethernet/dec/tulip/de4x5.c:2100:1: note: previous definition is here MODULE_DEVICE_TABLE(eisa, de4x5_eisa_ids); ^ ./include/linux/module.h:229:21: note: expanded from macro 'MODULE_DEVICE_TABLE' extern typeof(name) __mod_##type##__##name##_device_table \ ^ :85:1: note: expanded from here __mod_eisa__de4x5_eisa_ids_device_table ^ This drops the one further from the table definition to match the common use of MODULE_DEVICE_TABLE(). Fixes: 07563c711fbc ("EISA bus MODALIAS attributes support") Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/de4x5.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index 66535d1653f6..f16853c3c851 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -2107,7 +2107,6 @@ static struct eisa_driver de4x5_eisa_driver = { .remove = de4x5_eisa_remove, } }; -MODULE_DEVICE_TABLE(eisa, de4x5_eisa_ids); #endif #ifdef CONFIG_PCI From 31bafc49a7736989e4c2d9f7280002c66536e590 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sat, 25 May 2019 09:57:59 +0000 Subject: [PATCH 52/89] net: aquantia: tx clean budget logic error In case no other traffic happening on the ring, full tx cleanup may not be completed. That may cause socket buffer to overflow and tx traffic to stuck until next activity on the ring happens. This is due to logic error in budget variable decrementor. Variable is compared with zero, and then post decremented, causing it to become MAX_INT. Solution is remove decrementor from the `for` statement and rewrite it in a clear way. Fixes: b647d3980948e ("net: aquantia: Add tx clean budget and valid budget handling logic") Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 350e385528fd..63ed00415904 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -223,10 +223,10 @@ void aq_ring_queue_stop(struct aq_ring_s *ring) bool aq_ring_tx_clean(struct aq_ring_s *self) { struct device *dev = aq_nic_get_dev(self->aq_nic); - unsigned int budget = AQ_CFG_TX_CLEAN_BUDGET; + unsigned int budget; - for (; self->sw_head != self->hw_head && budget--; - self->sw_head = aq_ring_next_dx(self, self->sw_head)) { + for (budget = AQ_CFG_TX_CLEAN_BUDGET; + budget && self->sw_head != self->hw_head; budget--) { struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head]; if (likely(buff->is_mapped)) { @@ -251,6 +251,7 @@ bool aq_ring_tx_clean(struct aq_ring_s *self) buff->pa = 0U; buff->eop_index = 0xffffU; + self->sw_head = aq_ring_next_dx(self, self->sw_head); } return !!budget; From f38f1ee8aeb2c19f65fc29de49bed231a868198c Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Sat, 25 May 2019 09:58:01 +0000 Subject: [PATCH 53/89] net: aquantia: check rx csum for all packets in LRO session Atlantic hardware does not aggregate nor breaks LRO sessions with bad csum packets. This means driver should take care of that. If in LRO session there is a non-first descriptor with invalid checksum (L2/L3/L4), the driver must account this information in csum application logic. Fixes: 018423e90bee8 ("net: ethernet: aquantia: Add ring support code") Signed-off-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/aq_ring.c | 44 +++++++++++++------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 63ed00415904..941b0beb87ef 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -299,35 +299,47 @@ int aq_ring_rx_clean(struct aq_ring_s *self, unsigned int i = 0U; u16 hdr_len; - if (buff->is_error) - continue; - if (buff->is_cleaned) continue; if (!buff->is_eop) { - for (next_ = buff->next, - buff_ = &self->buff_ring[next_]; true; - next_ = buff_->next, - buff_ = &self->buff_ring[next_]) { + buff_ = buff; + do { + next_ = buff_->next, + buff_ = &self->buff_ring[next_]; is_rsc_completed = aq_ring_dx_in_range(self->sw_head, next_, self->hw_head); - if (unlikely(!is_rsc_completed)) { - is_rsc_completed = false; + if (unlikely(!is_rsc_completed)) break; - } - if (buff_->is_eop) - break; - } + buff->is_error |= buff_->is_error; + + } while (!buff_->is_eop); if (!is_rsc_completed) { err = 0; goto err_exit; } + if (buff->is_error) { + buff_ = buff; + do { + next_ = buff_->next, + buff_ = &self->buff_ring[next_]; + + buff_->is_cleaned = true; + } while (!buff_->is_eop); + + ++self->stats.rx.errors; + continue; + } + } + + if (buff->is_error) { + ++self->stats.rx.errors; + continue; } dma_sync_single_range_for_cpu(aq_nic_get_dev(self->aq_nic), @@ -390,6 +402,12 @@ int aq_ring_rx_clean(struct aq_ring_s *self, AQ_CFG_RX_FRAME_MAX); page_ref_inc(buff_->rxdata.page); buff_->is_cleaned = 1; + + buff->is_ip_cso &= buff_->is_ip_cso; + buff->is_udp_cso &= buff_->is_udp_cso; + buff->is_tcp_cso &= buff_->is_tcp_cso; + buff->is_cso_err |= buff_->is_cso_err; + } while (!buff_->is_eop); } } From eaeb3b7494ba9159323814a8ce8af06a9277d99b Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Sat, 25 May 2019 09:58:03 +0000 Subject: [PATCH 54/89] net: aquantia: fix LRO with FCS error Driver stops producing skbs on ring if a packet with FCS error was coalesced into LRO session. Ring gets hang forever. Thats a logical error in driver processing descriptors: When rx_stat indicates MAC Error, next pointer and eop flags are not filled. This confuses driver so it waits for descriptor 0 to be filled by HW. Solution is fill next pointer and eop flag even for packets with FCS error. Fixes: bab6de8fd180b ("net: ethernet: aquantia: Atlantic A0 and B0 specific functions.") Signed-off-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- .../aquantia/atlantic/hw_atl/hw_atl_b0.c | 59 ++++++++++--------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index bfcda12d73de..e979f227cf0b 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -713,38 +713,41 @@ static int hw_atl_b0_hw_ring_rx_receive(struct aq_hw_s *self, if ((rx_stat & BIT(0)) || rxd_wb->type & 0x1000U) { /* MAC error or DMA error */ buff->is_error = 1U; - } else { - if (self->aq_nic_cfg->is_rss) { - /* last 4 byte */ - u16 rss_type = rxd_wb->type & 0xFU; + } + if (self->aq_nic_cfg->is_rss) { + /* last 4 byte */ + u16 rss_type = rxd_wb->type & 0xFU; - if (rss_type && rss_type < 0x8U) { - buff->is_hash_l4 = (rss_type == 0x4 || - rss_type == 0x5); - buff->rss_hash = rxd_wb->rss_hash; - } + if (rss_type && rss_type < 0x8U) { + buff->is_hash_l4 = (rss_type == 0x4 || + rss_type == 0x5); + buff->rss_hash = rxd_wb->rss_hash; } + } - if (HW_ATL_B0_RXD_WB_STAT2_EOP & rxd_wb->status) { - buff->len = rxd_wb->pkt_len % - AQ_CFG_RX_FRAME_MAX; - buff->len = buff->len ? - buff->len : AQ_CFG_RX_FRAME_MAX; - buff->next = 0U; - buff->is_eop = 1U; + if (HW_ATL_B0_RXD_WB_STAT2_EOP & rxd_wb->status) { + buff->len = rxd_wb->pkt_len % + AQ_CFG_RX_FRAME_MAX; + buff->len = buff->len ? + buff->len : AQ_CFG_RX_FRAME_MAX; + buff->next = 0U; + buff->is_eop = 1U; + } else { + buff->len = + rxd_wb->pkt_len > AQ_CFG_RX_FRAME_MAX ? + AQ_CFG_RX_FRAME_MAX : rxd_wb->pkt_len; + + if (HW_ATL_B0_RXD_WB_STAT2_RSCCNT & + rxd_wb->status) { + /* LRO */ + buff->next = rxd_wb->next_desc_ptr; + ++ring->stats.rx.lro_packets; } else { - if (HW_ATL_B0_RXD_WB_STAT2_RSCCNT & - rxd_wb->status) { - /* LRO */ - buff->next = rxd_wb->next_desc_ptr; - ++ring->stats.rx.lro_packets; - } else { - /* jumbo */ - buff->next = - aq_ring_next_dx(ring, - ring->hw_head); - ++ring->stats.rx.jumbo_packets; - } + /* jumbo */ + buff->next = + aq_ring_next_dx(ring, + ring->hw_head); + ++ring->stats.rx.jumbo_packets; } } } From 76f254d4afe2f9c5860922d5304821b4ef05b712 Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Sat, 25 May 2019 09:58:05 +0000 Subject: [PATCH 55/89] net: aquantia: tcp checksum 0xffff being handled incorrectly Thats a known quirk in windows tcp stack it can produce 0xffff checksum. Thats incorrect but it is. Atlantic HW with LRO enabled handles that incorrectly and changes csum to 0xfffe - but indicates that csum is invalid. This causes driver to pass packet to linux networking stack with CSUM_NONE, stack eventually drops the packet. There is a quirk in atlantic HW to enable correct processing of 0xffff incorrect csum. Enable it. The visible bug is that windows link partner with software generated csums caused TCP connection to be unstable since all packets that csum value are dropped. Reported-by: Dmitry Bezrukov Signed-off-by: Nikita Danilov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index e979f227cf0b..5c3065bdfddf 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -266,12 +266,11 @@ static int hw_atl_b0_hw_offload_set(struct aq_hw_s *self, */ hw_atl_rpo_lro_max_coalescing_interval_set(self, 50); - hw_atl_rpo_lro_qsessions_lim_set(self, 1U); hw_atl_rpo_lro_total_desc_lim_set(self, 2U); - hw_atl_rpo_lro_patch_optimization_en_set(self, 0U); + hw_atl_rpo_lro_patch_optimization_en_set(self, 1U); hw_atl_rpo_lro_min_pay_of_first_pkt_set(self, 10U); From 54ed6fd2e03fd38819afa60a5b4cb61874230d34 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 27 May 2019 12:52:51 +0200 Subject: [PATCH 56/89] net: stmmac: Do not output error on deferred probe If the subdriver defers probe, do not show an error message. It's perfectly fine for this error to occur since the driver will get another chance to probe after some time and will usually succeed after all of the resources that it requires have been registered. Signed-off-by: Thierry Reding Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 3256e5cbad27..5bc224834c77 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -455,7 +455,11 @@ static int dwc_eth_dwmac_probe(struct platform_device *pdev) priv = data->probe(pdev, plat_dat, &stmmac_res); if (IS_ERR(priv)) { ret = PTR_ERR(priv); - dev_err(&pdev->dev, "failed to probe subdriver: %d\n", ret); + + if (ret != -EPROBE_DEFER) + dev_err(&pdev->dev, "failed to probe subdriver: %d\n", + ret); + goto remove_config; } From d484e06e25ebb937d841dac02ac1fe76ec7d4ddd Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 27 May 2019 11:04:17 +0000 Subject: [PATCH 57/89] net: mvneta: Fix err code path of probe Fix below issues in err code path of probe: 1. we don't need to unregister_netdev() because the netdev isn't registered. 2. when register_netdev() fails, we also need to destroy bm pool for HWBM case. Fixes: dc35a10f68d3 ("net: mvneta: bm: add support for hardware buffer management") Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index e758650b2c26..269bd73be1a0 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4674,7 +4674,7 @@ static int mvneta_probe(struct platform_device *pdev) err = register_netdev(dev); if (err < 0) { dev_err(&pdev->dev, "failed to register\n"); - goto err_free_stats; + goto err_netdev; } netdev_info(dev, "Using %s mac address %pM\n", mac_from, @@ -4685,14 +4685,12 @@ static int mvneta_probe(struct platform_device *pdev) return 0; err_netdev: - unregister_netdev(dev); if (pp->bm_priv) { mvneta_bm_pool_destroy(pp->bm_priv, pp->pool_long, 1 << pp->id); mvneta_bm_pool_destroy(pp->bm_priv, pp->pool_short, 1 << pp->id); mvneta_bm_put(pp->bm_priv); } -err_free_stats: free_percpu(pp->stats); err_free_ports: free_percpu(pp->ports); From 7aae703f8096d21e34ce5f34f16715587bc30902 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Mon, 27 May 2019 15:24:05 +0300 Subject: [PATCH 58/89] dpaa_eth: use only online CPU portals Make sure only the portals for the online CPUs are used. Without this change, there are issues when someone boots with maxcpus=n, with n < actual number of cores available as frames either received or corresponding to the transmit confirmation path would be offered for dequeue to the offline CPU portals, getting lost. Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 9 ++++----- drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index d3f2408dc9e8..f38c3fa7d705 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -780,7 +780,7 @@ static void dpaa_eth_add_channel(u16 channel) struct qman_portal *portal; int cpu; - for_each_cpu(cpu, cpus) { + for_each_cpu_and(cpu, cpus, cpu_online_mask) { portal = qman_get_affine_portal(cpu); qman_p_static_dequeue_add(portal, pool); } @@ -896,7 +896,7 @@ static void dpaa_fq_setup(struct dpaa_priv *priv, u16 channels[NR_CPUS]; struct dpaa_fq *fq; - for_each_cpu(cpu, affine_cpus) + for_each_cpu_and(cpu, affine_cpus, cpu_online_mask) channels[num_portals++] = qman_affine_channel(cpu); if (num_portals == 0) @@ -2174,7 +2174,6 @@ static int dpaa_eth_poll(struct napi_struct *napi, int budget) if (cleaned < budget) { napi_complete_done(napi, cleaned); qman_p_irqsource_add(np->p, QM_PIRQ_DQRI); - } else if (np->down) { qman_p_irqsource_add(np->p, QM_PIRQ_DQRI); } @@ -2448,7 +2447,7 @@ static void dpaa_eth_napi_enable(struct dpaa_priv *priv) struct dpaa_percpu_priv *percpu_priv; int i; - for_each_possible_cpu(i) { + for_each_online_cpu(i) { percpu_priv = per_cpu_ptr(priv->percpu_priv, i); percpu_priv->np.down = 0; @@ -2461,7 +2460,7 @@ static void dpaa_eth_napi_disable(struct dpaa_priv *priv) struct dpaa_percpu_priv *percpu_priv; int i; - for_each_possible_cpu(i) { + for_each_online_cpu(i) { percpu_priv = per_cpu_ptr(priv->percpu_priv, i); percpu_priv->np.down = 1; diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index bdee441bc3b7..7ce2e99b594d 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -569,7 +569,7 @@ static int dpaa_set_coalesce(struct net_device *dev, qman_dqrr_get_ithresh(portal, &prev_thresh); /* set new values */ - for_each_cpu(cpu, cpus) { + for_each_cpu_and(cpu, cpus, cpu_online_mask) { portal = qman_get_affine_portal(cpu); res = qman_portal_set_iperiod(portal, period); if (res) @@ -586,7 +586,7 @@ static int dpaa_set_coalesce(struct net_device *dev, revert_values: /* restore previous values */ - for_each_cpu(cpu, cpus) { + for_each_cpu_and(cpu, cpus, cpu_online_mask) { if (!needs_revert[cpu]) continue; portal = qman_get_affine_portal(cpu); From 73f51d151e6c760d74d7d4bde75337ebe5693b3e Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Mon, 27 May 2019 19:42:23 +0200 Subject: [PATCH 59/89] selftests: pmtu: Fix encapsulating device in pmtu_vti6_link_change_mtu In the pmtu_vti6_link_change_mtu test, both local and remote addresses for the vti6 tunnel are assigned to the same address given to the dummy interface that we use as encapsulating device with a known MTU. This works as long as the dummy interface is actually selected, via rt6_lookup(), as encapsulating device. But if the remote address of the tunnel is a local address too, the loopback interface could also be selected, and there's nothing wrong with it. This is what some older -stable kernels do (3.18.z, at least), and nothing prevents us from subtly changing FIB implementation to revert back to that behaviour in the future. Define an IPv6 prefix instead, and use two separate addresses as local and remote for vti6, so that the encapsulating device can't be a loopback interface. Reported-by: Xiumei Mu Fixes: 1fad59ea1c34 ("selftests: pmtu: Add pmtu_vti6_link_change_mtu test") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index b9171a7b3aaa..317dafcd605d 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -208,8 +208,8 @@ tunnel6_a_addr="fd00:2::a" tunnel6_b_addr="fd00:2::b" tunnel6_mask="64" -dummy6_0_addr="fc00:1000::0" -dummy6_1_addr="fc00:1001::0" +dummy6_0_prefix="fc00:1000::" +dummy6_1_prefix="fc00:1001::" dummy6_mask="64" cleanup_done=1 @@ -1005,13 +1005,13 @@ test_pmtu_vti6_link_change_mtu() { run_cmd ${ns_a} ip link set dummy0 up run_cmd ${ns_a} ip link set dummy1 up - run_cmd ${ns_a} ip addr add ${dummy6_0_addr}/${dummy6_mask} dev dummy0 - run_cmd ${ns_a} ip addr add ${dummy6_1_addr}/${dummy6_mask} dev dummy1 + run_cmd ${ns_a} ip addr add ${dummy6_0_prefix}1/${dummy6_mask} dev dummy0 + run_cmd ${ns_a} ip addr add ${dummy6_1_prefix}1/${dummy6_mask} dev dummy1 fail=0 # Create vti6 interface bound to device, passing MTU, check it - run_cmd ${ns_a} ip link add vti6_a mtu 1300 type vti6 remote ${dummy6_0_addr} local ${dummy6_0_addr} + run_cmd ${ns_a} ip link add vti6_a mtu 1300 type vti6 remote ${dummy6_0_prefix}2 local ${dummy6_0_prefix}1 mtu="$(link_get_mtu "${ns_a}" vti6_a)" if [ ${mtu} -ne 1300 ]; then err " vti6 MTU ${mtu} doesn't match configured value 1300" @@ -1020,7 +1020,7 @@ test_pmtu_vti6_link_change_mtu() { # Move to another device with different MTU, without passing MTU, check # MTU is adjusted - run_cmd ${ns_a} ip link set vti6_a type vti6 remote ${dummy6_1_addr} local ${dummy6_1_addr} + run_cmd ${ns_a} ip link set vti6_a type vti6 remote ${dummy6_1_prefix}2 local ${dummy6_1_prefix}1 mtu="$(link_get_mtu "${ns_a}" vti6_a)" if [ ${mtu} -ne $((3000 - 40)) ]; then err " vti MTU ${mtu} is not dummy MTU 3000 minus IPv6 header length" @@ -1028,7 +1028,7 @@ test_pmtu_vti6_link_change_mtu() { fi # Move it back, passing MTU, check MTU is not overridden - run_cmd ${ns_a} ip link set vti6_a mtu 1280 type vti6 remote ${dummy6_0_addr} local ${dummy6_0_addr} + run_cmd ${ns_a} ip link set vti6_a mtu 1280 type vti6 remote ${dummy6_0_prefix}2 local ${dummy6_0_prefix}1 mtu="$(link_get_mtu "${ns_a}" vti6_a)" if [ ${mtu} -ne 1280 ]; then err " vti6 MTU ${mtu} doesn't match configured value 1280" From 8fb44d60d4142cd2a440620cd291d346e23c131e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 May 2019 17:35:52 -0700 Subject: [PATCH 60/89] llc: fix skb leak in llc_build_and_send_ui_pkt() If llc_mac_hdr_init() returns an error, we must drop the skb since no llc_build_and_send_ui_pkt() caller will take care of this. BUG: memory leak unreferenced object 0xffff8881202b6800 (size 2048): comm "syz-executor907", pid 7074, jiffies 4294943781 (age 8.590s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 1a 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ backtrace: [<00000000e25b5abe>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<00000000e25b5abe>] slab_post_alloc_hook mm/slab.h:439 [inline] [<00000000e25b5abe>] slab_alloc mm/slab.c:3326 [inline] [<00000000e25b5abe>] __do_kmalloc mm/slab.c:3658 [inline] [<00000000e25b5abe>] __kmalloc+0x161/0x2c0 mm/slab.c:3669 [<00000000a1ae188a>] kmalloc include/linux/slab.h:552 [inline] [<00000000a1ae188a>] sk_prot_alloc+0xd6/0x170 net/core/sock.c:1608 [<00000000ded25bbe>] sk_alloc+0x35/0x2f0 net/core/sock.c:1662 [<000000002ecae075>] llc_sk_alloc+0x35/0x170 net/llc/llc_conn.c:950 [<00000000551f7c47>] llc_ui_create+0x7b/0x140 net/llc/af_llc.c:173 [<0000000029027f0e>] __sock_create+0x164/0x250 net/socket.c:1430 [<000000008bdec225>] sock_create net/socket.c:1481 [inline] [<000000008bdec225>] __sys_socket+0x69/0x110 net/socket.c:1523 [<00000000b6439228>] __do_sys_socket net/socket.c:1532 [inline] [<00000000b6439228>] __se_sys_socket net/socket.c:1530 [inline] [<00000000b6439228>] __x64_sys_socket+0x1e/0x30 net/socket.c:1530 [<00000000cec820c1>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301 [<000000000c32554f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 BUG: memory leak unreferenced object 0xffff88811d750d00 (size 224): comm "syz-executor907", pid 7074, jiffies 4294943781 (age 8.600s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 f0 0c 24 81 88 ff ff 00 68 2b 20 81 88 ff ff ...$.....h+ .... backtrace: [<0000000053026172>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<0000000053026172>] slab_post_alloc_hook mm/slab.h:439 [inline] [<0000000053026172>] slab_alloc_node mm/slab.c:3269 [inline] [<0000000053026172>] kmem_cache_alloc_node+0x153/0x2a0 mm/slab.c:3579 [<00000000fa8f3c30>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:198 [<00000000d96fdafb>] alloc_skb include/linux/skbuff.h:1058 [inline] [<00000000d96fdafb>] alloc_skb_with_frags+0x5f/0x250 net/core/skbuff.c:5327 [<000000000a34a2e7>] sock_alloc_send_pskb+0x269/0x2a0 net/core/sock.c:2225 [<00000000ee39999b>] sock_alloc_send_skb+0x32/0x40 net/core/sock.c:2242 [<00000000e034d810>] llc_ui_sendmsg+0x10a/0x540 net/llc/af_llc.c:933 [<00000000c0bc8445>] sock_sendmsg_nosec net/socket.c:652 [inline] [<00000000c0bc8445>] sock_sendmsg+0x54/0x70 net/socket.c:671 [<000000003b687167>] __sys_sendto+0x148/0x1f0 net/socket.c:1964 [<00000000922d78d9>] __do_sys_sendto net/socket.c:1976 [inline] [<00000000922d78d9>] __se_sys_sendto net/socket.c:1972 [inline] [<00000000922d78d9>] __x64_sys_sendto+0x2a/0x30 net/socket.c:1972 [<00000000cec820c1>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301 [<000000000c32554f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/llc/llc_output.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c index 94425e421213..9e4b6bcf6920 100644 --- a/net/llc/llc_output.c +++ b/net/llc/llc_output.c @@ -72,6 +72,8 @@ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac); if (likely(!rc)) rc = dev_queue_xmit(skb); + else + kfree_skb(skb); return rc; } From a6cd0d2d493ab7806b49f738b4f66362437cc09e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 27 May 2019 19:06:38 -0700 Subject: [PATCH 61/89] Documentation: net-sysfs: Remove duplicate PHY device documentation Both sysfs-bus-mdio and sysfs-class-net-phydev contain the same duplication information. There is not currently any MDIO bus specific attribute, but there are PHY device (struct phy_device) specific attributes. Use the more precise description from sysfs-bus-mdio and carry that over to sysfs-class-net-phydev. Fixes: 86f22d04dfb5 ("net: sysfs: Document PHY device sysfs attributes") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-bus-mdio | 29 ------------------- .../ABI/testing/sysfs-class-net-phydev | 19 ++++++++---- 2 files changed, 13 insertions(+), 35 deletions(-) delete mode 100644 Documentation/ABI/testing/sysfs-bus-mdio diff --git a/Documentation/ABI/testing/sysfs-bus-mdio b/Documentation/ABI/testing/sysfs-bus-mdio deleted file mode 100644 index 491baaf4285f..000000000000 --- a/Documentation/ABI/testing/sysfs-bus-mdio +++ /dev/null @@ -1,29 +0,0 @@ -What: /sys/bus/mdio_bus/devices/.../phy_id -Date: November 2012 -KernelVersion: 3.8 -Contact: netdev@vger.kernel.org -Description: - This attribute contains the 32-bit PHY Identifier as reported - by the device during bus enumeration, encoded in hexadecimal. - This ID is used to match the device with the appropriate - driver. - -What: /sys/bus/mdio_bus/devices/.../phy_interface -Date: February 2014 -KernelVersion: 3.15 -Contact: netdev@vger.kernel.org -Description: - This attribute contains the PHY interface as configured by the - Ethernet driver during bus enumeration, encoded in string. - This interface mode is used to configure the Ethernet MAC with the - appropriate mode for its data lines to the PHY hardware. - -What: /sys/bus/mdio_bus/devices/.../phy_has_fixups -Date: February 2014 -KernelVersion: 3.15 -Contact: netdev@vger.kernel.org -Description: - This attribute contains the boolean value whether a given PHY - device has had any "fixup" workaround running on it, encoded as - a boolean. This information is provided to help troubleshooting - PHY configurations. diff --git a/Documentation/ABI/testing/sysfs-class-net-phydev b/Documentation/ABI/testing/sysfs-class-net-phydev index 6ebabfb27912..2a5723343aba 100644 --- a/Documentation/ABI/testing/sysfs-class-net-phydev +++ b/Documentation/ABI/testing/sysfs-class-net-phydev @@ -11,24 +11,31 @@ Date: February 2014 KernelVersion: 3.15 Contact: netdev@vger.kernel.org Description: - Boolean value indicating whether the PHY device has - any fixups registered against it (phy_register_fixup) + This attribute contains the boolean value whether a given PHY + device has had any "fixup" workaround running on it, encoded as + a boolean. This information is provided to help troubleshooting + PHY configurations. What: /sys/class/mdio_bus///phy_id Date: November 2012 KernelVersion: 3.8 Contact: netdev@vger.kernel.org Description: - 32-bit hexadecimal value corresponding to the PHY device's OUI, - model and revision number. + This attribute contains the 32-bit PHY Identifier as reported + by the device during bus enumeration, encoded in hexadecimal. + This ID is used to match the device with the appropriate + driver. What: /sys/class/mdio_bus///phy_interface Date: February 2014 KernelVersion: 3.15 Contact: netdev@vger.kernel.org Description: - String value indicating the PHY interface, possible - values are:. + This attribute contains the PHY interface as configured by the + Ethernet driver during bus enumeration, encoded in string. + This interface mode is used to configure the Ethernet MAC with the + appropriate mode for its data lines to the PHY hardware. + Possible values are: (not available), mii, gmii, sgmii, tbi, rev-mii, rmii, rgmii, rgmii-id, rgmii-rxid, rgmii-txid, rtbi, smii xgmii, moca, qsgmii, trgmii, 1000base-x, 2500base-x, rxaui, From 8788392995e7f1ea87d4efa03c14d872b05f9f25 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 16 May 2019 11:11:11 -0700 Subject: [PATCH 62/89] net/mlx5: Fix error handling in mlx5_load() In case mlx5_core_set_hca_defaults fails, it should jump to mlx5_cleanup_fs, fix that. Fixes: c85023e153e3 ("IB/mlx5: Add raw ethernet local loopback support") Signed-off-by: Saeed Mahameed Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 61fa1d162d28..23d53163ce15 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1067,7 +1067,7 @@ static int mlx5_load(struct mlx5_core_dev *dev) err = mlx5_core_set_hca_defaults(dev); if (err) { mlx5_core_err(dev, "Failed to set hca defaults\n"); - goto err_fs; + goto err_sriov; } err = mlx5_sriov_attach(dev); From 905f6bd30bb6c244f97cdf5fb5d55cf263844490 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 9 May 2019 17:23:24 -0500 Subject: [PATCH 63/89] net/mlx5: Avoid double free of root ns in the error flow path When root ns setup for rdma, sniffer tx and sniffer rx fails, such root ns cleanup is done by the error unwinding path of mlx5_cleanup_fs(). Below call graph shows an example for sniffer_rx_root_ns. mlx5_init_fs() init_sniffer_rx_root_ns() cleanup_root_ns(steering->sniffer_rx_root_ns); mlx5_cleanup_fs() cleanup_root_ns(steering->sniffer_rx_root_ns); /* double free of sniffer_rx_root_ns */ Hence, use the existing cleanup_fs to cleanup. Fixes: d83eb50e29de3 ("net/mlx5: Add support in RDMA RX steering") Fixes: 87d22483ce68e ("net/mlx5: Add sniffer namespaces") Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/fs_core.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index d7ca7e82a832..4fa87ca63bca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2474,11 +2474,7 @@ static int init_sniffer_tx_root_ns(struct mlx5_flow_steering *steering) /* Create single prio */ prio = fs_create_prio(&steering->sniffer_tx_root_ns->ns, 0, 1); - if (IS_ERR(prio)) { - cleanup_root_ns(steering->sniffer_tx_root_ns); - return PTR_ERR(prio); - } - return 0; + return PTR_ERR_OR_ZERO(prio); } static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering) @@ -2491,11 +2487,7 @@ static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering) /* Create single prio */ prio = fs_create_prio(&steering->sniffer_rx_root_ns->ns, 0, 1); - if (IS_ERR(prio)) { - cleanup_root_ns(steering->sniffer_rx_root_ns); - return PTR_ERR(prio); - } - return 0; + return PTR_ERR_OR_ZERO(prio); } static int init_rdma_rx_root_ns(struct mlx5_flow_steering *steering) @@ -2511,11 +2503,7 @@ static int init_rdma_rx_root_ns(struct mlx5_flow_steering *steering) /* Create single prio */ prio = fs_create_prio(&steering->rdma_rx_root_ns->ns, 0, 1); - if (IS_ERR(prio)) { - cleanup_root_ns(steering->rdma_rx_root_ns); - return PTR_ERR(prio); - } - return 0; + return PTR_ERR_OR_ZERO(prio); } static int init_fdb_root_ns(struct mlx5_flow_steering *steering) { From 9414277a5df3669c67e818708c0f881597e0118e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 10 May 2019 10:26:23 -0500 Subject: [PATCH 64/89] net/mlx5: Avoid double free in fs init error unwinding path In below code flow, for ingress acl table root ns memory leads to double free. mlx5_init_fs init_ingress_acls_root_ns() init_ingress_acl_root_ns kfree(steering->esw_ingress_root_ns); /* steering->esw_ingress_root_ns is not marked NULL */ mlx5_cleanup_fs cleanup_ingress_acls_root_ns steering->esw_ingress_root_ns non NULL check passes. kfree(steering->esw_ingress_root_ns); /* double free */ Similar issue exist for other tables. Hence zero out the pointers to not process the table again. Fixes: 9b93ab981e3bf ("net/mlx5: Separate ingress/egress namespaces for each vport") Fixes: 40c3eebb49e51 ("net/mlx5: Add support in RDMA RX steering") Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 4fa87ca63bca..34276a2b6da2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2427,6 +2427,7 @@ static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev) cleanup_root_ns(steering->esw_egress_root_ns[i]); kfree(steering->esw_egress_root_ns); + steering->esw_egress_root_ns = NULL; } static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev) @@ -2441,6 +2442,7 @@ static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev) cleanup_root_ns(steering->esw_ingress_root_ns[i]); kfree(steering->esw_ingress_root_ns); + steering->esw_ingress_root_ns = NULL; } void mlx5_cleanup_fs(struct mlx5_core_dev *dev) @@ -2625,6 +2627,7 @@ cleanup_root_ns: for (i--; i >= 0; i--) cleanup_root_ns(steering->esw_egress_root_ns[i]); kfree(steering->esw_egress_root_ns); + steering->esw_egress_root_ns = NULL; return err; } @@ -2652,6 +2655,7 @@ cleanup_root_ns: for (i--; i >= 0; i--) cleanup_root_ns(steering->esw_ingress_root_ns[i]); kfree(steering->esw_ingress_root_ns); + steering->esw_ingress_root_ns = NULL; return err; } From 25fa506b70cadb580c1e9cbd836d6417276d4bcd Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 10 May 2019 10:40:08 -0500 Subject: [PATCH 65/89] net/mlx5: Allocate root ns memory using kzalloc to match kfree root ns is yet another fs core node which is freed using kfree() by tree_put_node(). Rest of the other fs core objects are also allocated using kmalloc variants. However, root ns memory is allocated using kvzalloc(). Hence allocate root ns memory using kzalloc(). Fixes: 2530236303d9e ("net/mlx5_core: Flow steering tree initialization") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 34276a2b6da2..fe76c6fd6d80 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2284,7 +2284,7 @@ static struct mlx5_flow_root_namespace cmds = mlx5_fs_cmd_get_default_ipsec_fpga_cmds(table_type); /* Create the root namespace */ - root_ns = kvzalloc(sizeof(*root_ns), GFP_KERNEL); + root_ns = kzalloc(sizeof(*root_ns), GFP_KERNEL); if (!root_ns) return NULL; From 24bcd210e272c1e5fc712fbc5e5c21f59a21abf7 Mon Sep 17 00:00:00 2001 From: wenxu Date: Wed, 15 May 2019 17:25:45 +0800 Subject: [PATCH 66/89] net/mlx5e: restrict the real_dev of vlan device is the same as uplink device When register indr block for vlan device, it should check the real_dev of vlan device is same as uplink device. Or it will set offload rule to mlx5e which will never hit. Fixes: 35a605db168c ("net/mlx5e: Offload TC e-switch rules with ingress VLAN device") Signed-off-by: wenxu Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 5283e16c69e4..9aea9c5b2ce8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -813,7 +813,7 @@ static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb, struct net_device *netdev = netdev_notifier_info_to_dev(ptr); if (!mlx5e_tc_tun_device_to_offload(priv, netdev) && - !is_vlan_dev(netdev)) + !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev)) return NOTIFY_OK; switch (event) { From c0194e2d0ef0e5ce5e21a35640d23a706827ae28 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 23 May 2019 12:55:10 -0700 Subject: [PATCH 67/89] net/mlx5e: Disable rxhash when CQE compress is enabled When CQE compression is enabled (Multi-host systems), compressed CQEs might arrive to the driver rx, compressed CQEs don't have a valid hash offload and the driver already reports a hash value of 0 and invalid hash type on the skb for compressed CQEs, but this is not good enough. On a congested PCIe, where CQE compression will kick in aggressively, gro will deliver lots of out of order packets due to the invalid hash and this might cause a serious performance drop. The only valid solution, is to disable rxhash offload at all when CQE compression is favorable (Multi-host systems). Fixes: 7219ab34f184 ("net/mlx5e: CQE compression") Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 457cc39423f2..c65cefd84eda 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3687,6 +3687,12 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev, netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n"); } + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) { + features &= ~NETIF_F_RXHASH; + if (netdev->features & NETIF_F_RXHASH) + netdev_warn(netdev, "Disabling rxhash, not supported when CQE compress is active\n"); + } + mutex_unlock(&priv->state_lock); return features; @@ -3812,6 +3818,9 @@ int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) memcpy(&priv->tstamp, &config, sizeof(config)); mutex_unlock(&priv->state_lock); + /* might need to fix some features */ + netdev_update_features(priv->netdev); + return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? -EFAULT : 0; } @@ -4680,6 +4689,10 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) if (!priv->channels.params.scatter_fcs_en) netdev->features &= ~NETIF_F_RXFCS; + /* prefere CQE compression over rxhash */ + if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS)) + netdev->features &= ~NETIF_F_RXHASH; + #define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f) if (FT_CAP(flow_modify_en) && FT_CAP(modify_root) && From 315ca92dd863fecbffc0bb52ae0ac11e0398726a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 28 May 2019 13:10:46 +0900 Subject: [PATCH 68/89] net: sh_eth: fix mdio access in sh_eth_close() for R-Car Gen2 and RZ/A1 SoCs The sh_eth_close() resets the MAC and then calls phy_stop() so that mdio read access result is incorrect without any error according to kernel trace like below: ifconfig-216 [003] .n.. 109.133124: mdio_access: ee700000.ethernet-ffffffff read phy:0x01 reg:0x00 val:0xffff According to the hardware manual, the RMII mode should be set to 1 before operation the Ethernet MAC. However, the previous code was not set to 1 after the driver issued the soft_reset in sh_eth_dev_exit() so that the mdio read access result seemed incorrect. To fix the issue, this patch adds a condition and set the RMII mode register in sh_eth_dev_exit() for R-Car Gen2 and RZ/A1 SoCs. Note that when I have tried to move the sh_eth_dev_exit() calling after phy_stop() on sh_eth_close(), but it gets worse (kernel panic happened and it seems that a register is accessed while the clock is off). Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 6354f19a31eb..7ba35a0bdb29 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -1594,6 +1594,10 @@ static void sh_eth_dev_exit(struct net_device *ndev) sh_eth_get_stats(ndev); mdp->cd->soft_reset(ndev); + /* Set the RMII mode again if required */ + if (mdp->cd->rmiimode) + sh_eth_write(ndev, 0x1, RMIIMODE); + /* Set MAC address again */ update_mac_address(ndev); } From c678726305b9425454be7c8a7624290b602602fc Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 28 May 2019 10:27:21 +0100 Subject: [PATCH 69/89] net: phylink: ensure consistent phy interface mode Ensure that we supply the same phy interface mode to mac_link_down() as we did for the corresponding mac_link_up() call. This ensures that MAC drivers that use the phy interface mode in these methods can depend on mac_link_down() always corresponding to a mac_link_up() call for the same interface mode. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 74983593834b..9044b95d2afe 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -51,6 +51,10 @@ struct phylink { /* The link configuration settings */ struct phylink_link_state link_config; + + /* The current settings */ + phy_interface_t cur_interface; + struct gpio_desc *link_gpio; struct timer_list link_poll; void (*get_fixed_state)(struct net_device *dev, @@ -446,12 +450,12 @@ static void phylink_resolve(struct work_struct *w) if (!link_state.link) { netif_carrier_off(ndev); pl->ops->mac_link_down(ndev, pl->link_an_mode, - pl->phy_state.interface); + pl->cur_interface); netdev_info(ndev, "Link is Down\n"); } else { + pl->cur_interface = link_state.interface; pl->ops->mac_link_up(ndev, pl->link_an_mode, - pl->phy_state.interface, - pl->phydev); + pl->cur_interface, pl->phydev); netif_carrier_on(ndev); From 3d3ced2ec5d71b99d72ae6910fbdf890bc2eccf0 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 28 May 2019 10:34:42 +0100 Subject: [PATCH 70/89] net: phy: marvell10g: report if the PHY fails to boot firmware Some boards do not have the PHY firmware programmed in the 3310's flash, which leads to the PHY not working as expected. Warn the user when the PHY fails to boot the firmware and refuse to initialise. Fixes: 20b2af32ff3f ("net: phy: add Marvell Alaska X 88X3310 10Gigabit PHY support") Signed-off-by: Russell King Tested-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell10g.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 238a20e13d6a..3b99882692e3 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -31,6 +31,9 @@ #define MV_PHY_ALASKA_NBT_QUIRK_REV (MARVELL_PHY_ID_88X3310 | 0xa) enum { + MV_PMA_BOOT = 0xc050, + MV_PMA_BOOT_FATAL = BIT(0), + MV_PCS_BASE_T = 0x0000, MV_PCS_BASE_R = 0x1000, MV_PCS_1000BASEX = 0x2000, @@ -213,6 +216,16 @@ static int mv3310_probe(struct phy_device *phydev) (phydev->c45_ids.devices_in_package & mmd_mask) != mmd_mask) return -ENODEV; + ret = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MV_PMA_BOOT); + if (ret < 0) + return ret; + + if (ret & MV_PMA_BOOT_FATAL) { + dev_warn(&phydev->mdio.dev, + "PHY failed to boot firmware, status=%04x\n", ret); + return -ENODEV; + } + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; From 333061b924539c0de081339643f45514f5f1c1e6 Mon Sep 17 00:00:00 2001 From: Max Uvarov Date: Tue, 28 May 2019 13:00:49 +0300 Subject: [PATCH 71/89] net: phy: dp83867: fix speed 10 in sgmii mode For supporting 10Mps speed in SGMII mode DP83867_10M_SGMII_RATE_ADAPT bit of DP83867_10M_SGMII_CFG register has to be cleared by software. That does not affect speeds 100 and 1000 so can be done on init. Signed-off-by: Max Uvarov Cc: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index fd35131a0c39..1091a625bf4c 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -30,6 +30,8 @@ #define DP83867_STRAP_STS1 0x006E #define DP83867_RGMIIDCTL 0x0086 #define DP83867_IO_MUX_CFG 0x0170 +#define DP83867_10M_SGMII_CFG 0x016F +#define DP83867_10M_SGMII_RATE_ADAPT_MASK BIT(7) #define DP83867_SW_RESET BIT(15) #define DP83867_SW_RESTART BIT(14) @@ -277,6 +279,21 @@ static int dp83867_config_init(struct phy_device *phydev) DP83867_IO_MUX_CFG_IO_IMPEDANCE_CTRL); } + if (phydev->interface == PHY_INTERFACE_MODE_SGMII) { + /* For support SPEED_10 in SGMII mode + * DP83867_10M_SGMII_RATE_ADAPT bit + * has to be cleared by software. That + * does not affect SPEED_100 and + * SPEED_1000. + */ + ret = phy_modify_mmd(phydev, DP83867_DEVADDR, + DP83867_10M_SGMII_CFG, + DP83867_10M_SGMII_RATE_ADAPT_MASK, + 0); + if (ret) + return ret; + } + /* Enable Interrupt output INT_OE in CFG3 register */ if (phy_interrupt_is_valid(phydev)) { val = phy_read(phydev, DP83867_CFG3); From 1a97a477e666cbdededab93bd3754e508f0c09d7 Mon Sep 17 00:00:00 2001 From: Max Uvarov Date: Tue, 28 May 2019 13:00:50 +0300 Subject: [PATCH 72/89] net: phy: dp83867: increase SGMII autoneg timer duration After reset SGMII Autoneg timer is set to 2us (bits 6 and 5 are 01). That is not enough to finalize autonegatiation on some devices. Increase this timer duration to maximum supported 16ms. Signed-off-by: Max Uvarov Cc: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 1091a625bf4c..14e9e8a94639 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -26,6 +26,12 @@ /* Extended Registers */ #define DP83867_CFG4 0x0031 +#define DP83867_CFG4_SGMII_ANEG_MASK (BIT(5) | BIT(6)) +#define DP83867_CFG4_SGMII_ANEG_TIMER_11MS (3 << 5) +#define DP83867_CFG4_SGMII_ANEG_TIMER_800US (2 << 5) +#define DP83867_CFG4_SGMII_ANEG_TIMER_2US (1 << 5) +#define DP83867_CFG4_SGMII_ANEG_TIMER_16MS (0 << 5) + #define DP83867_RGMIICTL 0x0032 #define DP83867_STRAP_STS1 0x006E #define DP83867_RGMIIDCTL 0x0086 @@ -292,6 +298,18 @@ static int dp83867_config_init(struct phy_device *phydev) 0); if (ret) return ret; + + /* After reset SGMII Autoneg timer is set to 2us (bits 6 and 5 + * are 01). That is not enough to finalize autoneg on some + * devices. Increase this timer duration to maximum 16ms. + */ + ret = phy_modify_mmd(phydev, DP83867_DEVADDR, + DP83867_CFG4, + DP83867_CFG4_SGMII_ANEG_MASK, + DP83867_CFG4_SGMII_ANEG_TIMER_16MS); + + if (ret) + return ret; } /* Enable Interrupt output INT_OE in CFG3 register */ From c8081fc397fa04675e410900693a57085ef4b760 Mon Sep 17 00:00:00 2001 From: Max Uvarov Date: Tue, 28 May 2019 13:00:51 +0300 Subject: [PATCH 73/89] net: phy: dp83867: do not call config_init twice Phy state machine calls _config_init just after reset. Signed-off-by: Max Uvarov Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 14e9e8a94639..1ec48ecf4133 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -342,7 +342,7 @@ static int dp83867_phy_reset(struct phy_device *phydev) usleep_range(10, 20); - return dp83867_config_init(phydev); + return 0; } static struct phy_driver dp83867_driver[] = { From 2b892649254fec01678c64f16427622b41fa27f4 Mon Sep 17 00:00:00 2001 From: Max Uvarov Date: Tue, 28 May 2019 13:00:52 +0300 Subject: [PATCH 74/89] net: phy: dp83867: Set up RGMII TX delay PHY_INTERFACE_MODE_RGMII_RXID is less then TXID so code to set tx delay is never called. Fixes: 2a10154abcb75 ("net: phy: dp83867: Add TI dp83867 phy") Signed-off-by: Max Uvarov Cc: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 1ec48ecf4133..c71c7d0f53f0 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -255,10 +255,8 @@ static int dp83867_config_init(struct phy_device *phydev) ret = phy_write(phydev, MII_DP83867_PHYCTRL, val); if (ret) return ret; - } - if ((phydev->interface >= PHY_INTERFACE_MODE_RGMII_ID) && - (phydev->interface <= PHY_INTERFACE_MODE_RGMII_RXID)) { + /* Set up RGMII delays */ val = phy_read_mmd(phydev, DP83867_DEVADDR, DP83867_RGMIICTL); if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID) From f2696099c6c619aec4fe2b9691f0a81429957e65 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Tue, 28 May 2019 12:22:54 -0600 Subject: [PATCH 75/89] udp: Avoid post-GRO UDP checksum recalculation Currently, when resegmenting an unexpected UDP GRO packet, the full UDP checksum will be calculated for every new SKB created by skb_segment() because the netdev features passed in by udp_rcv_segment() lack any information about checksum offload capabilities. Usually, we have no need to perform this calculation again, as 1) The GRO implementation guarantees that any packets making it to the udp_rcv_segment() function had correct checksums, and, more importantly, 2) Upon the successful return of udp_rcv_segment(), we immediately pull the UDP header off and either queue the segment to the socket or hand it off to a new protocol handler. Unless userspace has set the IP_CHECKSUM sockopt to indicate that they want the final checksum values, we can pass the needed netdev feature flags to __skb_gso_segment() to avoid checksumming each segment in skb_segment(). Fixes: cf329aa42b66 ("udp: cope with UDP GRO packet misdirection") Cc: Paolo Abeni Cc: Subash Abhinov Kasiviswanathan Signed-off-by: Sean Tranchetti Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/net/udp.h b/include/net/udp.h index d8ce937bc395..dbe030da20a1 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -471,12 +471,19 @@ void udpv6_encap_enable(void); static inline struct sk_buff *udp_rcv_segment(struct sock *sk, struct sk_buff *skb, bool ipv4) { + netdev_features_t features = NETIF_F_SG; struct sk_buff *segs; + /* Avoid csum recalculation by skb_segment unless userspace explicitly + * asks for the final checksum values + */ + if (!inet_get_convert_csum(sk)) + features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + /* the GSO CB lays after the UDP one, no need to save and restore any * CB fragment */ - segs = __skb_gso_segment(skb, NETIF_F_SG, false); + segs = __skb_gso_segment(skb, features, false); if (unlikely(IS_ERR_OR_NULL(segs))) { int segs_nr = skb_shinfo(skb)->gso_segs; From 996ed04741467f6d1552440c92988b132a9487ec Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 28 May 2019 11:47:30 -0700 Subject: [PATCH 76/89] netvsc: unshare skb in VF rx handler The netvsc VF skb handler should make sure that skb is not shared. Similar logic already exists in bonding and team device drivers. This is not an issue in practice because the VF devicex does not send up shared skb's. But the netvsc driver should do the right thing if it did. Fixes: 0c195567a8f6 ("netvsc: transparent VF management") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 06393b215102..9873b8679f81 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2000,6 +2000,12 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb) struct netvsc_vf_pcpu_stats *pcpu_stats = this_cpu_ptr(ndev_ctx->vf_stats); + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return RX_HANDLER_CONSUMED; + + *pskb = skb; + skb->dev = ndev; u64_stats_update_begin(&pcpu_stats->syncp); From 458bf2f224f04a513b0be972f8708e78ee2c986e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 28 May 2019 11:47:31 -0700 Subject: [PATCH 77/89] net: core: support XDP generic on stacked devices. When a device is stacked like (team, bonding, failsafe or netvsc) the XDP generic program for the parent device was not called. Move the call to XDP generic inside __netif_receive_skb_core where it can be done multiple times for stacked case. Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 58 +++++++++++--------------------------------------- 1 file changed, 12 insertions(+), 46 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index b6b8505cfb3e..cc2a4e257324 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4502,23 +4502,6 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); - if (static_branch_unlikely(&generic_xdp_needed_key)) { - int ret; - - preempt_disable(); - rcu_read_lock(); - ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - rcu_read_unlock(); - preempt_enable(); - - /* Consider XDP consuming the packet a success from - * the netdev point of view we do not want to count - * this as an error. - */ - if (ret != XDP_PASS) - return NET_RX_SUCCESS; - } - #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4858,6 +4841,18 @@ another_round: __this_cpu_inc(softnet_data.processed); + if (static_branch_unlikely(&generic_xdp_needed_key)) { + int ret2; + + preempt_disable(); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + preempt_enable(); + + if (ret2 != XDP_PASS) + return NET_RX_DROP; + skb_reset_mac_len(skb); + } + if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = skb_vlan_untag(skb); @@ -5178,19 +5173,6 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - if (static_branch_unlikely(&generic_xdp_needed_key)) { - int ret; - - preempt_disable(); - rcu_read_lock(); - ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - rcu_read_unlock(); - preempt_enable(); - - if (ret != XDP_PASS) - return NET_RX_DROP; - } - rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { @@ -5211,7 +5193,6 @@ static int netif_receive_skb_internal(struct sk_buff *skb) static void netif_receive_skb_list_internal(struct list_head *head) { - struct bpf_prog *xdp_prog = NULL; struct sk_buff *skb, *next; struct list_head sublist; @@ -5224,21 +5205,6 @@ static void netif_receive_skb_list_internal(struct list_head *head) } list_splice_init(&sublist, head); - if (static_branch_unlikely(&generic_xdp_needed_key)) { - preempt_disable(); - rcu_read_lock(); - list_for_each_entry_safe(skb, next, head, list) { - xdp_prog = rcu_dereference(skb->dev->xdp_prog); - skb_list_del_init(skb); - if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) - list_add_tail(&skb->list, &sublist); - } - rcu_read_unlock(); - preempt_enable(); - /* Put passed packets back on main list */ - list_splice_init(&sublist, head); - } - rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { From 59715171fbd0172a579576f46821031800a63bc5 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 29 May 2019 07:44:01 +0200 Subject: [PATCH 78/89] r8169: fix MAC address being lost in PCI D3 (At least) RTL8168e forgets its MAC address in PCI D3. To fix this set the MAC address when resuming. For resuming from runtime-suspend we had this in place already, for resuming from S3/S5 it was missing. The commit referenced as being fixed isn't wrong, it's just the first one where the patch applies cleanly. Fixes: 0f07bd850d36 ("r8169: use dev_get_drvdata where possible") Signed-off-by: Heiner Kallweit Reported-by: Albert Astals Cid Tested-by: Albert Astals Cid Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 8e404186ef87..d06a61f00e78 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6722,6 +6722,8 @@ static int rtl8169_resume(struct device *device) struct net_device *dev = dev_get_drvdata(device); struct rtl8169_private *tp = netdev_priv(dev); + rtl_rar_set(tp, dev->dev_addr); + clk_prepare_enable(tp->clk); if (netif_running(dev)) @@ -6755,6 +6757,7 @@ static int rtl8169_runtime_resume(struct device *device) { struct net_device *dev = dev_get_drvdata(device); struct rtl8169_private *tp = netdev_priv(dev); + rtl_rar_set(tp, dev->dev_addr); if (!tp->TxDescArray) From 84b3fd1fc9592d431e23b077e692fa4e3fd0f086 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 29 May 2019 07:02:11 +0000 Subject: [PATCH 79/89] net: dsa: mv88e6xxx: fix handling of upper half of STATS_TYPE_PORT Currently, the upper half of a 4-byte STATS_TYPE_PORT statistic ends up in bits 47:32 of the return value, instead of bits 31:16 as they should. Fixes: 6e46e2d821bb ("net: dsa: mv88e6xxx: Fix u64 statistics") Signed-off-by: Rasmus Villemoes Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 28414db979b0..12f165a71a6c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -785,7 +785,7 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip, err = mv88e6xxx_port_read(chip, port, s->reg + 1, ®); if (err) return U64_MAX; - high = reg; + low |= ((u32)reg) << 16; } break; case STATS_TYPE_BANK1: From ef74422020aa8c224b00a927e3e47faac4d8fae3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 29 May 2019 10:59:44 +0300 Subject: [PATCH 80/89] mlxsw: spectrum_acl: Avoid warning after identical rules insertion When identical rules are inserted, the latter one goes to C-TCAM. For that, a second eRP with the same mask is created. These 2 eRPs by the nature cannot be merged and also one cannot be parent of another. Teach mlxsw_sp_acl_erp_delta_fill() about this possibility and handle it gracefully. Reported-by: Alex Kushnarov Fixes: c22291f7cf45 ("mlxsw: spectrum: acl: Implement delta for ERP") Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c index c1a9cc9a3292..4c98950380d5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c @@ -1171,13 +1171,12 @@ mlxsw_sp_acl_erp_delta_fill(const struct mlxsw_sp_acl_erp_key *parent_key, return -EINVAL; } if (si == -1) { - /* The masks are the same, this cannot happen. - * That means the caller is broken. + /* The masks are the same, this can happen in case eRPs with + * the same mask were created in both A-TCAM and C-TCAM. + * The only possible condition under which this can happen + * is identical rule insertion. Delta is not possible here. */ - WARN_ON(1); - *delta_start = 0; - *delta_mask = 0; - return 0; + return -EINVAL; } pmask = (unsigned char) parent_key->mask[__MASK_IDX(si)]; mask = (unsigned char) key->mask[__MASK_IDX(si)]; From 275e928f19117d22f6d26dee94548baf4041b773 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 29 May 2019 10:59:45 +0300 Subject: [PATCH 81/89] mlxsw: spectrum: Prevent force of 56G Force of 56G is not supported by hardware in Ethernet devices. This configuration fails with a bad parameter error from firmware. Add check of this case. Instead of trying to set 56G with autoneg off, return a meaningful error. Fixes: 56ade8fe3fe1 ("mlxsw: spectrum: Add initial support for Spectrum ASIC") Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index dbb425717f5e..dfe6b44baf63 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3128,6 +3128,10 @@ mlxsw_sp_port_set_link_ksettings(struct net_device *dev, ops->reg_ptys_eth_unpack(mlxsw_sp, ptys_pl, ð_proto_cap, NULL, NULL); autoneg = cmd->base.autoneg == AUTONEG_ENABLE; + if (!autoneg && cmd->base.speed == SPEED_56000) { + netdev_err(dev, "56G not supported with autoneg off\n"); + return -EINVAL; + } eth_proto_new = autoneg ? ops->to_ptys_advert_link(mlxsw_sp, cmd) : ops->to_ptys_speed(mlxsw_sp, cmd->base.speed); From 9609dad263f8bea347f41fddca29353dbf8a7d37 Mon Sep 17 00:00:00 2001 From: Young Xiao <92siuyang@gmail.com> Date: Wed, 29 May 2019 16:10:59 +0800 Subject: [PATCH 82/89] ipv4: tcp_input: fix stack out of bounds when parsing TCP options. The TCP option parsing routines in tcp_parse_options function could read one byte out of the buffer of the TCP options. 1 while (length > 0) { 2 int opcode = *ptr++; 3 int opsize; 4 5 switch (opcode) { 6 case TCPOPT_EOL: 7 return; 8 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 9 length--; 10 continue; 11 default: 12 opsize = *ptr++; //out of bound access If length = 1, then there is an access in line2. And another access is occurred in line 12. This would lead to out-of-bound access. Therefore, in the patch we check that the available data length is larger enough to pase both TCP option code and size. Signed-off-by: Young Xiao <92siuyang@gmail.com> Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c61edd023b35..08a477e74cf3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3791,6 +3791,8 @@ void tcp_parse_options(const struct net *net, length--; continue; default: + if (length < 2) + return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; From 21808437214637952b61beaba6034d97880fbeb3 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 29 May 2019 15:59:48 +0200 Subject: [PATCH 83/89] net: mvpp2: fix bad MVPP2_TXQ_SCHED_TOKEN_CNTR_REG queue value MVPP2_TXQ_SCHED_TOKEN_CNTR_REG() expects the logical queue id but the current code is passing the global tx queue offset, so it ends up writing to unknown registers (between 0x8280 and 0x82fc, which seemed to be unused by the hardware). This fixes the issue by using the logical queue id instead. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index d38952eb7aa9..7a67e23a2c2b 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -1455,7 +1455,7 @@ static inline void mvpp2_xlg_max_rx_size_set(struct mvpp2_port *port) /* Set defaults to the MVPP2 port */ static void mvpp2_defaults_set(struct mvpp2_port *port) { - int tx_port_num, val, queue, ptxq, lrxq; + int tx_port_num, val, queue, lrxq; if (port->priv->hw_version == MVPP21) { /* Update TX FIFO MIN Threshold */ @@ -1476,11 +1476,9 @@ static void mvpp2_defaults_set(struct mvpp2_port *port) mvpp2_write(port->priv, MVPP2_TXP_SCHED_FIXED_PRIO_REG, 0); /* Close bandwidth for all queues */ - for (queue = 0; queue < MVPP2_MAX_TXQ; queue++) { - ptxq = mvpp2_txq_phys(port->id, queue); + for (queue = 0; queue < MVPP2_MAX_TXQ; queue++) mvpp2_write(port->priv, - MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(ptxq), 0); - } + MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(queue), 0); /* Set refill period to 1 usec, refill tokens * and bucket size to maximum @@ -2336,7 +2334,7 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port, txq->descs_dma = 0; /* Set minimum bandwidth for disabled TXQs */ - mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->id), 0); + mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->log_id), 0); /* Set Tx descriptors queue starting address and size */ thread = mvpp2_cpu_to_thread(port->priv, get_cpu()); From d34d2baa9173f6e0c0f22d005d18e83d1cb54d8d Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 30 May 2019 00:42:30 +0300 Subject: [PATCH 84/89] net: dsa: tag_8021q: Change order of rx_vid setup The 802.1Q tagging performs an unbalanced setup in terms of RX VIDs on the CPU port. For the ingress path of a 802.1Q switch to work, the RX VID of a port needs to be seen as tagged egress on the CPU port. While configuring the other front-panel ports to be part of this VID, for bridge scenarios, the untagged flag is applied even on the CPU port in dsa_switch_vlan_add. This happens because DSA applies the same flags on the CPU port as on the (bridge-controlled) slave ports, and the effect in this case is that the CPU port tagged settings get deleted. Instead of fixing DSA by introducing a way to control VLAN flags on the CPU port (and hence stop inheriting from the slave ports) - a hard, perhaps intractable problem - avoid this situation by moving the setup part of the RX VID on the CPU port after all the other front-panel ports have been added to the VID. Fixes: f9bbe4477c30 ("net: dsa: Optional VLAN-based port separation for switches without tagging") Signed-off-by: Ioana Ciornei Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_8021q.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 8ae48c7e1e76..4adec6bbfe59 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -128,10 +128,7 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) u16 flags; if (i == upstream) - /* CPU port needs to see this port's RX VID - * as tagged egress. - */ - flags = 0; + continue; else if (i == port) /* The RX VID is pvid on this port */ flags = BRIDGE_VLAN_INFO_UNTAGGED | @@ -150,6 +147,20 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) return err; } } + + /* CPU port needs to see this port's RX VID + * as tagged egress. + */ + if (enabled) + err = dsa_port_vid_add(upstream_dp, rx_vid, 0); + else + err = dsa_port_vid_del(upstream_dp, rx_vid); + if (err) { + dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n", + rx_vid, port, err); + return err; + } + /* Finally apply the TX VID on this port and on the CPU port */ if (enabled) err = dsa_port_vid_add(dp, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED); From 0471dd429cea6507a4000169ff6a33f41ba371b3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 30 May 2019 00:42:31 +0300 Subject: [PATCH 85/89] net: dsa: tag_8021q: Create a stable binary format Tools like tcpdump need to be able to decode the significance of fake VLAN headers that DSA uses to separate switch ports. But currently these have no global significance - they are simply an ordered list of DSA_MAX_SWITCHES x DSA_MAX_PORTS numbers ending at 4095. The reason why this is submitted as a fix is that the existing mapping of VIDs should not enter into a stable kernel, so we can pretend that only the new format exists. This way tcpdump won't need to try to make something out of the VLAN tags on 5.2 kernels. Fixes: f9bbe4477c30 ("net: dsa: Optional VLAN-based port separation for switches without tagging") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_8021q.c | 60 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 4adec6bbfe59..65a35e976d7b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -11,20 +11,59 @@ #include "dsa_priv.h" -/* Allocating two VLAN tags per port - one for the RX VID and - * the other for the TX VID - see below +/* Binary structure of the fake 12-bit VID field (when the TPID is + * ETH_P_DSA_8021Q): + * + * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | + * +-----------+-----+-----------------+-----------+-----------------------+ + * | DIR | RSV | SWITCH_ID | RSV | PORT | + * +-----------+-----+-----------------+-----------+-----------------------+ + * + * DIR - VID[11:10]: + * Direction flags. + * * 1 (0b01) for RX VLAN, + * * 2 (0b10) for TX VLAN. + * These values make the special VIDs of 0, 1 and 4095 to be left + * unused by this coding scheme. + * + * RSV - VID[9]: + * To be used for further expansion of SWITCH_ID or for other purposes. + * + * SWITCH_ID - VID[8:6]: + * Index of switch within DSA tree. Must be between 0 and + * DSA_MAX_SWITCHES - 1. + * + * RSV - VID[5:4]: + * To be used for further expansion of PORT or for other purposes. + * + * PORT - VID[3:0]: + * Index of switch port. Must be between 0 and DSA_MAX_PORTS - 1. */ -#define DSA_8021Q_VID_RANGE (DSA_MAX_SWITCHES * DSA_MAX_PORTS) -#define DSA_8021Q_VID_BASE (VLAN_N_VID - 2 * DSA_8021Q_VID_RANGE - 1) -#define DSA_8021Q_RX_VID_BASE (DSA_8021Q_VID_BASE) -#define DSA_8021Q_TX_VID_BASE (DSA_8021Q_VID_BASE + DSA_8021Q_VID_RANGE) + +#define DSA_8021Q_DIR_SHIFT 10 +#define DSA_8021Q_DIR_MASK GENMASK(11, 10) +#define DSA_8021Q_DIR(x) (((x) << DSA_8021Q_DIR_SHIFT) & \ + DSA_8021Q_DIR_MASK) +#define DSA_8021Q_DIR_RX DSA_8021Q_DIR(1) +#define DSA_8021Q_DIR_TX DSA_8021Q_DIR(2) + +#define DSA_8021Q_SWITCH_ID_SHIFT 6 +#define DSA_8021Q_SWITCH_ID_MASK GENMASK(8, 6) +#define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ + DSA_8021Q_SWITCH_ID_MASK) + +#define DSA_8021Q_PORT_SHIFT 0 +#define DSA_8021Q_PORT_MASK GENMASK(3, 0) +#define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ + DSA_8021Q_PORT_MASK) /* Returns the VID to be inserted into the frame from xmit for switch steering * instructions on egress. Encodes switch ID and port ID. */ u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port) { - return DSA_8021Q_TX_VID_BASE + (DSA_MAX_PORTS * ds->index) + port; + return DSA_8021Q_DIR_TX | DSA_8021Q_SWITCH_ID(ds->index) | + DSA_8021Q_PORT(port); } EXPORT_SYMBOL_GPL(dsa_8021q_tx_vid); @@ -33,21 +72,22 @@ EXPORT_SYMBOL_GPL(dsa_8021q_tx_vid); */ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) { - return DSA_8021Q_RX_VID_BASE + (DSA_MAX_PORTS * ds->index) + port; + return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | + DSA_8021Q_PORT(port); } EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) { - return ((vid - DSA_8021Q_RX_VID_BASE) / DSA_MAX_PORTS); + return (vid & DSA_8021Q_SWITCH_ID_MASK) >> DSA_8021Q_SWITCH_ID_SHIFT; } EXPORT_SYMBOL_GPL(dsa_8021q_rx_switch_id); /* Returns the decoded port ID from the RX VID. */ int dsa_8021q_rx_source_port(u16 vid) { - return ((vid - DSA_8021Q_RX_VID_BASE) % DSA_MAX_PORTS); + return (vid & DSA_8021Q_PORT_MASK) >> DSA_8021Q_PORT_SHIFT; } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); From a4270d6795b0580287453ea55974d948393e66ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 May 2019 15:36:10 -0700 Subject: [PATCH 86/89] net-gro: fix use-after-free read in napi_gro_frags() If a network driver provides to napi_gro_frags() an skb with a page fragment of exactly 14 bytes, the call to gro_pull_from_frag0() will 'consume' the fragment by calling skb_frag_unref(skb, 0), and the page might be freed and reused. Reading eth->h_proto at the end of napi_frags_skb() might read mangled data, or crash under specific debugging features. BUG: KASAN: use-after-free in napi_frags_skb net/core/dev.c:5833 [inline] BUG: KASAN: use-after-free in napi_gro_frags+0xc6f/0xd10 net/core/dev.c:5841 Read of size 2 at addr ffff88809366840c by task syz-executor599/8957 CPU: 1 PID: 8957 Comm: syz-executor599 Not tainted 5.2.0-rc1+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 kasan_report+0x12/0x20 mm/kasan/common.c:614 __asan_report_load_n_noabort+0xf/0x20 mm/kasan/generic_report.c:142 napi_frags_skb net/core/dev.c:5833 [inline] napi_gro_frags+0xc6f/0xd10 net/core/dev.c:5841 tun_get_user+0x2f3c/0x3ff0 drivers/net/tun.c:1991 tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2037 call_write_iter include/linux/fs.h:1872 [inline] do_iter_readv_writev+0x5f8/0x8f0 fs/read_write.c:693 do_iter_write fs/read_write.c:970 [inline] do_iter_write+0x184/0x610 fs/read_write.c:951 vfs_writev+0x1b3/0x2f0 fs/read_write.c:1015 do_writev+0x15b/0x330 fs/read_write.c:1058 Fixes: a50e233c50db ("net-gro: restore frag0 optimization") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index cc2a4e257324..66f7508825bd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5775,7 +5775,6 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) skb_reset_mac_header(skb); skb_gro_reset_offset(skb); - eth = skb_gro_header_fast(skb, 0); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { @@ -5785,6 +5784,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) return NULL; } } else { + eth = (const struct ethhdr *)skb->data; gro_pull_from_frag0(skb, hlen); NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; From 2b81f8161dfeda4017cef4f2498ccb64b13f0d61 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 May 2019 16:33:23 -0700 Subject: [PATCH 87/89] net: don't clear sock->sk early to avoid trouble in strparser af_inet sets sock->sk to NULL which trips strparser over: BUG: kernel NULL pointer dereference, address: 0000000000000012 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 7 PID: 0 Comm: swapper/7 Not tainted 5.2.0-rc1-00139-g14629453a6d3 #21 RIP: 0010:tcp_peek_len+0x10/0x60 RSP: 0018:ffffc02e41c54b98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff9cf924c4e030 RCX: 0000000000000051 RDX: 0000000000000000 RSI: 000000000000000c RDI: ffff9cf97128f480 RBP: ffff9cf9365e0300 R08: ffff9cf94fe7d2c0 R09: 0000000000000000 R10: 000000000000036b R11: ffff9cf939735e00 R12: ffff9cf91ad9ae40 R13: ffff9cf924c4e000 R14: ffff9cf9a8fcbaae R15: 0000000000000020 FS: 0000000000000000(0000) GS:ffff9cf9af7c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000012 CR3: 000000013920a003 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: strp_data_ready+0x48/0x90 tls_data_ready+0x22/0xd0 [tls] tcp_rcv_established+0x569/0x620 tcp_v4_do_rcv+0x127/0x1e0 tcp_v4_rcv+0xad7/0xbf0 ip_protocol_deliver_rcu+0x2c/0x1c0 ip_local_deliver_finish+0x41/0x50 ip_local_deliver+0x6b/0xe0 ? ip_protocol_deliver_rcu+0x1c0/0x1c0 ip_rcv+0x52/0xd0 ? ip_rcv_finish_core.isra.20+0x380/0x380 __netif_receive_skb_one_core+0x7e/0x90 netif_receive_skb_internal+0x42/0xf0 napi_gro_receive+0xed/0x150 nfp_net_poll+0x7a2/0xd30 [nfp] ? kmem_cache_free_bulk+0x286/0x310 net_rx_action+0x149/0x3b0 __do_softirq+0xe3/0x30a ? handle_irq_event_percpu+0x6a/0x80 irq_exit+0xe8/0xf0 do_IRQ+0x85/0xd0 common_interrupt+0xf/0xf RIP: 0010:cpuidle_enter_state+0xbc/0x450 To avoid this issue set sock->sk after sk_prot->close. My grepping and testing did not discover any code which would depend on the current behaviour. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Reported-by: David Beckett Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5183a2daba64..aff93e7cdb31 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -428,8 +428,8 @@ int inet_release(struct socket *sock) if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) timeout = sk->sk_lingertime; - sock->sk = NULL; sk->sk_prot->close(sk, timeout); + sock->sk = NULL; } return 0; } From b73484b2fc0d0ba84a13e9d86eb4adcae718163b Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 30 May 2019 16:08:40 +0200 Subject: [PATCH 88/89] ethtool: Check for vlan etype or vlan tci when parsing flow_rule When parsing an ethtool flow spec to build a flow_rule, the code checks if both the vlan etype and the vlan tci are specified by the user to add a FLOW_DISSECTOR_KEY_VLAN match. However, when the user only specified a vlan etype or a vlan tci, this check silently ignores these parameters. For example, the following rule : ethtool -N eth0 flow-type udp4 vlan 0x0010 action -1 loc 0 will result in no error being issued, but the equivalent rule will be created and passed to the NIC driver : ethtool -N eth0 flow-type udp4 action -1 loc 0 In the end, neither the NIC driver using the rule nor the end user have a way to know that these keys were dropped along the way, or that incorrect parameters were entered. This kind of check should be left to either the driver, or the ethtool flow spec layer. This commit makes so that ethtool parameters are forwarded as-is to the NIC driver. Since none of the users of ethtool_rx_flow_rule_create are using the VLAN dissector, I don't think this qualifies as a regression. Fixes: eca4205f9ec3 ("ethtool: add ethtool_rx_flow_spec to flow_rule structure translator") Signed-off-by: Maxime Chevallier Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4a593853cbf2..43e9add58340 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -3010,11 +3010,12 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; - if (ext_m_spec->vlan_etype && - ext_m_spec->vlan_tci) { + if (ext_m_spec->vlan_etype) { match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; + } + if (ext_m_spec->vlan_tci) { match->key.vlan.vlan_id = ntohs(ext_h_spec->vlan_tci) & 0x0fff; match->mask.vlan.vlan_id = @@ -3024,7 +3025,10 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; match->mask.vlan.vlan_priority = (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; + } + if (ext_m_spec->vlan_etype || + ext_m_spec->vlan_tci) { match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN); match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = From 100f6d8e09905c59be45b6316f8f369c0be1b2d8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 30 May 2019 18:01:21 -0400 Subject: [PATCH 89/89] net: correct zerocopy refcnt with udp MSG_MORE TCP zerocopy takes a uarg reference for every skb, plus one for the tcp_sendmsg_locked datapath temporarily, to avoid reaching refcnt zero as it builds, sends and frees skbs inside its inner loop. UDP and RAW zerocopy do not send inside the inner loop so do not need the extra sock_zerocopy_get + sock_zerocopy_put pair. Commit 52900d22288ed ("udp: elide zerocopy operation in hot path") introduced extra_uref to pass the initial reference taken in sock_zerocopy_alloc to the first generated skb. But, sock_zerocopy_realloc takes this extra reference at the start of every call. With MSG_MORE, no new skb may be generated to attach the extra_uref to, so refcnt is incorrectly 2 with only one skb. Do not take the extra ref if uarg && !tcp, which implies MSG_MORE. Update extra_uref accordingly. This conditional assignment triggers a false positive may be used uninitialized warning, so have to initialize extra_uref at define. Changes v1->v2: fix typo in Fixes SHA1 Fixes: 52900d22288e7 ("udp: elide zerocopy operation in hot path") Reported-by: syzbot Diagnosed-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 +++++- net/ipv4/ip_output.c | 4 ++-- net/ipv6/ip6_output.c | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e89be6282693..eaad23f9c7b5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1036,7 +1036,11 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, uarg->len++; uarg->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); - sock_zerocopy_get(uarg); + + /* no extra ref when appending to datagram (MSG_MORE) */ + if (sk->sk_type == SOCK_STREAM) + sock_zerocopy_get(uarg); + return uarg; } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bfd0ca554977..8c9189a41b13 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -878,7 +878,7 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref; + bool paged, extra_uref = false; u32 tskey = 0; skb = skb_peek_tail(queue); @@ -918,7 +918,7 @@ static int __ip_append_data(struct sock *sk, uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; - extra_uref = true; + extra_uref = !skb; /* only extra ref if !MSG_MORE */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index adef2236abe2..f9e43323e667 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1275,7 +1275,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref; + bool paged, extra_uref = false; skb = skb_peek_tail(queue); if (!skb) { @@ -1344,7 +1344,7 @@ emsgsize: uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; - extra_uref = true; + extra_uref = !skb; /* only extra ref if !MSG_MORE */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true;