From 8f4d19aacb64f2b3d65c8cf7974c3d153224b5f2 Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Wed, 30 May 2018 10:29:31 +0800
Subject: [PATCH 01/81] netfilter: xt_CT: Reject the non-null terminated string
 from user space

The helper and timeout strings are from user-space, we need to make
sure they are null terminated. If not, evil user could make kernel
read the unexpected memory, even print it when fail to find by the
following codes.

pr_info_ratelimited("No such helper \"%s\"\n", helper_name);

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_CT.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 8790190c6feb..03b9a50ec93b 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -245,12 +245,22 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	}
 
 	if (info->helper[0]) {
+		if (strnlen(info->helper, sizeof(info->helper)) == sizeof(info->helper)) {
+			ret = -ENAMETOOLONG;
+			goto err3;
+		}
+
 		ret = xt_ct_set_helper(ct, info->helper, par);
 		if (ret < 0)
 			goto err3;
 	}
 
 	if (info->timeout[0]) {
+		if (strnlen(info->timeout, sizeof(info->timeout)) == sizeof(info->timeout)) {
+			ret = -ENAMETOOLONG;
+			goto err4;
+		}
+
 		ret = xt_ct_set_timeout(ct, par, info->timeout);
 		if (ret < 0)
 			goto err4;

From 9c7f96fd77b0dbe1fe7ed1f9c462c45dc48a1076 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Thu, 31 May 2018 19:53:33 +0300
Subject: [PATCH 02/81] netfilter: nf_tables: check msg_type before
 nft_trans_set(trans)

The patch moves the "trans->msg_type == NFT_MSG_NEWSET" check before
using nft_trans_set(trans). Otherwise we can get out of bounds read.

For example, KASAN reported the one when running 0001_cache_handling_0 nft
test. In this case "trans->msg_type" was NFT_MSG_NEWTABLE:

[75517.177808] BUG: KASAN: slab-out-of-bounds in nft_set_lookup_global+0x22f/0x270 [nf_tables]
[75517.279094] Read of size 8 at addr ffff881bdb643fc8 by task nft/7356
...
[75517.375605] CPU: 26 PID: 7356 Comm: nft Tainted: G  E   4.17.0-rc7.1.x86_64 #1
[75517.489587] Hardware name: Oracle Corporation SUN SERVER X4-2
[75517.618129] Call Trace:
[75517.648821]  dump_stack+0xd1/0x13b
[75517.691040]  ? show_regs_print_info+0x5/0x5
[75517.742519]  ? kmsg_dump_rewind_nolock+0xf5/0xf5
[75517.799300]  ? lock_acquire+0x143/0x310
[75517.846738]  print_address_description+0x85/0x3a0
[75517.904547]  kasan_report+0x18d/0x4b0
[75517.949892]  ? nft_set_lookup_global+0x22f/0x270 [nf_tables]
[75518.019153]  ? nft_set_lookup_global+0x22f/0x270 [nf_tables]
[75518.088420]  ? nft_set_lookup_global+0x22f/0x270 [nf_tables]
[75518.157689]  nft_set_lookup_global+0x22f/0x270 [nf_tables]
[75518.224869]  nf_tables_newsetelem+0x1a5/0x5d0 [nf_tables]
[75518.291024]  ? nft_add_set_elem+0x2280/0x2280 [nf_tables]
[75518.357154]  ? nla_parse+0x1a5/0x300
[75518.401455]  ? kasan_kmalloc+0xa6/0xd0
[75518.447842]  nfnetlink_rcv+0xc43/0x1bdf [nfnetlink]
[75518.507743]  ? nfnetlink_rcv+0x7a5/0x1bdf [nfnetlink]
[75518.569745]  ? nfnl_err_reset+0x3c0/0x3c0 [nfnetlink]
[75518.631711]  ? lock_acquire+0x143/0x310
[75518.679133]  ? netlink_deliver_tap+0x9b/0x1070
[75518.733840]  ? kasan_unpoison_shadow+0x31/0x40
[75518.788542]  netlink_unicast+0x45d/0x680
[75518.837111]  ? __isolate_free_page+0x890/0x890
[75518.891913]  ? netlink_attachskb+0x6b0/0x6b0
[75518.944542]  netlink_sendmsg+0x6fa/0xd30
[75518.993107]  ? netlink_unicast+0x680/0x680
[75519.043758]  ? netlink_unicast+0x680/0x680
[75519.094402]  sock_sendmsg+0xd9/0x160
[75519.138810]  ___sys_sendmsg+0x64d/0x980
[75519.186234]  ? copy_msghdr_from_user+0x350/0x350
[75519.243118]  ? lock_downgrade+0x650/0x650
[75519.292738]  ? do_raw_spin_unlock+0x5d/0x250
[75519.345456]  ? _raw_spin_unlock+0x24/0x30
[75519.395065]  ? __handle_mm_fault+0xbde/0x3410
[75519.448830]  ? sock_setsockopt+0x3d2/0x1940
[75519.500516]  ? __lock_acquire.isra.25+0xdc/0x19d0
[75519.558448]  ? lock_downgrade+0x650/0x650
[75519.608057]  ? __audit_syscall_entry+0x317/0x720
[75519.664960]  ? __fget_light+0x58/0x250
[75519.711325]  ? __sys_sendmsg+0xde/0x170
[75519.758850]  __sys_sendmsg+0xde/0x170
[75519.804193]  ? __ia32_sys_shutdown+0x90/0x90
[75519.856725]  ? syscall_trace_enter+0x897/0x10e0
[75519.912354]  ? trace_event_raw_event_sys_enter+0x920/0x920
[75519.979432]  ? __audit_syscall_entry+0x720/0x720
[75520.036118]  do_syscall_64+0xa3/0x3d0
[75520.081248]  ? prepare_exit_to_usermode+0x47/0x1d0
[75520.139904]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[75520.201680] RIP: 0033:0x7fc153320ba0
[75520.245772] RSP: 002b:00007ffe294c3638 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[75520.337708] RAX: ffffffffffffffda RBX: 00007ffe294c4820 RCX: 00007fc153320ba0
[75520.424547] RDX: 0000000000000000 RSI: 00007ffe294c46b0 RDI: 0000000000000003
[75520.511386] RBP: 00007ffe294c47b0 R08: 0000000000000004 R09: 0000000002114090
[75520.598225] R10: 00007ffe294c30a0 R11: 0000000000000246 R12: 00007ffe294c3660
[75520.684961] R13: 0000000000000001 R14: 00007ffe294c3650 R15: 0000000000000001

[75520.790946] Allocated by task 7356:
[75520.833994]  kasan_kmalloc+0xa6/0xd0
[75520.878088]  __kmalloc+0x189/0x450
[75520.920107]  nft_trans_alloc_gfp+0x20/0x190 [nf_tables]
[75520.983961]  nf_tables_newtable+0xcd0/0x1bd0 [nf_tables]
[75521.048857]  nfnetlink_rcv+0xc43/0x1bdf [nfnetlink]
[75521.108655]  netlink_unicast+0x45d/0x680
[75521.157013]  netlink_sendmsg+0x6fa/0xd30
[75521.205271]  sock_sendmsg+0xd9/0x160
[75521.249365]  ___sys_sendmsg+0x64d/0x980
[75521.296686]  __sys_sendmsg+0xde/0x170
[75521.341822]  do_syscall_64+0xa3/0x3d0
[75521.386957]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

[75521.467867] Freed by task 23454:
[75521.507804]  __kasan_slab_free+0x132/0x180
[75521.558137]  kfree+0x14d/0x4d0
[75521.596005]  free_rt_sched_group+0x153/0x280
[75521.648410]  sched_autogroup_create_attach+0x19a/0x520
[75521.711330]  ksys_setsid+0x2ba/0x400
[75521.755529]  __ia32_sys_setsid+0xa/0x10
[75521.802850]  do_syscall_64+0xa3/0x3d0
[75521.848090]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

[75521.929000] The buggy address belongs to the object at ffff881bdb643f80
 which belongs to the cache kmalloc-96 of size 96
[75522.079797] The buggy address is located 72 bytes inside of
 96-byte region [ffff881bdb643f80, ffff881bdb643fe0)
[75522.221234] The buggy address belongs to the page:
[75522.280100] page:ffffea006f6d90c0 count:1 mapcount:0 mapping:0000000000000000 index:0x0
[75522.377443] flags: 0x2fffff80000100(slab)
[75522.426956] raw: 002fffff80000100 0000000000000000 0000000000000000 0000000180200020
[75522.521275] raw: ffffea006e6fafc0 0000000c0000000c ffff881bf180f400 0000000000000000
[75522.615601] page dumped because: kasan: bad access detected

Fixes: 37a9cc525525 ("netfilter: nf_tables: add generation mask to sets")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 501e48a7965b..8d8dfe417014 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2728,12 +2728,13 @@ static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
 	u32 id = ntohl(nla_get_be32(nla));
 
 	list_for_each_entry(trans, &net->nft.commit_list, list) {
-		struct nft_set *set = nft_trans_set(trans);
+		if (trans->msg_type == NFT_MSG_NEWSET) {
+			struct nft_set *set = nft_trans_set(trans);
 
-		if (trans->msg_type == NFT_MSG_NEWSET &&
-		    id == nft_trans_set_id(trans) &&
-		    nft_active_genmask(set, genmask))
-			return set;
+			if (id == nft_trans_set_id(trans) &&
+			    nft_active_genmask(set, genmask))
+				return set;
+		}
 	}
 	return ERR_PTR(-ENOENT);
 }

From 31875d4970baa02e08b719fdfea6f43e9e2f7e77 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 24 May 2018 23:40:12 +0300
Subject: [PATCH 03/81] ipvs: register conntrack hooks for ftp

ip_vs_ftp requires conntrack modules for mangling
of FTP command responses in passive mode.

Make sure the conntrack hooks are registered when
real servers use NAT method in FTP virtual service.
The hooks will be registered while the service is
present.

Fixes: 0c66dc1ea3f0 ("netfilter: conntrack: register hooks in netns when needed by ruleset")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h            | 30 ++++++++++++++++++++++++++++++
 net/netfilter/ipvs/ip_vs_ctl.c |  4 ++++
 2 files changed, 34 insertions(+)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index eb0bec043c96..ae72d9057eda 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -643,6 +643,7 @@ struct ip_vs_service {
 
 	/* alternate persistence engine */
 	struct ip_vs_pe __rcu	*pe;
+	int			conntrack_afmask;
 
 	struct rcu_head		rcu_head;
 };
@@ -1620,6 +1621,35 @@ static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp,
 	return false;
 }
 
+static inline int ip_vs_register_conntrack(struct ip_vs_service *svc)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	int afmask = (svc->af == AF_INET6) ? 2 : 1;
+	int ret = 0;
+
+	if (!(svc->conntrack_afmask & afmask)) {
+		ret = nf_ct_netns_get(svc->ipvs->net, svc->af);
+		if (ret >= 0)
+			svc->conntrack_afmask |= afmask;
+	}
+	return ret;
+#else
+	return 0;
+#endif
+}
+
+static inline void ip_vs_unregister_conntrack(struct ip_vs_service *svc)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	int afmask = (svc->af == AF_INET6) ? 2 : 1;
+
+	if (svc->conntrack_afmask & afmask) {
+		nf_ct_netns_put(svc->ipvs->net, svc->af);
+		svc->conntrack_afmask &= ~afmask;
+	}
+#endif
+}
+
 static inline int
 ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
 {
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 3ecca0616d8c..ee0ab278f1f1 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -835,6 +835,9 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 		 *    For now only for NAT!
 		 */
 		ip_vs_rs_hash(ipvs, dest);
+		/* FTP-NAT requires conntrack for mangling */
+		if (svc->port == FTPPORT)
+			ip_vs_register_conntrack(svc);
 	}
 	atomic_set(&dest->conn_flags, conn_flags);
 
@@ -1458,6 +1461,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
  */
 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
 {
+	ip_vs_unregister_conntrack(svc);
 	/* Hold svc to avoid double release from dest_trash */
 	atomic_inc(&svc->refcnt);
 	/*

From 0cafa3926f0d8d72a2a814843f4db8cfef66d4ce Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Fri, 1 Jun 2018 19:12:28 +0900
Subject: [PATCH 04/81] netfilter: nft_reject_bridge: fix skb allocation size
 in nft_reject_br_send_v6_unreach

In order to allocate icmpv6 skb, sizeof(struct ipv6hdr) should be used.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nft_reject_bridge.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index eaf05de37f75..6de981270566 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -261,7 +261,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net,
 	if (!reject6_br_csum_ok(oldskb, hook))
 		return;
 
-	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmp6hdr) +
+	nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) +
 			 LL_MAX_HEADER + len, GFP_ATOMIC);
 	if (!nskb)
 		return;

From 6fcc02e3c2bddeaf628fde3c6a5ab3216d45691a Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sat, 2 Jun 2018 21:52:15 +0300
Subject: [PATCH 05/81] ipvs: fix check on xmit to non-local addresses

There is mistake in the rt_mode_allow_non_local assignment.
It should be used to check if sending to non-local addresses is
allowed, now it checks if local addresses are allowed.

As local addresses are allowed for most of the cases, the only
places that are affected are for traffic to transparent cache
servers:

- bypass connections when cache server is not available
- related ICMP in FORWARD hook when sent to cache server

Fixes: 4a4739d56b00 ("ipvs: Pull out crosses_local_route_boundary logic")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 4527921b1c3a..8f7fff774283 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -168,7 +168,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
 						bool new_rt_is_local)
 {
 	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
-	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
+	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
 	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
 	bool source_is_loopback;
 	bool old_rt_is_local;

From 9e8c8dabb78e886ace989729e763d28c76f5169e Mon Sep 17 00:00:00 2001
From: Alin Nastac <alin.nastac@gmail.com>
Date: Wed, 30 May 2018 15:19:36 +0200
Subject: [PATCH 06/81] netfilter: ebtables: fix compat entry padding

On arm64, ebt_entry_{match,watcher,target} structs are 40 bytes long
while on 32-bit arm these structs have a size of 36 bytes.

COMPAT_XT_ALIGN() macro cannot be used here to determine the necessary
padding for the CONFIG_COMPAT because it imposes an 8-byte boundary
alignment, condition that is not found in 32-bit ebtables application.

Signed-off-by: Alin Nastac <alin.nastac@gmail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 6ba639f6c51d..5f459c8b7937 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1610,16 +1610,16 @@ struct compat_ebt_entry_mwt {
 		compat_uptr_t ptr;
 	} u;
 	compat_uint_t match_size;
-	compat_uint_t data[0];
+	compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace))));
 };
 
 /* account for possible padding between match_size and ->data */
 static int ebt_compat_entry_padsize(void)
 {
-	BUILD_BUG_ON(XT_ALIGN(sizeof(struct ebt_entry_match)) <
-			COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt)));
-	return (int) XT_ALIGN(sizeof(struct ebt_entry_match)) -
-			COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt));
+	BUILD_BUG_ON(sizeof(struct ebt_entry_match) <
+			sizeof(struct compat_ebt_entry_mwt));
+	return (int) sizeof(struct ebt_entry_match) -
+			sizeof(struct compat_ebt_entry_mwt);
 }
 
 static int ebt_compat_match_offset(const struct xt_match *match,

From 9dcceb1378b6d66633f613805b2d5a22af4d5383 Mon Sep 17 00:00:00 2001
From: Serhey Popovych <serhe.popovych@gmail.com>
Date: Tue, 5 Jun 2018 11:46:13 +0200
Subject: [PATCH 07/81] netfilter: xt_set: Check hook mask correctly

Inserting rule before one with SET target we get error with warning in
dmesg(1) output:

  # iptables -A FORWARD -t mangle -j SET --map-set test src --map-prio
  # iptables -I FORWARD 1 -t mangle -j ACCEPT
  iptables: Invalid argument. Run `dmesg' for more information.
  # dmesg |tail -n1
  [268578.026643] mapping of prio or/and queue is allowed only from \
  OUTPUT/FORWARD/POSTROUTING chains

Rather than checking for supported hook bits for SET target check for
unsupported one as done in all rest of matches and targets.

Signed-off-by: Serhey Popovych <serhe.popovych@gmail.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/xt_set.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 6f4c5217d835..07af7dbf7a30 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -470,7 +470,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
 		}
 		if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
 		     (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
-		     !(par->hook_mask & (1 << NF_INET_FORWARD |
+		     (par->hook_mask & ~(1 << NF_INET_FORWARD |
 					 1 << NF_INET_LOCAL_OUT |
 					 1 << NF_INET_POST_ROUTING))) {
 			pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");

From bd975e691486ba52790ba23cc9b4fecab7bc0d31 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 31 May 2018 18:45:21 +0200
Subject: [PATCH 08/81] netfilter: ipset: List timing out entries with "timeout
 1" instead of zero

When listing sets with timeout support, there's a probability that
just timing out entries with "0" timeout value is listed/saved.
However when restoring the saved list, the zero timeout value means
permanent elelements.

The new behaviour is that timing out entries are listed with "timeout 1"
instead of zero.

Fixes netfilter bugzilla #1258.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set_timeout.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index bfb3531fd88a..7ad8ddf9ca8a 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -65,8 +65,14 @@ ip_set_timeout_set(unsigned long *timeout, u32 value)
 static inline u32
 ip_set_timeout_get(const unsigned long *timeout)
 {
-	return *timeout == IPSET_ELEM_PERMANENT ? 0 :
-		jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC;
+	u32 t;
+
+	if (*timeout == IPSET_ELEM_PERMANENT)
+		return 0;
+
+	t = jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC;
+	/* Zero value in userspace means no timeout */
+	return t == 0 ? 1 : t;
 }
 
 #endif	/* __KERNEL__ */

From 30a2e107108c66cbcb7776b58cbcd7db223a1cc9 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Tue, 5 Jun 2018 11:53:35 +0200
Subject: [PATCH 09/81] netfilter: ipset: Limit max timeout value

Due to the negative value condition in msecs_to_jiffies(), the real
max possible timeout value must be set to (UINT_MAX >> 1)/MSEC_PER_SEC.

Neutron Soutmun proposed the proper fix, but an insufficient one was
applied, see https://patchwork.ozlabs.org/patch/400405/.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set_timeout.h | 10 ++++++----
 net/netfilter/xt_set.c                         |  8 ++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index 7ad8ddf9ca8a..8ce271e187b6 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -23,6 +23,9 @@
 /* Set is defined with timeout support: timeout value may be 0 */
 #define IPSET_NO_TIMEOUT	UINT_MAX
 
+/* Max timeout value, see msecs_to_jiffies() in jiffies.h */
+#define IPSET_MAX_TIMEOUT	(UINT_MAX >> 1)/MSEC_PER_SEC
+
 #define ip_set_adt_opt_timeout(opt, set)	\
 ((opt)->ext.timeout != IPSET_NO_TIMEOUT ? (opt)->ext.timeout : (set)->timeout)
 
@@ -32,11 +35,10 @@ ip_set_timeout_uget(struct nlattr *tb)
 	unsigned int timeout = ip_set_get_h32(tb);
 
 	/* Normalize to fit into jiffies */
-	if (timeout > UINT_MAX/MSEC_PER_SEC)
-		timeout = UINT_MAX/MSEC_PER_SEC;
+	if (timeout > IPSET_MAX_TIMEOUT)
+		timeout = IPSET_MAX_TIMEOUT;
 
-	/* Userspace supplied TIMEOUT parameter: adjust crazy size */
-	return timeout == IPSET_NO_TIMEOUT ? IPSET_NO_TIMEOUT - 1 : timeout;
+	return timeout;
 }
 
 static inline bool
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 07af7dbf7a30..bf2890b13212 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -372,8 +372,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
 
 	/* Normalize to fit into jiffies */
 	if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
-	    add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC)
-		add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC;
+	    add_opt.ext.timeout > IPSET_MAX_TIMEOUT)
+		add_opt.ext.timeout = IPSET_MAX_TIMEOUT;
 	if (info->add_set.index != IPSET_INVALID_ID)
 		ip_set_add(info->add_set.index, skb, par, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)
@@ -407,8 +407,8 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
 
 	/* Normalize to fit into jiffies */
 	if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
-	    add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC)
-		add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC;
+	    add_opt.ext.timeout > IPSET_MAX_TIMEOUT)
+		add_opt.ext.timeout = IPSET_MAX_TIMEOUT;
 	if (info->add_set.index != IPSET_INVALID_ID)
 		ip_set_add(info->add_set.index, skb, par, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)

From cbdebe481a14b42c45aa9f4ceb5ff19b55de2c57 Mon Sep 17 00:00:00 2001
From: Florent Fourcot <florent.fourcot@wifirst.fr>
Date: Mon, 4 Jun 2018 16:51:19 +0200
Subject: [PATCH 10/81] netfilter: ipset: forbid family for hash:mac sets

Userspace `ipset` command forbids family option for hash:mac type:

ipset create test hash:mac family inet4
ipset v6.30: Unknown argument: `family'

However, this check is not done in kernel itself. When someone use
external netlink applications (pyroute2 python library for example), one
can create hash:mac with invalid family and inconsistant results from
userspace (`ipset` command cannot read set content anymore).

This patch enforce the logic in kernel, and forbids insertion of
hash:mac with a family set.

Since IP_SET_PROTO_UNDEF is defined only for hash:mac, this patch has no
impact on other hash:* sets

Signed-off-by: Florent Fourcot <florent.fourcot@wifirst.fr>
Signed-off-by: Victorien Molle <victorien.molle@wifirst.fr>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/ipset/ip_set_hash_gen.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index bbad940c0137..8a33dac4e805 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1234,7 +1234,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 	pr_debug("Create set %s with family %s\n",
 		 set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
 
-#ifndef IP_SET_PROTO_UNDEF
+#ifdef IP_SET_PROTO_UNDEF
+	if (set->family != NFPROTO_UNSPEC)
+		return -IPSET_ERR_INVALID_FAMILY;
+#else
 	if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
 		return -IPSET_ERR_INVALID_FAMILY;
 #endif

From 11ff7288beb2b7da889a014aff0a7b80bf8efcf3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 6 Jun 2018 12:14:56 +0200
Subject: [PATCH 11/81] netfilter: ebtables: reject non-bridge targets

the ebtables evaluation loop expects targets to return
positive values (jumps), or negative values (absolute verdicts).

This is completely different from what xtables does.
In xtables, targets are expected to return the standard netfilter
verdicts, i.e. NF_DROP, NF_ACCEPT, etc.

ebtables will consider these as jumps.

Therefore reject any target found due to unspec fallback.
v2: also reject watchers.  ebtables ignores their return value, so
a target that assumes skb ownership (and returns NF_STOLEN) causes
use-after-free.

The only watchers in the 'ebtables' front-end are log and nflog;
both have AF_BRIDGE specific wrappers on kernel side.

Reported-by: syzbot+2b43f681169a2a0d306a@syzkaller.appspotmail.com
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 5f459c8b7937..08a65e4a77d0 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -396,6 +396,12 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par,
 	watcher = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0);
 	if (IS_ERR(watcher))
 		return PTR_ERR(watcher);
+
+	if (watcher->family != NFPROTO_BRIDGE) {
+		module_put(watcher->me);
+		return -ENOENT;
+	}
+
 	w->u.watcher = watcher;
 
 	par->target   = watcher;
@@ -715,6 +721,13 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
 		goto cleanup_watchers;
 	}
 
+	/* Reject UNSPEC, xtables verdicts/return values are incompatible */
+	if (target->family != NFPROTO_BRIDGE) {
+		module_put(target->me);
+		ret = -ENOENT;
+		goto cleanup_watchers;
+	}
+
 	t->u.target = target;
 	if (t->u.target == &ebt_standard_target) {
 		if (gap < sizeof(struct ebt_standard_target)) {

From 82e20b44477ffe90a5866caa209ecc9df818c6a1 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 7 Jun 2018 02:05:12 +0900
Subject: [PATCH 12/81] netfilter: nft_set_rbtree: fix parameter of
 __nft_rbtree_lookup()

The parameter this doesn't have a flags value. so that it can't be
used by nft_rbtree_interval_end().

test commands:
   %nft add table ip filter
   %nft add set ip filter s { type ipv4_addr \; flags interval \; }
   %nft add element ip filter s {0-1}
   %nft add element ip filter s {2-10}
   %nft add chain ip filter input { type filter hook input priority 0\; }
   %nft add rule ip filter input ip saddr @s

Splat looks like:
[  246.752502] BUG: KASAN: slab-out-of-bounds in __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree]
[  246.752502] Read of size 1 at addr ffff88010d9efa47 by task http/1092

[  246.752502] CPU: 1 PID: 1092 Comm: http Not tainted 4.17.0-rc6+ #185
[  246.752502] Call Trace:
[  246.752502]  <IRQ>
[  246.752502]  dump_stack+0x74/0xbb
[  246.752502]  ? __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree]
[  246.752502]  print_address_description+0xc7/0x290
[  246.752502]  ? __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree]
[  246.752502]  kasan_report+0x22c/0x350
[  246.752502]  __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree]
[  246.752502]  nft_rbtree_lookup+0xc9/0x2d2 [nft_set_rbtree]
[  246.752502]  ? sched_clock_cpu+0x144/0x180
[  246.752502]  nft_lookup_eval+0x149/0x3a0 [nf_tables]
[  246.752502]  ? __lock_acquire+0xcea/0x4ed0
[  246.752502]  ? nft_lookup_init+0x6b0/0x6b0 [nf_tables]
[  246.752502]  nft_do_chain+0x263/0xf50 [nf_tables]
[  246.752502]  ? __nft_trace_packet+0x1a0/0x1a0 [nf_tables]
[  246.752502]  ? sched_clock_cpu+0x144/0x180
[ ... ]

Fixes: f9121355eb6f ("netfilter: nft_set_rbtree: incorrect assumption on lower interval lookups")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index e6f08bc5f359..26fa93b23805 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -65,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 			parent = rcu_dereference_raw(parent->rb_left);
 			if (interval &&
 			    nft_rbtree_equal(set, this, interval) &&
-			    nft_rbtree_interval_end(this) &&
+			    nft_rbtree_interval_end(rbe) &&
 			    !nft_rbtree_interval_end(interval))
 				continue;
 			interval = rbe;

From 64e6dd1fb2f1ed799f317dc34aa6e251c64f4981 Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Thu, 7 Jun 2018 18:15:14 +0800
Subject: [PATCH 13/81] netfilter: nf_conntrack: Increase __IPS_MAX_BIT with
 new bit IPS_OFFLOAD_BIT

The __IPS_MAX_BIT is used in __ctnetlink_change_status as the max bit
value. When add new bit IPS_OFFLOAD_BIT whose value is 14, we should
increase the __IPS_MAX_BIT too, from 14 to 15.

There is no any bug in current codes, although it lost one loop in
__ctnetlink_change_status. Because the new bit IPS_OFFLOAD_BIT belongs
the IPS_UNCHANGEABLE_MASK.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index c712eb6879f1..336014bf8868 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -112,7 +112,7 @@ enum ip_conntrack_status {
 				 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
 				 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
 
-	__IPS_MAX_BIT = 14,
+	__IPS_MAX_BIT = 15,
 };
 
 /* Connection tracking event types */

From c568503ef02030f169c9e19204def610a3510918 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 7 Jun 2018 21:34:43 +0200
Subject: [PATCH 14/81] netfilter: x_tables: initialise match/target check
 parameter struct

syzbot reports following splat:

BUG: KMSAN: uninit-value in ebt_stp_mt_check+0x24b/0x450
 net/bridge/netfilter/ebt_stp.c:162
 ebt_stp_mt_check+0x24b/0x450 net/bridge/netfilter/ebt_stp.c:162
 xt_check_match+0x1438/0x1650 net/netfilter/x_tables.c:506
 ebt_check_match net/bridge/netfilter/ebtables.c:372 [inline]
 ebt_check_entry net/bridge/netfilter/ebtables.c:702 [inline]

The uninitialised access is
   xt_mtchk_param->nft_compat

... which should be set to 0.
Fix it by zeroing the struct beforehand, same for tgchk.

ip(6)tables targetinfo uses c99-style initialiser, so no change
needed there.

Reported-by: syzbot+da4494182233c23a5fcf@syzkaller.appspotmail.com
Fixes: 55917a21d0cc0 ("netfilter: x_tables: add context to know if extension runs from nft_compat")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 2 ++
 net/ipv4/netfilter/ip_tables.c  | 1 +
 net/ipv6/netfilter/ip6_tables.c | 1 +
 3 files changed, 4 insertions(+)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 08a65e4a77d0..ead123dab05e 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -700,6 +700,8 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
 	}
 	i = 0;
 
+	memset(&mtpar, 0, sizeof(mtpar));
+	memset(&tgpar, 0, sizeof(tgpar));
 	mtpar.net	= tgpar.net       = net;
 	mtpar.table     = tgpar.table     = name;
 	mtpar.entryinfo = tgpar.entryinfo = e;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e85f35b89c49..f6130704f052 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -531,6 +531,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 		return -ENOMEM;
 
 	j = 0;
+	memset(&mtpar, 0, sizeof(mtpar));
 	mtpar.net	= net;
 	mtpar.table     = name;
 	mtpar.entryinfo = &e->ip;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 97f79dc943d7..685c2168f524 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -551,6 +551,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 		return -ENOMEM;
 
 	j = 0;
+	memset(&mtpar, 0, sizeof(mtpar));
 	mtpar.net	= net;
 	mtpar.table     = name;
 	mtpar.entryinfo = &e->ipv6;

From d8e87fc6d11c31525430a388317b52f4a98a5328 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 7 Jun 2018 19:38:09 +0000
Subject: [PATCH 15/81] netfilter: remove include/net/netfilter/nft_dup.h

include/net/netfilter/nft_dup.h was introduced in d877f07112f1 ("netfilter: nf_tables: add nft_dup expression")
but was never user since this date.

Furthermore, the only struct in this file is unused elsewhere.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_dup.h | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 include/net/netfilter/nft_dup.h

diff --git a/include/net/netfilter/nft_dup.h b/include/net/netfilter/nft_dup.h
deleted file mode 100644
index 4d9d512984b2..000000000000
--- a/include/net/netfilter/nft_dup.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NFT_DUP_H_
-#define _NFT_DUP_H_
-
-struct nft_dup_inet {
-	enum nft_registers	sreg_addr:8;
-	enum nft_registers	sreg_dev:8;
-};
-
-#endif /* _NFT_DUP_H_ */

From b16558579576c8f2781062b638600f68954b1827 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 8 Jun 2018 18:10:34 +0200
Subject: [PATCH 16/81] bpf: implement dummy fops for bpf objects

syzkaller was able to trigger the following warning in
do_dentry_open():

  WARNING: CPU: 1 PID: 4508 at fs/open.c:778 do_dentry_open+0x4ad/0xe40 fs/open.c:778
  Kernel panic - not syncing: panic_on_warn set ...

  CPU: 1 PID: 4508 Comm: syz-executor867 Not tainted 4.17.0+ #90
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  Call Trace:
  [...]
   vfs_open+0x139/0x230 fs/open.c:908
   do_last fs/namei.c:3370 [inline]
   path_openat+0x1717/0x4dc0 fs/namei.c:3511
   do_filp_open+0x249/0x350 fs/namei.c:3545
   do_sys_open+0x56f/0x740 fs/open.c:1101
   __do_sys_openat fs/open.c:1128 [inline]
   __se_sys_openat fs/open.c:1122 [inline]
   __x64_sys_openat+0x9d/0x100 fs/open.c:1122
   do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

Problem was that prog and map inodes in bpf fs did not
implement a dummy file open operation that would return an
error. The patch in do_dentry_open() checks whether f_ops
are present and if not bails out with an error. While this
may be fine, we really shouldn't be throwing a warning
though. Thus follow the model similar to bad_file_ops and
reject the request unconditionally with -EIO.

Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Reported-by: syzbot+2e7fcab0f56fdbb330b8@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/inode.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index ed13645bd80c..76efe9a183f5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,6 +295,15 @@ static const struct file_operations bpffs_map_fops = {
 	.release	= bpffs_map_release,
 };
 
+static int bpffs_obj_open(struct inode *inode, struct file *file)
+{
+	return -EIO;
+}
+
+static const struct file_operations bpffs_obj_fops = {
+	.open		= bpffs_obj_open,
+};
+
 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 			 const struct inode_operations *iops,
 			 const struct file_operations *fops)
@@ -314,7 +323,8 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 
 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL);
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops,
+			     &bpffs_obj_fops);
 }
 
 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
@@ -322,7 +332,7 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 	struct bpf_map *map = arg;
 
 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
-			     map->btf ? &bpffs_map_fops : NULL);
+			     map->btf ? &bpffs_map_fops : &bpffs_obj_fops);
 }
 
 static struct dentry *

From 1c9ca7e9836a4df1518568ea47461c5ef7c2cf8b Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Fri, 8 Jun 2018 08:51:27 +0200
Subject: [PATCH 17/81] selftests: bpf: fix urandom_read build issue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc complains that urandom_read gets built twice.

gcc -o tools/testing/selftests/bpf/urandom_read
-static urandom_read.c -Wl,--build-id
gcc -Wall -O2 -I../../../include/uapi -I../../../lib -I../../../lib/bpf
-I../../../../include/generated  -I../../../include    urandom_read.c
urandom_read -lcap -lelf -lrt -lpthread -o
tools/testing/selftests/bpf/urandom_read
gcc: fatal error: input file
‘tools/testing/selftests/bpf/urandom_read’ is the
same as output file
compilation terminated.
../lib.mk:110: recipe for target
'tools/testing/selftests/bpf/urandom_read' failed
To fix this issue remove the urandom_read target and so target
TEST_CUSTOM_PROGS gets used.

Fixes: 81f77fd0deeb ("bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 607ed8729c06..7a6214e9ae58 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -16,9 +16,7 @@ LDLIBS += -lcap -lelf -lrt -lpthread
 TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
 all: $(TEST_CUSTOM_PROGS)
 
-$(TEST_CUSTOM_PROGS): urandom_read
-
-urandom_read: urandom_read.c
+$(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c
 	$(CC) -o $(TEST_CUSTOM_PROGS) -static $< -Wl,--build-id
 
 # Order correspond to 'make run_tests' order

From 646bb57ce86e4d7b0bd9d33244450ae009411e48 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 4 Jun 2018 11:07:24 -0400
Subject: [PATCH 18/81] ixgbe: Fix setting of TC configuration for macvlan case

When we were enabling macvlan interfaces we weren't correctly configuring
things until ixgbe_setup_tc was called a second time either by tweaking the
number of queues or increasing the macvlan count past 15.

The issue came down to the fact that num_rx_pools is not populated until
after the queues and interrupts are reinitialized.

Instead of trying to set it sooner we can just move the call to setup at
least 1 traffic class to the SR-IOV/VMDq setup function so that we just set
it for this one case. We already had a spot that was configuring the queues
for TC 0 in the code here anyway so it makes sense to also set the number
of TCs here as well.

Fixes: 49cfbeb7a95c ("ixgbe: Fix handling of macvlan Tx offload")
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c  | 8 ++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index 893a9206e718..d361f570ca37 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -593,6 +593,14 @@ static bool ixgbe_set_sriov_queues(struct ixgbe_adapter *adapter)
 	}
 
 #endif
+	/* To support macvlan offload we have to use num_tc to
+	 * restrict the queues that can be used by the device.
+	 * By doing this we can avoid reporting a false number of
+	 * queues.
+	 */
+	if (vmdq_i > 1)
+		netdev_set_num_tc(adapter->netdev, 1);
+
 	/* populate TC0 for use by pool 0 */
 	netdev_set_tc_queue(adapter->netdev, 0,
 			    adapter->num_rx_queues_per_pool, 0);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 4929f7265598..f9e0dc041cfb 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8822,14 +8822,6 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc)
 	} else {
 		netdev_reset_tc(dev);
 
-		/* To support macvlan offload we have to use num_tc to
-		 * restrict the queues that can be used by the device.
-		 * By doing this we can avoid reporting a false number of
-		 * queues.
-		 */
-		if (!tc && adapter->num_rx_pools > 1)
-			netdev_set_num_tc(dev, 1);
-
 		if (adapter->hw.mac.type == ixgbe_mac_82598EB)
 			adapter->hw.fc.requested_mode = adapter->last_lfc_mode;
 

From e433f3a5e272625c166d780f79ecc8fe456a5fc9 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 4 Jun 2018 16:51:20 -0400
Subject: [PATCH 19/81] ixgbe: Use CONFIG_XFRM_OFFLOAD instead of CONFIG_XFRM

There is no point in adding code if CONFIG_XFRM is defined that we won't
use unless CONFIG_XFRM_OFFLOAD is defined. So instead of leaving this code
floating around I am replacing the ifdef with what I believe is the correct
one so that we only include the code and variables if they will actually be
used.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Shannon Nelson <shannon.nelson@oracle.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index fc534e91c6b2..144d5fe6b944 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -760,9 +760,9 @@ struct ixgbe_adapter {
 #define IXGBE_RSS_KEY_SIZE     40  /* size of RSS Hash Key in bytes */
 	u32 *rss_key;
 
-#ifdef CONFIG_XFRM
+#ifdef CONFIG_XFRM_OFFLOAD
 	struct ixgbe_ipsec *ipsec;
-#endif /* CONFIG_XFRM */
+#endif /* CONFIG_XFRM_OFFLOAD */
 };
 
 static inline u8 ixgbe_max_rss_indices(struct ixgbe_adapter *adapter)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f9e0dc041cfb..a925f05ec342 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9896,7 +9896,7 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
 	 * the TSO, so it's the exception.
 	 */
 	if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID)) {
-#ifdef CONFIG_XFRM
+#ifdef CONFIG_XFRM_OFFLOAD
 		if (!skb->sp)
 #endif
 			features &= ~NETIF_F_TSO;

From de7a7e34e27c029fbb3c4e764db045548629b834 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 4 Jun 2018 16:51:25 -0400
Subject: [PATCH 20/81] ixgbe: Move ipsec init function to before reset call

This patch moves the IPsec init function in ixgbe_sw_init. This way it is a
bit more consistent with the placement of similar initialization functions
and is placed before the reset_hw call which should allow us to clean up
any link issues that may be introduced by the fact that we force the link
up if somehow the device had IPsec still enabled before the driver was
loaded.

In addition to the function move it is necessary to change the assignment
of netdev->features. The easiest way to do this is to just test for the
existence of adapter->ipsec and if it is present we set the feature bits.

Fixes: 49a94d74d948 ("ixgbe: add ipsec engine start and stop routines")
Reported-by: Andre Tomt <andre@tomt.net>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Shannon Nelson <shannon.nelson@oracle.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c |  7 -------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 11 +++++++++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 344a1f213a5f..38d8cf75e9ad 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -1001,13 +1001,6 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
 
 	adapter->netdev->xfrmdev_ops = &ixgbe_xfrmdev_ops;
 
-#define IXGBE_ESP_FEATURES	(NETIF_F_HW_ESP | \
-				 NETIF_F_HW_ESP_TX_CSUM | \
-				 NETIF_F_GSO_ESP)
-
-	adapter->netdev->features |= IXGBE_ESP_FEATURES;
-	adapter->netdev->hw_enc_features |= IXGBE_ESP_FEATURES;
-
 	return;
 
 err2:
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a925f05ec342..8d061af276d3 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6117,6 +6117,7 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter,
 #ifdef CONFIG_IXGBE_DCB
 	ixgbe_init_dcb(adapter);
 #endif
+	ixgbe_init_ipsec_offload(adapter);
 
 	/* default flow control settings */
 	hw->fc.requested_mode = ixgbe_fc_full;
@@ -10429,6 +10430,14 @@ skip_sriov:
 	if (hw->mac.type >= ixgbe_mac_82599EB)
 		netdev->features |= NETIF_F_SCTP_CRC;
 
+#ifdef CONFIG_XFRM_OFFLOAD
+#define IXGBE_ESP_FEATURES	(NETIF_F_HW_ESP | \
+				 NETIF_F_HW_ESP_TX_CSUM | \
+				 NETIF_F_GSO_ESP)
+
+	if (adapter->ipsec)
+		netdev->features |= IXGBE_ESP_FEATURES;
+#endif
 	/* copy netdev features into list of user selectable features */
 	netdev->hw_features |= netdev->features |
 			       NETIF_F_HW_VLAN_CTAG_FILTER |
@@ -10491,8 +10500,6 @@ skip_sriov:
 					 NETIF_F_FCOE_MTU;
 	}
 #endif /* IXGBE_FCOE */
-	ixgbe_init_ipsec_offload(adapter);
-
 	if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)
 		netdev->hw_features |= NETIF_F_LRO;
 	if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)

From e9f655ee97f14b4f5eba7b6b5a56a7c298573e67 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 5 Jun 2018 11:11:08 -0400
Subject: [PATCH 21/81] ixgbe: Avoid loopback and fix boolean logic in
 ipsec_stop_data

This patch fixes two issues. First we add an early test for the Tx and Rx
security block ready bits. By doing this we can avoid the need for waits or
loopback in the event that the security block is already flushed out.
Secondly we fix the boolean logic that was testing for the Tx OR Rx ready
bits being set and change it so that we only exit if the Tx AND Rx ready
bits are both set.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Shannon Nelson <shannon.nelson@oracle.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 38d8cf75e9ad..7b23fb0c2d07 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -158,7 +158,16 @@ static void ixgbe_ipsec_stop_data(struct ixgbe_adapter *adapter)
 	reg |= IXGBE_SECRXCTRL_RX_DIS;
 	IXGBE_WRITE_REG(hw, IXGBE_SECRXCTRL, reg);
 
-	IXGBE_WRITE_FLUSH(hw);
+	/* If both Tx and Rx are ready there are no packets
+	 * that we need to flush so the loopback configuration
+	 * below is not necessary.
+	 */
+	t_rdy = IXGBE_READ_REG(hw, IXGBE_SECTXSTAT) &
+		IXGBE_SECTXSTAT_SECTX_RDY;
+	r_rdy = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT) &
+		IXGBE_SECRXSTAT_SECRX_RDY;
+	if (t_rdy && r_rdy)
+		return;
 
 	/* If the tx fifo doesn't have link, but still has data,
 	 * we can't clear the tx sec block.  Set the MAC loopback
@@ -185,7 +194,7 @@ static void ixgbe_ipsec_stop_data(struct ixgbe_adapter *adapter)
 			IXGBE_SECTXSTAT_SECTX_RDY;
 		r_rdy = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT) &
 			IXGBE_SECRXSTAT_SECRX_RDY;
-	} while (!t_rdy && !r_rdy && limit--);
+	} while (!(t_rdy && r_rdy) && limit--);
 
 	/* undo loopback if we played with it earlier */
 	if (!link) {

From 421d954c4f1e9afd55bc65398bfc64ceba38df21 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 5 Jun 2018 11:11:14 -0400
Subject: [PATCH 22/81] ixgbe: Fix bit definitions and add support for testing
 for ipsec support

This patch addresses two issues. First it adds the correct bit definitions
for the SECTXSTAT and SECRXSTAT registers. Then it makes use of those
definitions to test for if IPsec has been disabled on the part and if so we
do not enable it.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Reported-by: Andre Tomt <andre@tomt.net>
Acked-by: Shannon Nelson <shannon.nelson@oracle.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 14 +++++++++++++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h  |  6 ++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 7b23fb0c2d07..c116f459945d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -975,10 +975,22 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring,
  **/
 void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
 {
+	struct ixgbe_hw *hw = &adapter->hw;
 	struct ixgbe_ipsec *ipsec;
+	u32 t_dis, r_dis;
 	size_t size;
 
-	if (adapter->hw.mac.type == ixgbe_mac_82598EB)
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		return;
+
+	/* If there is no support for either Tx or Rx offload
+	 * we should not be advertising support for IPsec.
+	 */
+	t_dis = IXGBE_READ_REG(hw, IXGBE_SECTXSTAT) &
+		IXGBE_SECTXSTAT_SECTX_OFF_DIS;
+	r_dis = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT) &
+		IXGBE_SECRXSTAT_SECRX_OFF_DIS;
+	if (t_dis || r_dis)
 		return;
 
 	ipsec = kzalloc(sizeof(*ipsec), GFP_KERNEL);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index e8ed37749ab1..44cfb2021145 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -599,13 +599,15 @@ struct ixgbe_nvm_version {
 #define IXGBE_SECTXCTRL_STORE_FORWARD   0x00000004
 
 #define IXGBE_SECTXSTAT_SECTX_RDY       0x00000001
-#define IXGBE_SECTXSTAT_ECC_TXERR       0x00000002
+#define IXGBE_SECTXSTAT_SECTX_OFF_DIS   0x00000002
+#define IXGBE_SECTXSTAT_ECC_TXERR       0x00000004
 
 #define IXGBE_SECRXCTRL_SECRX_DIS       0x00000001
 #define IXGBE_SECRXCTRL_RX_DIS          0x00000002
 
 #define IXGBE_SECRXSTAT_SECRX_RDY       0x00000001
-#define IXGBE_SECRXSTAT_ECC_RXERR       0x00000002
+#define IXGBE_SECRXSTAT_SECRX_OFF_DIS   0x00000002
+#define IXGBE_SECRXSTAT_ECC_RXERR       0x00000004
 
 /* LinkSec (MacSec) Registers */
 #define IXGBE_LSECTXCAP         0x08A00

From 0975764684487bf3f7a47eef009e750ea41bd514 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 11 Jun 2018 02:02:54 +0300
Subject: [PATCH 23/81] ipv6: allow PMTU exceptions to local routes

IPVS setups with local client and remote tunnel server need
to create exception for the local virtual IP. What we do is to
change PMTU from 64KB (on "lo") to 1460 in the common case.

Suggested-by: Martin KaFai Lau <kafai@fb.com>
Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception")
Fixes: 7343ff31ebf0 ("ipv6: Don't create clones of host routes.")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: David Ahern <dsahern@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fb956989adaf..86a0e4333d42 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2307,9 +2307,6 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	const struct in6_addr *daddr, *saddr;
 	struct rt6_info *rt6 = (struct rt6_info *)dst;
 
-	if (rt6->rt6i_flags & RTF_LOCAL)
-		return;
-
 	if (dst_metric_locked(dst, RTAX_MTU))
 		return;
 

From 349b71d6f427ff8211adf50839dbbff3f27c1805 Mon Sep 17 00:00:00 2001
From: Zhouyang Jia <jiazhouyang09@gmail.com>
Date: Mon, 11 Jun 2018 13:26:35 +0800
Subject: [PATCH 24/81] net: dsa: add error handling for pskb_trim_rcsum

When pskb_trim_rcsum fails, the lack of error-handling code may
cause unexpected results.

This patch adds error-handling code after calling pskb_trim_rcsum.

Signed-off-by: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/tag_trailer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 7d20e1f3de28..56197f0d9608 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -75,7 +75,8 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!skb->dev)
 		return NULL;
 
-	pskb_trim_rcsum(skb, skb->len - 4);
+	if (pskb_trim_rcsum(skb, skb->len - 4))
+		return NULL;
 
 	return skb;
 }

From a343993c518ce252b62ec00ac06bccfb1d17129d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= <bjorn.topel@intel.com>
Date: Mon, 11 Jun 2018 13:57:12 +0200
Subject: [PATCH 25/81] xsk: silence warning on memory allocation failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

syzkaller reported a warning from xdp_umem_pin_pages():

  WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
  ...
  __do_kmalloc mm/slab.c:3713 [inline]
  __kmalloc+0x25/0x760 mm/slab.c:3727
  kmalloc_array include/linux/slab.h:634 [inline]
  kcalloc include/linux/slab.h:645 [inline]
  xdp_umem_pin_pages net/xdp/xdp_umem.c:205 [inline]
  xdp_umem_reg net/xdp/xdp_umem.c:318 [inline]
  xdp_umem_create+0x5c9/0x10f0 net/xdp/xdp_umem.c:349
  xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531
  __sys_setsockopt+0x1bd/0x390 net/socket.c:1935
  __do_sys_setsockopt net/socket.c:1946 [inline]
  __se_sys_setsockopt net/socket.c:1943 [inline]
  __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943
  do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

This is a warning about attempting to allocate more than
KMALLOC_MAX_SIZE memory. The request originates from userspace, and if
the request is too big, the kernel is free to deny its allocation. In
this patch, the failed allocation attempt is silenced with
__GFP_NOWARN.

Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xdp_umem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index b9ef487c4618..f47abb46c587 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -204,7 +204,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem)
 	long npgs;
 	int err;
 
-	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
+	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
+			    GFP_KERNEL | __GFP_NOWARN);
 	if (!umem->pgs)
 		return -ENOMEM;
 

From f6fadff33e8b09373eedf99822b89d9dd84545b8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 11 Jun 2018 23:22:04 +0200
Subject: [PATCH 26/81] tls: fix NULL pointer dereference on poll

While hacking on kTLS, I ran into the following panic from an
unprivileged netserver / netperf TCP session:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
  PGD 800000037f378067 P4D 800000037f378067 PUD 3c0e61067 PMD 0
  Oops: 0010 [#1] SMP KASAN PTI
  CPU: 1 PID: 2289 Comm: netserver Not tainted 4.17.0+ #139
  Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016
  RIP: 0010:          (null)
  Code: Bad RIP value.
  RSP: 0018:ffff88036abcf740 EFLAGS: 00010246
  RAX: dffffc0000000000 RBX: ffff88036f5f6800 RCX: 1ffff1006debed26
  RDX: ffff88036abcf920 RSI: ffff8803cb1a4f00 RDI: ffff8803c258c280
  RBP: ffff8803c258c280 R08: ffff8803c258c280 R09: ffffed006f559d48
  R10: ffff88037aacea43 R11: ffffed006f559d49 R12: ffff8803c258c280
  R13: ffff8803cb1a4f20 R14: 00000000000000db R15: ffffffffc168a350
  FS:  00007f7e631f4700(0000) GS:ffff8803d1c80000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffffffffffffffd6 CR3: 00000003ccf64005 CR4: 00000000003606e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   ? tls_sw_poll+0xa4/0x160 [tls]
   ? sock_poll+0x20a/0x680
   ? do_select+0x77b/0x11a0
   ? poll_schedule_timeout.constprop.12+0x130/0x130
   ? pick_link+0xb00/0xb00
   ? read_word_at_a_time+0x13/0x20
   ? vfs_poll+0x270/0x270
   ? deref_stack_reg+0xad/0xe0
   ? __read_once_size_nocheck.constprop.6+0x10/0x10
  [...]

Debugging further, it turns out that calling into ctx->sk_poll() is
invalid since sk_poll itself is NULL which was saved from the original
TCP socket in order for tls_sw_poll() to invoke it.

Looks like the recent conversion from poll to poll_mask callback started
in 152524231023 ("net: add support for ->poll_mask in proto_ops") missed
to eventually convert kTLS, too: TCP's ->poll was converted over to the
->poll_mask in commit 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask")
and therefore kTLS wrongly saved the ->poll old one which is now NULL.

Convert kTLS over to use ->poll_mask instead. Also instead of POLLIN |
POLLRDNORM use the proper EPOLLIN | EPOLLRDNORM bits as the case in
tcp_poll_mask() as well that is mangled here.

Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Watson <davejwatson@fb.com>
Tested-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h  |  6 ++----
 net/tls/tls_main.c |  2 +-
 net/tls/tls_sw.c   | 19 +++++++++----------
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 70c273777fe9..7f84ea3e217c 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -109,8 +109,7 @@ struct tls_sw_context_rx {
 
 	struct strparser strp;
 	void (*saved_data_ready)(struct sock *sk);
-	unsigned int (*sk_poll)(struct file *file, struct socket *sock,
-				struct poll_table_struct *wait);
+	__poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events);
 	struct sk_buff *recv_pkt;
 	u8 control;
 	bool decrypted;
@@ -225,8 +224,7 @@ void tls_sw_free_resources_tx(struct sock *sk);
 void tls_sw_free_resources_rx(struct sock *sk);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
-			 struct poll_table_struct *wait);
+__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
 			   size_t len, unsigned int flags);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 301f22430469..a127d61e8af9 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -712,7 +712,7 @@ static int __init tls_register(void)
 	build_protos(tls_prots[TLSV4], &tcp_prot);
 
 	tls_sw_proto_ops = inet_stream_ops;
-	tls_sw_proto_ops.poll = tls_sw_poll;
+	tls_sw_proto_ops.poll_mask = tls_sw_poll_mask;
 	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
 
 #ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 8ca57d01b18f..34895b7c132d 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -915,23 +915,22 @@ splice_read_end:
 	return copied ? : err;
 }
 
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
-			 struct poll_table_struct *wait)
+__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events)
 {
-	unsigned int ret;
 	struct sock *sk = sock->sk;
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	__poll_t mask;
 
-	/* Grab POLLOUT and POLLHUP from the underlying socket */
-	ret = ctx->sk_poll(file, sock, wait);
+	/* Grab EPOLLOUT and EPOLLHUP from the underlying socket */
+	mask = ctx->sk_poll_mask(sock, events);
 
-	/* Clear POLLIN bits, and set based on recv_pkt */
-	ret &= ~(POLLIN | POLLRDNORM);
+	/* Clear EPOLLIN bits, and set based on recv_pkt */
+	mask &= ~(EPOLLIN | EPOLLRDNORM);
 	if (ctx->recv_pkt)
-		ret |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
-	return ret;
+	return mask;
 }
 
 static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -1188,7 +1187,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		sk->sk_data_ready = tls_data_ready;
 		write_unlock_bh(&sk->sk_callback_lock);
 
-		sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
+		sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask;
 
 		strp_check_rcv(&sw_ctx_rx->strp);
 	}

From 3f2d67b6bd24c615cfe3a6d793613bb2838dd9b9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 11 Jun 2018 07:12:12 -0700
Subject: [PATCH 27/81] net/ipv6: Ensure cfg is properly initialized in
 ipv6_create_tempaddr

Valdis reported a BUG in ipv6_add_addr:

[ 1820.832682] BUG: unable to handle kernel NULL pointer dereference at 0000000000000209
[ 1820.832728] RIP: 0010:ipv6_add_addr+0x280/0xd10
[ 1820.832732] Code: 49 8b 1f 0f 84 6a 0a 00 00 48 85 db 0f 84 4e 0a 00 00 48 8b 03 48 8b 53 08 49 89 45 00 49 8b 47 10
49 89 55 08 48 85 c0 74 15 <48> 8b 50 08 48 8b 00 49 89 95 b8 01 00 00 49 89 85 b0 01 00 00 4c
[ 1820.832847] RSP: 0018:ffffaa07c2fd7880 EFLAGS: 00010202
[ 1820.832853] RAX: 0000000000000201 RBX: ffffaa07c2fd79b0 RCX: 0000000000000000
[ 1820.832858] RDX: a4cfbfba2cbfa64c RSI: 0000000000000000 RDI: ffffffff8a8e9fa0
[ 1820.832862] RBP: ffffaa07c2fd7920 R08: 000000000000017a R09: ffffffff8a555300
[ 1820.832866] R10: 0000000000000000 R11: 0000000000000000 R12: ffff888d18e71c00
[ 1820.832871] R13: ffff888d0a9b1200 R14: 0000000000000000 R15: ffffaa07c2fd7980
[ 1820.832876] FS:  00007faa51bdb800(0000) GS:ffff888d1d400000(0000) knlGS:0000000000000000
[ 1820.832880] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1820.832885] CR2: 0000000000000209 CR3: 000000021e8f8001 CR4: 00000000001606e0
[ 1820.832888] Call Trace:
[ 1820.832898]  ? __local_bh_enable_ip+0x119/0x260
[ 1820.832904]  ? ipv6_create_tempaddr+0x259/0x5a0
[ 1820.832912]  ? __local_bh_enable_ip+0x139/0x260
[ 1820.832921]  ipv6_create_tempaddr+0x2da/0x5a0
[ 1820.832926]  ? ipv6_create_tempaddr+0x2da/0x5a0
[ 1820.832941]  manage_tempaddrs+0x1a5/0x240
[ 1820.832951]  inet6_addr_del+0x20b/0x3b0
[ 1820.832959]  ? nla_parse+0xce/0x1e0
[ 1820.832968]  inet6_rtm_deladdr+0xd9/0x210
[ 1820.832981]  rtnetlink_rcv_msg+0x1d4/0x5f0

Looking at the code I found 1 element (peer_pfx) of the newly introduced
ifa6_config struct that is not initialized. Use a memset rather than hard
coding an init for each struct element.

Reported-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Fixes: e6464b8c63619 ("net/ipv6: Convert ipv6_add_addr to struct ifa6_config")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 89019bf59f46..c134286d6a41 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1324,6 +1324,7 @@ retry:
 		}
 	}
 
+	memset(&cfg, 0, sizeof(cfg));
 	cfg.valid_lft = min_t(__u32, ifp->valid_lft,
 			      idev->cnf.temp_valid_lft + age);
 	cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
@@ -1357,7 +1358,6 @@ retry:
 
 	cfg.pfx = &addr;
 	cfg.scope = ipv6_addr_scope(cfg.pfx);
-	cfg.rt_priority = 0;
 
 	ift = ipv6_add_addr(idev, &cfg, block, NULL);
 	if (IS_ERR(ift)) {

From 6892286e9c09925780fe2cb6db3585b56b71fe8e Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Mon, 11 Jun 2018 18:00:13 -0700
Subject: [PATCH 28/81] tcp: Do not reload skb pointer after skb_gro_receive().

This is not necessary.  skb_gro_receive() will never change what
'head' points to.

In it's original implementation (see commit 71d93b39e52e ("net: Add
skb_gro_receive")), it did:

====================
+	*head = nskb;
+	nskb->next = p->next;
+	p->next = NULL;
====================

This sequence was removed in commit 58025e46ea2d ("net: gro: remove
obsolete code from skb_gro_receive()")

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_offload.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 4d58e2ce0b5b..8cc7c3487330 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -268,8 +268,6 @@ found:
 		goto out_check_final;
 	}
 
-	p = *head;
-	th2 = tcp_hdr(p);
 	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
 
 out_check_final:

From 155fb5c5fae72d1faa2067d6fa0a5be12279c689 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Mon, 28 May 2018 18:14:49 +0900
Subject: [PATCH 29/81] netfilter: fix null-ptr-deref in nf_nat_decode_session

Add null check for nat_hook in nf_nat_decode_session()

[  195.648098] UBSAN: Undefined behaviour in ./include/linux/netfilter.h:348:14
[  195.651366] BUG: KASAN: null-ptr-deref in __xfrm_policy_check+0x208/0x1d70
[  195.653888] member access within null pointer of type 'struct nf_nat_hook'
[  195.653896] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.17.0-rc6+ #5
[  195.656320] Read of size 8 at addr 0000000000000008 by task ping/2469
[  195.658715] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[  195.658721] Call Trace:
[  195.661087]
[  195.669341]  <IRQ>
[  195.670574]  dump_stack+0xc6/0x150
[  195.672156]  ? dump_stack_print_info.cold.0+0x1b/0x1b
[  195.674121]  ? ubsan_prologue+0x31/0x92
[  195.676546]  ubsan_epilogue+0x9/0x49
[  195.678159]  handle_null_ptr_deref+0x11a/0x130
[  195.679800]  ? sprint_OID+0x1a0/0x1a0
[  195.681322]  __ubsan_handle_type_mismatch_v1+0xd5/0x11d
[  195.683146]  ? ubsan_prologue+0x92/0x92
[  195.684642]  __xfrm_policy_check+0x18ef/0x1d70
[  195.686294]  ? rt_cache_valid+0x118/0x180
[  195.687804]  ? __xfrm_route_forward+0x410/0x410
[  195.689463]  ? fib_multipath_hash+0x700/0x700
[  195.691109]  ? kvm_sched_clock_read+0x23/0x40
[  195.692805]  ? pvclock_clocksource_read+0xf6/0x280
[  195.694409]  ? graph_lock+0xa0/0xa0
[  195.695824]  ? pvclock_clocksource_read+0xf6/0x280
[  195.697508]  ? pvclock_read_flags+0x80/0x80
[  195.698981]  ? kvm_sched_clock_read+0x23/0x40
[  195.700347]  ? sched_clock+0x5/0x10
[  195.701525]  ? sched_clock_cpu+0x18/0x1a0
[  195.702846]  tcp_v4_rcv+0x1d32/0x1de0
[  195.704115]  ? lock_repin_lock+0x70/0x270
[  195.707072]  ? pvclock_read_flags+0x80/0x80
[  195.709302]  ? tcp_v4_early_demux+0x4b0/0x4b0
[  195.711833]  ? lock_acquire+0x195/0x380
[  195.714222]  ? ip_local_deliver_finish+0xfc/0x770
[  195.716967]  ? raw_rcv+0x2b0/0x2b0
[  195.718856]  ? lock_release+0xa00/0xa00
[  195.720938]  ip_local_deliver_finish+0x1b9/0x770
[...]

Fixes: 2c205dd3981f ("netfilter: add struct nf_nat_hook and use it")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 04551af2ff23..dd2052f0efb7 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -345,7 +345,7 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 
 	rcu_read_lock();
 	nat_hook = rcu_dereference(nf_nat_hook);
-	if (nat_hook->decode_session)
+	if (nat_hook && nat_hook->decode_session)
 		nat_hook->decode_session(skb, fl);
 	rcu_read_unlock();
 #endif

From 3fb61eca185cc65a1be23d9a5a11347eef79f597 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 7 Jun 2018 17:56:08 +0200
Subject: [PATCH 30/81] netfilter: nft_socket: fix module autoload

Add alias definition for module autoload when adding socket rules.

Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_socket.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index f28a0b944087..74e1b3bd6954 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -142,3 +142,4 @@ module_exit(nft_socket_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Máté Eckl");
 MODULE_DESCRIPTION("nf_tables socket match module");
+MODULE_ALIAS_NFT_EXPR("socket");

From 215a31f19dedd4e92a67cf5a9717ee898d012b3a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 11 Jun 2018 17:18:29 +0200
Subject: [PATCH 31/81] netfilter: nft_dynset: do not reject set updates with
 NFT_SET_EVAL

NFT_SET_EVAL is signalling the kernel that this sets can be updated from
the evaluation path, even if there are no expressions attached to the
element. Otherwise, set updates with no expressions fail. Update
description to describe the right semantics.

Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 2 +-
 net/netfilter/nft_dynset.c               | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index c9bf74b94f37..89438e68dc03 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -266,7 +266,7 @@ enum nft_rule_compat_attributes {
  * @NFT_SET_INTERVAL: set contains intervals
  * @NFT_SET_MAP: set is used as a dictionary
  * @NFT_SET_TIMEOUT: set uses timeouts
- * @NFT_SET_EVAL: set contains expressions for evaluation
+ * @NFT_SET_EVAL: set can be updated from the evaluation path
  * @NFT_SET_OBJECT: set contains stateful objects
  */
 enum nft_set_flags {
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 4d49529cff61..27d7e4598ab6 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -203,9 +203,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 				goto err1;
 			set->ops->gc_init(set);
 		}
-
-	} else if (set->flags & NFT_SET_EVAL)
-		return -EINVAL;
+	}
 
 	nft_set_ext_prepare(&priv->tmpl);
 	nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen);

From 71ad00c50d77e507138c792a9646b53c16f22e11 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Jun 2018 13:20:35 +0200
Subject: [PATCH 32/81] netfilter: nf_tables: fix module unload race

We must first remove the nfnetlink protocol handler when nf_tables module
is unloaded -- we don't want userspace to submit new change requests once
we've started to tear down nft state.

Furthermore, nfnetlink must not call any subsystem function after
call_batch returned -EAGAIN.

EAGAIN means the subsys mutex was dropped, so its unlikely but possible that
nf_tables subsystem was removed due to 'rmmod nf_tables' on another cpu.

Therefore, we must abort batch completely and not move on to next part of
the batch.

Last, we can't invoke ->abort unless we've checked that the subsystem is
still registered.

Change netns exit path of nf_tables to make sure any incompleted
transaction gets removed on exit.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 12 +++++++++---
 net/netfilter/nfnetlink.c     | 10 +++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7979095b69b0..ae312b31db28 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6439,7 +6439,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 	kfree(trans);
 }
 
-static int nf_tables_abort(struct net *net, struct sk_buff *skb)
+static int __nf_tables_abort(struct net *net)
 {
 	struct nft_trans *trans, *next;
 	struct nft_trans_elem *te;
@@ -6555,6 +6555,11 @@ static void nf_tables_cleanup(struct net *net)
 	nft_validate_state_update(net, NFT_VALIDATE_SKIP);
 }
 
+static int nf_tables_abort(struct net *net, struct sk_buff *skb)
+{
+	return __nf_tables_abort(net);
+}
+
 static bool nf_tables_valid_genid(struct net *net, u32 genid)
 {
 	return net->nft.base_seq == genid;
@@ -7149,9 +7154,10 @@ static int __net_init nf_tables_init_net(struct net *net)
 
 static void __net_exit nf_tables_exit_net(struct net *net)
 {
+	if (!list_empty(&net->nft.commit_list))
+		__nf_tables_abort(net);
 	__nft_release_tables(net);
 	WARN_ON_ONCE(!list_empty(&net->nft.tables));
-	WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
 }
 
 static struct pernet_operations nf_tables_net_ops = {
@@ -7193,9 +7199,9 @@ err1:
 
 static void __exit nf_tables_module_exit(void)
 {
-	unregister_pernet_subsys(&nf_tables_net_ops);
 	nfnetlink_subsys_unregister(&nf_tables_subsys);
 	unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
+	unregister_pernet_subsys(&nf_tables_net_ops);
 	rcu_barrier();
 	nf_tables_core_module_exit();
 	kfree(info);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 4d0da7042aff..e1b6be29848d 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -429,7 +429,7 @@ replay:
 			 */
 			if (err == -EAGAIN) {
 				status |= NFNL_BATCH_REPLAY;
-				goto next;
+				goto done;
 			}
 		}
 ack:
@@ -456,7 +456,7 @@ ack:
 			if (err)
 				status |= NFNL_BATCH_FAILURE;
 		}
-next:
+
 		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
 		if (msglen > skb->len)
 			msglen = skb->len;
@@ -464,7 +464,11 @@ next:
 	}
 done:
 	if (status & NFNL_BATCH_REPLAY) {
-		ss->abort(net, oskb);
+		const struct nfnetlink_subsystem *ss2;
+
+		ss2 = nfnl_dereference_protected(subsys_id);
+		if (ss2 == ss)
+			ss->abort(net, oskb);
 		nfnl_err_reset(&err_list);
 		nfnl_unlock(subsys_id);
 		kfree_skb(skb);

From 0a2cf5ee432c2e8718af3553a56a3760d767b736 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Jun 2018 13:20:36 +0200
Subject: [PATCH 33/81] netfilter: nf_tables: close race between netns exit and
 rmmod

If net namespace is exiting while nf_tables module is being removed
we can oops:

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000040
 IP: nf_tables_flowtable_event+0x43/0xf0 [nf_tables]
 PGD 0 P4D 0
 Oops: 0000 [#1] SMP PTI
 Modules linked in: nf_tables(-) nfnetlink [..]
  unregister_netdevice_notifier+0xdd/0x130
  nf_tables_module_exit+0x24/0x3a [nf_tables]
  SyS_delete_module+0x1c5/0x240
  do_syscall_64+0x74/0x190

Avoid this by attempting to take reference on the net namespace from
the notifiers.  If it fails the namespace is exiting already, and nft
core is taking care of cleanup work.

We also need to make sure the netdev hook type gets removed
before netns ops removal, else notifier might be invoked with device
event for a netns where net->nft was never initialised (because
pernet ops was removed beforehand).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c    | 13 ++++++++++---
 net/netfilter/nft_chain_filter.c |  5 +++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ae312b31db28..d23a5c269c44 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5837,18 +5837,23 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct nft_flowtable *flowtable;
 	struct nft_table *table;
+	struct net *net;
 
 	if (event != NETDEV_UNREGISTER)
 		return 0;
 
+	net = maybe_get_net(dev_net(dev));
+	if (!net)
+		return 0;
+
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_for_each_entry(table, &dev_net(dev)->nft.tables, list) {
+	list_for_each_entry(table, &net->nft.tables, list) {
 		list_for_each_entry(flowtable, &table->flowtables, list) {
 			nft_flowtable_event(event, dev, flowtable);
 		}
 	}
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-
+	put_net(net);
 	return NOTIFY_DONE;
 }
 
@@ -7154,9 +7159,11 @@ static int __net_init nf_tables_init_net(struct net *net)
 
 static void __net_exit nf_tables_exit_net(struct net *net)
 {
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
 	if (!list_empty(&net->nft.commit_list))
 		__nf_tables_abort(net);
 	__nft_release_tables(net);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 	WARN_ON_ONCE(!list_empty(&net->nft.tables));
 }
 
@@ -7201,11 +7208,11 @@ static void __exit nf_tables_module_exit(void)
 {
 	nfnetlink_subsys_unregister(&nf_tables_subsys);
 	unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
+	nft_chain_filter_fini();
 	unregister_pernet_subsys(&nf_tables_net_ops);
 	rcu_barrier();
 	nf_tables_core_module_exit();
 	kfree(info);
-	nft_chain_filter_fini();
 }
 
 module_init(nf_tables_module_init);
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index 84c902477a91..d21834bed805 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -318,6 +318,10 @@ static int nf_tables_netdev_event(struct notifier_block *this,
 	    event != NETDEV_CHANGENAME)
 		return NOTIFY_DONE;
 
+	ctx.net = maybe_get_net(ctx.net);
+	if (!ctx.net)
+		return NOTIFY_DONE;
+
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
 	list_for_each_entry(table, &ctx.net->nft.tables, list) {
 		if (table->family != NFPROTO_NETDEV)
@@ -334,6 +338,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
 		}
 	}
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	put_net(ctx.net);
 
 	return NOTIFY_DONE;
 }

From adc972c5b88829d38ede08b1069718661c7330ae Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 11 Jun 2018 22:16:33 +0900
Subject: [PATCH 34/81] netfilter: nf_tables: use WARN_ON_ONCE instead of
 BUG_ON in nft_do_chain()

When depth of chain is bigger than NFT_JUMP_STACK_SIZE, the nft_do_chain
crashes. But there is no need to crash hard here.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index deff10adef9c..8de912ca53d3 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -183,7 +183,8 @@ next_rule:
 
 	switch (regs.verdict.code) {
 	case NFT_JUMP:
-		BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
+		if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
+			return NF_DROP;
 		jumpstack[stackptr].chain = chain;
 		jumpstack[stackptr].rules = rules + 1;
 		stackptr++;

From c05a45c0865d986a8aea373cd5297dbfded6882e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Jun 2018 22:22:19 +0200
Subject: [PATCH 35/81] netfilter: ctnetlink: avoid null pointer dereference

Dan Carpenter points out that deref occurs after NULL check, we should
re-fetch the pointer and check that instead.

Fixes: 2c205dd3981f7 ("netfilter: add struct nf_nat_hook and use it")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 39327a42879f..20a2e37c76d1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1446,7 +1446,8 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 		}
 		nfnl_lock(NFNL_SUBSYS_CTNETLINK);
 		rcu_read_lock();
-		if (nat_hook->parse_nat_setup)
+		nat_hook = rcu_dereference(nf_nat_hook);
+		if (nat_hook)
 			return -EAGAIN;
 #endif
 		return -EOPNOTSUPP;

From fc6ddbecce440df74fb4491c17c372b52cf5be83 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 12 Jun 2018 18:36:19 +0200
Subject: [PATCH 36/81] netfilter: xt_connmark: fix list corruption on rmmod

This needs to use xt_unregister_targets, else new revision is left
on the list which then causes list to point to a target struct that has been free'd.

Fixes: 472a73e00757 ("netfilter: xt_conntrack: Support bit-shifting for CONNMARK & MARK targets.")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_connmark.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 94df000abb92..29c38aa7f726 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -211,7 +211,7 @@ static int __init connmark_mt_init(void)
 static void __exit connmark_mt_exit(void)
 {
 	xt_unregister_match(&connmark_mt_reg);
-	xt_unregister_target(connmark_tg_reg);
+	xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg));
 }
 
 module_init(connmark_mt_init);

From 21ba8847f857028dc83a0f341e16ecc616e34740 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Tue, 12 Jun 2018 10:51:34 -0700
Subject: [PATCH 37/81] netfilter: nf_conncount: Fix garbage collection with
 zones
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, we use check_hlist() for garbage colleciton. However, we
use the ‘zone’ from the counted entry to query the existence of
existing entries in the hlist. This could be wrong when they are in
different zones, and this patch fixes this issue.

Fixes: e59ea3df3fc2 ("netfilter: xt_connlimit: honor conntrack zone if available")
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h |  3 ++-
 net/netfilter/nf_conncount.c               | 13 +++++++++----
 net/netfilter/nft_connlimit.c              |  2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index 1910b6572430..3a188a0923a3 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -20,7 +20,8 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 				 bool *addit);
 
 bool nf_conncount_add(struct hlist_head *head,
-		      const struct nf_conntrack_tuple *tuple);
+		      const struct nf_conntrack_tuple *tuple,
+		      const struct nf_conntrack_zone *zone);
 
 void nf_conncount_cache_free(struct hlist_head *hhead);
 
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 3b5059a8dcdd..d8383609fe28 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -46,6 +46,7 @@
 struct nf_conncount_tuple {
 	struct hlist_node		node;
 	struct nf_conntrack_tuple	tuple;
+	struct nf_conntrack_zone	zone;
 };
 
 struct nf_conncount_rb {
@@ -80,7 +81,8 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
 }
 
 bool nf_conncount_add(struct hlist_head *head,
-		      const struct nf_conntrack_tuple *tuple)
+		      const struct nf_conntrack_tuple *tuple,
+		      const struct nf_conntrack_zone *zone)
 {
 	struct nf_conncount_tuple *conn;
 
@@ -88,6 +90,7 @@ bool nf_conncount_add(struct hlist_head *head,
 	if (conn == NULL)
 		return false;
 	conn->tuple = *tuple;
+	conn->zone = *zone;
 	hlist_add_head(&conn->node, head);
 	return true;
 }
@@ -108,7 +111,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 
 	/* check the saved connections */
 	hlist_for_each_entry_safe(conn, n, head, node) {
-		found = nf_conntrack_find_get(net, zone, &conn->tuple);
+		found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
 		if (found == NULL) {
 			hlist_del(&conn->node);
 			kmem_cache_free(conncount_conn_cachep, conn);
@@ -117,7 +120,8 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 
 		found_ct = nf_ct_tuplehash_to_ctrack(found);
 
-		if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple)) {
+		if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) &&
+		    nf_ct_zone_equal(found_ct, zone, zone->dir)) {
 			/*
 			 * Just to be sure we have it only once in the list.
 			 * We should not see tuples twice unless someone hooks
@@ -196,7 +200,7 @@ count_tree(struct net *net, struct rb_root *root,
 			if (!addit)
 				return count;
 
-			if (!nf_conncount_add(&rbconn->hhead, tuple))
+			if (!nf_conncount_add(&rbconn->hhead, tuple, zone))
 				return 0; /* hotdrop */
 
 			return count + 1;
@@ -238,6 +242,7 @@ count_tree(struct net *net, struct rb_root *root,
 	}
 
 	conn->tuple = *tuple;
+	conn->zone = *zone;
 	memcpy(rbconn->key, key, sizeof(u32) * keylen);
 
 	INIT_HLIST_HEAD(&rbconn->hhead);
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 50c068d660e5..a832c59f0a9c 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -52,7 +52,7 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
 	if (!addit)
 		goto out;
 
-	if (!nf_conncount_add(&priv->hhead, tuple_ptr)) {
+	if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) {
 		regs->verdict.code = NF_DROP;
 		spin_unlock_bh(&priv->lock);
 		return;

From cdb8744d80352b55c622d049a6c91f449cd291f8 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Tue, 12 Jun 2018 10:05:55 -0700
Subject: [PATCH 38/81] Revert "net: do not allow changing
 SO_REUSEADDR/SO_REUSEPORT on bound sockets"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Revert the patch mentioned in the subject because it breaks at least
the Avahi mDNS daemon. That patch namely causes the Ubuntu 18.04 Avahi
daemon to fail to start:

Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Successfully called chroot().
Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Successfully dropped remaining capabilities.
Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: No service file found in /etc/avahi/services.
Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: SO_REUSEADDR failed: Structure needs cleaning
Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: SO_REUSEADDR failed: Structure needs cleaning
Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Failed to create server: No suitable network protocol available
Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: avahi-daemon 0.7 exiting.
Jun 12 09:49:24 ubuntu-vm systemd[1]: avahi-daemon.service: Main process exited, code=exited, status=255/n/a
Jun 12 09:49:24 ubuntu-vm systemd[1]: avahi-daemon.service: Failed with result 'exit-code'.
Jun 12 09:49:24 ubuntu-vm systemd[1]: Failed to start Avahi mDNS/DNS-SD Stack.

Fixes: f396922d862a ("net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets")
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index f333d75ef1a9..bcc41829a16d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -728,22 +728,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			sock_valbool_flag(sk, SOCK_DBG, valbool);
 		break;
 	case SO_REUSEADDR:
-		val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
-		if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
-		    inet_sk(sk)->inet_num &&
-		    (sk->sk_reuse != val)) {
-			ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
-			break;
-		}
-		sk->sk_reuse = val;
+		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 		break;
 	case SO_REUSEPORT:
-		if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
-		    inet_sk(sk)->inet_num &&
-		    (sk->sk_reuseport != valbool)) {
-			ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
-			break;
-		}
 		sk->sk_reuseport = valbool;
 		break;
 	case SO_TYPE:

From f8d0efb112275444c03b76ee2376f0055d12aeba Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 11 Jun 2018 21:33:35 -0700
Subject: [PATCH 39/81] nfp: don't pad strings in nfp_cpp_resource_find() to
 avoid gcc 8 warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Once upon a time nfp_cpp_resource_find() took a name parameter,
which could be any user-chosen string.  Resources are identified
by a CRC32 hash of a 8 byte string, so we had to pad user input
with zeros to make sure CRC32 gave the correct result.

Since then nfp_cpp_resource_find() was made to operate on allocated
resources only (struct nfp_resource).  We kzalloc those so there is
no need to pad the strings and use memcmp.

This avoids a GCC 8 stringop-truncation warning:

In function ‘nfp_cpp_resource_find’,
    inlined from ‘nfp_resource_try_acquire’ at .../nfpcore/nfp_resource.c:153:8,
    inlined from ‘nfp_resource_acquire’ at .../nfpcore/nfp_resource.c:206:9:
    .../nfpcore/nfp_resource.c:108:2: warning:  strncpy’ output may be truncated copying 8 bytes from a string of length 8 [-Wstringop-truncation]
      strncpy(name_pad, res->name, sizeof(name_pad));
      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
index 2dd89dba9311..d32af598da90 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
@@ -98,21 +98,18 @@ struct nfp_resource {
 
 static int nfp_cpp_resource_find(struct nfp_cpp *cpp, struct nfp_resource *res)
 {
-	char name_pad[NFP_RESOURCE_ENTRY_NAME_SZ] = {};
 	struct nfp_resource_entry entry;
 	u32 cpp_id, key;
 	int ret, i;
 
 	cpp_id = NFP_CPP_ID(NFP_RESOURCE_TBL_TARGET, 3, 0);  /* Atomic read */
 
-	strncpy(name_pad, res->name, sizeof(name_pad));
-
 	/* Search for a matching entry */
-	if (!memcmp(name_pad, NFP_RESOURCE_TBL_NAME "\0\0\0\0\0\0\0\0", 8)) {
+	if (!strcmp(res->name, NFP_RESOURCE_TBL_NAME)) {
 		nfp_err(cpp, "Grabbing device lock not supported\n");
 		return -EOPNOTSUPP;
 	}
-	key = crc32_posix(name_pad, sizeof(name_pad));
+	key = crc32_posix(res->name, NFP_RESOURCE_ENTRY_NAME_SZ);
 
 	for (i = 0; i < NFP_RESOURCE_TBL_ENTRIES; i++) {
 		u64 addr = NFP_RESOURCE_TBL_BASE +

From 29f534c4bbfc1f66faec04575148be80def66c2b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 11 Jun 2018 21:33:36 -0700
Subject: [PATCH 40/81] nfp: include all ring counters in interface stats

We are gathering software statistics on per-ring basis.
.ndo_get_stats64 handler adds the rings up.  Unfortunately
we are currently only adding up active rings, which means
that if user decreases the number of active rings the
statistics from deactivated rings will no longer be counted
and total interface statistics may go backwards.

Always sum all possible rings, the stats are allocated
statically for max number of rings, so we don't have to
worry about them being removed.  We could add the stats
up when user changes the ring count, but it seems unnecessary..
Adding up inactive rings will be very quick since no datapath
will be touching them.

Fixes: 164d1e9e5d52 ("nfp: add support for ethtool .set_channels")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 75110c8d6a90..ed27176c2bce 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3121,7 +3121,7 @@ static void nfp_net_stat64(struct net_device *netdev,
 	struct nfp_net *nn = netdev_priv(netdev);
 	int r;
 
-	for (r = 0; r < nn->dp.num_r_vecs; r++) {
+	for (r = 0; r < nn->max_r_vecs; r++) {
 		struct nfp_net_r_vector *r_vec = &nn->r_vecs[r];
 		u64 data[3];
 		unsigned int start;

From fe06a64e0de718d59ae10263180aca02b84245d6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 11 Jun 2018 21:33:37 -0700
Subject: [PATCH 41/81] nfp: remove phys_port_name on flower's vNIC

.ndo_get_phys_port_name was recently extended to support multi-vNIC
FWs.  These are firmwares which can have more than one vNIC per PF
without associated port (e.g. Adaptive Buffer Management FW), therefore
we need a way of distinguishing the vNICs.  Unfortunately, it's too
late to make flower use the same naming.  Flower users may depend on
.ndo_get_phys_port_name returning -EOPNOTSUPP, for example the name
udev gave the PF vNIC was just the bare PCI device-based name before
the change, and will have 'nn0' appended after.

To ensure flower's vNIC doesn't have phys_port_name attribute, add
a flag to vNIC struct and set it in flower code.  New projects will
not set the flag adhere to the naming scheme from the start.

Fixes: 51c1df83e35c ("nfp: assign vNIC id as phys_port_name of vNICs which are not ports")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/main.c    | 1 +
 drivers/net/ethernet/netronome/nfp/nfp_net.h        | 4 ++++
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 19cfa162ac65..1decf3a1cad3 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -455,6 +455,7 @@ static int nfp_flower_vnic_alloc(struct nfp_app *app, struct nfp_net *nn,
 
 	eth_hw_addr_random(nn->dp.netdev);
 	netif_keep_dst(nn->dp.netdev);
+	nn->vnic_no_name = true;
 
 	return 0;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 57cb035dcc6d..2a71a9ffd095 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -590,6 +590,8 @@ struct nfp_net_dp {
  * @vnic_list:		Entry on device vNIC list
  * @pdev:		Backpointer to PCI device
  * @app:		APP handle if available
+ * @vnic_no_name:	For non-port PF vNIC make ndo_get_phys_port_name return
+ *			-EOPNOTSUPP to keep backwards compatibility (set by app)
  * @port:		Pointer to nfp_port structure if vNIC is a port
  * @app_priv:		APP private data for this vNIC
  */
@@ -663,6 +665,8 @@ struct nfp_net {
 	struct pci_dev *pdev;
 	struct nfp_app *app;
 
+	bool vnic_no_name;
+
 	struct nfp_port *port;
 
 	void *app_priv;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index ed27176c2bce..d4c27f849f9b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3286,7 +3286,7 @@ nfp_net_get_phys_port_name(struct net_device *netdev, char *name, size_t len)
 	if (nn->port)
 		return nfp_port_get_phys_port_name(netdev, name, len);
 
-	if (nn->dp.is_vf)
+	if (nn->dp.is_vf || nn->vnic_no_name)
 		return -EOPNOTSUPP;
 
 	n = snprintf(name, len, "n%d", nn->id);

From e62e51af3430745630f0cf76bb41a28d20c4ebdc Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Mon, 11 Jun 2018 21:33:38 -0700
Subject: [PATCH 42/81] nfp: flower: free dst_entry in route table

We need to release the refcnt on dst_entry in the route table, otherwise
we will leak the route.

Fixes: 8e6a9046b66a ("nfp: flower vxlan neighbour offload")
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Signed-off-by: Louis Peens <louis.peens@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
index ec524d97869d..78afe75129ab 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
@@ -381,6 +381,8 @@ nfp_tun_neigh_event_handler(struct notifier_block *nb, unsigned long event,
 	err = PTR_ERR_OR_ZERO(rt);
 	if (err)
 		return NOTIFY_DONE;
+
+	ip_rt_put(rt);
 #else
 	return NOTIFY_DONE;
 #endif

From 8cde8f0c0c03f9f7440f3d71a74d7cc35083f281 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 11 Jun 2018 12:44:54 -0700
Subject: [PATCH 43/81] hv_netvsc: drop common code until callback model fixed

The callback model of handling network failover is not suitable
in the current form.
  1. It was merged without addressing all the review feedback.
  2. It was merged without approval of any of the netvsc maintainers.
  3. Design discussion on how to handle PV/VF fallback is still
     not complete.
  4. IMHO the code model using callbacks is trying to make
     something common which isn't.

Revert the netvsc specific changes for now. Does not impact ongoing
development of failover model for virtio.
Revisit this after a simpler library based failover kernel
routines are extracted.

This reverts
commit 9c6ffbacdb57 ("hv_netvsc: fix error return code in netvsc_probe()")
and
commit 1ff78076d8dd ("netvsc: refactor notifier/event handling code to use the failover framework")

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/Kconfig      |   1 -
 drivers/net/hyperv/hyperv_net.h |   2 -
 drivers/net/hyperv/netvsc_drv.c | 224 +++++++++++++++++++++++---------
 3 files changed, 165 insertions(+), 62 deletions(-)

diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig
index 23a2d145813a..0765d5f61714 100644
--- a/drivers/net/hyperv/Kconfig
+++ b/drivers/net/hyperv/Kconfig
@@ -2,6 +2,5 @@ config HYPERV_NET
 	tristate "Microsoft Hyper-V virtual network driver"
 	depends on HYPERV
 	select UCS2_STRING
-	select FAILOVER
 	help
 	  Select this option to enable the Hyper-V virtual network driver.
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 23304aca25f9..9246e4562830 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -931,8 +931,6 @@ struct net_device_context {
 	u32 vf_alloc;
 	/* Serial number of the VF to team with */
 	u32 vf_serial;
-
-	struct failover *failover;
 };
 
 /* Per channel data */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 7b18a8c267c2..3ec79eb183ad 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -42,7 +42,6 @@
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
-#include <net/failover.h>
 
 #include "hyperv_net.h"
 
@@ -1780,6 +1779,46 @@ out_unlock:
 	rtnl_unlock();
 }
 
+static struct net_device *get_netvsc_bymac(const u8 *mac)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		if (dev->netdev_ops != &device_ops)
+			continue;	/* not a netvsc device */
+
+		if (ether_addr_equal(mac, dev->perm_addr))
+			return dev;
+	}
+
+	return NULL;
+}
+
+static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		struct net_device_context *net_device_ctx;
+
+		if (dev->netdev_ops != &device_ops)
+			continue;	/* not a netvsc device */
+
+		net_device_ctx = netdev_priv(dev);
+		if (!rtnl_dereference(net_device_ctx->nvdev))
+			continue;	/* device is removed */
+
+		if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
+			return dev;	/* a match */
+	}
+
+	return NULL;
+}
+
 /* Called when VF is injecting data into network stack.
  * Change the associated network device from VF to netvsc.
  * note: already called with rcu_read_lock
@@ -1802,6 +1841,46 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
 	return RX_HANDLER_ANOTHER;
 }
 
+static int netvsc_vf_join(struct net_device *vf_netdev,
+			  struct net_device *ndev)
+{
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
+	int ret;
+
+	ret = netdev_rx_handler_register(vf_netdev,
+					 netvsc_vf_handle_frame, ndev);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not register netvsc VF receive handler (err = %d)\n",
+			   ret);
+		goto rx_handler_failed;
+	}
+
+	ret = netdev_master_upper_dev_link(vf_netdev, ndev,
+					   NULL, NULL, NULL);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not set master device %s (err = %d)\n",
+			   ndev->name, ret);
+		goto upper_link_failed;
+	}
+
+	/* set slave flag before open to prevent IPv6 addrconf */
+	vf_netdev->flags |= IFF_SLAVE;
+
+	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+
+	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
+
+	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
+	return 0;
+
+upper_link_failed:
+	netdev_rx_handler_unregister(vf_netdev);
+rx_handler_failed:
+	return ret;
+}
+
 static void __netvsc_vf_setup(struct net_device *ndev,
 			      struct net_device *vf_netdev)
 {
@@ -1852,95 +1931,85 @@ static void netvsc_vf_setup(struct work_struct *w)
 	rtnl_unlock();
 }
 
-static int netvsc_pre_register_vf(struct net_device *vf_netdev,
-				  struct net_device *ndev)
+static int netvsc_register_vf(struct net_device *vf_netdev)
 {
+	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
 
+	if (vf_netdev->addr_len != ETH_ALEN)
+		return NOTIFY_DONE;
+
+	/*
+	 * We will use the MAC address to locate the synthetic interface to
+	 * associate with the VF interface. If we don't find a matching
+	 * synthetic interface, move on.
+	 */
+	ndev = get_netvsc_bymac(vf_netdev->perm_addr);
+	if (!ndev)
+		return NOTIFY_DONE;
+
 	net_device_ctx = netdev_priv(ndev);
 	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
-		return -ENODEV;
+		return NOTIFY_DONE;
 
-	return 0;
-}
+	if (netvsc_vf_join(vf_netdev, ndev) != 0)
+		return NOTIFY_DONE;
 
-static int netvsc_register_vf(struct net_device *vf_netdev,
-			      struct net_device *ndev)
-{
-	struct net_device_context *ndev_ctx = netdev_priv(ndev);
-
-	/* set slave flag before open to prevent IPv6 addrconf */
-	vf_netdev->flags |= IFF_SLAVE;
-
-	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
-
-	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
-
-	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
+	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
 
 	dev_hold(vf_netdev);
-	rcu_assign_pointer(ndev_ctx->vf_netdev, vf_netdev);
-
-	return 0;
+	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
+	return NOTIFY_OK;
 }
 
 /* VF up/down change detected, schedule to change data path */
-static int netvsc_vf_changed(struct net_device *vf_netdev,
-			     struct net_device *ndev)
+static int netvsc_vf_changed(struct net_device *vf_netdev)
 {
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
+	struct net_device *ndev;
 	bool vf_is_up = netif_running(vf_netdev);
 
+	ndev = get_netvsc_byref(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
+
 	net_device_ctx = netdev_priv(ndev);
 	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 	if (!netvsc_dev)
-		return -ENODEV;
+		return NOTIFY_DONE;
 
 	netvsc_switch_datapath(ndev, vf_is_up);
 	netdev_info(ndev, "Data path switched %s VF: %s\n",
 		    vf_is_up ? "to" : "from", vf_netdev->name);
 
-	return 0;
+	return NOTIFY_OK;
 }
 
-static int netvsc_pre_unregister_vf(struct net_device *vf_netdev,
-				    struct net_device *ndev)
+static int netvsc_unregister_vf(struct net_device *vf_netdev)
 {
+	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 
+	ndev = get_netvsc_byref(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
+
 	net_device_ctx = netdev_priv(ndev);
 	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
 
-	return 0;
-}
-
-static int netvsc_unregister_vf(struct net_device *vf_netdev,
-				struct net_device *ndev)
-{
-	struct net_device_context *net_device_ctx;
-
-	net_device_ctx = netdev_priv(ndev);
-
 	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
 
+	netdev_rx_handler_unregister(vf_netdev);
+	netdev_upper_dev_unlink(vf_netdev, ndev);
 	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
 	dev_put(vf_netdev);
 
-	return 0;
+	return NOTIFY_OK;
 }
 
-static struct failover_ops netvsc_failover_ops = {
-	.slave_pre_register	= netvsc_pre_register_vf,
-	.slave_register		= netvsc_register_vf,
-	.slave_pre_unregister	= netvsc_pre_unregister_vf,
-	.slave_unregister	= netvsc_unregister_vf,
-	.slave_link_change	= netvsc_vf_changed,
-	.slave_handle_frame	= netvsc_vf_handle_frame,
-};
-
 static int netvsc_probe(struct hv_device *dev,
 			const struct hv_vmbus_device_id *dev_id)
 {
@@ -2030,16 +2099,8 @@ static int netvsc_probe(struct hv_device *dev,
 		goto register_failed;
 	}
 
-	net_device_ctx->failover = failover_register(net, &netvsc_failover_ops);
-	if (IS_ERR(net_device_ctx->failover)) {
-		ret = PTR_ERR(net_device_ctx->failover);
-		goto err_failover;
-	}
-
 	return ret;
 
-err_failover:
-	unregister_netdev(net);
 register_failed:
 	rndis_filter_device_remove(dev, nvdev);
 rndis_failed:
@@ -2080,15 +2141,13 @@ static int netvsc_remove(struct hv_device *dev)
 	rtnl_lock();
 	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
 	if (vf_netdev)
-		failover_slave_unregister(vf_netdev);
+		netvsc_unregister_vf(vf_netdev);
 
 	if (nvdev)
 		rndis_filter_device_remove(dev, nvdev);
 
 	unregister_netdevice(net);
 
-	failover_unregister(ndev_ctx->failover);
-
 	rtnl_unlock();
 	rcu_read_unlock();
 
@@ -2115,8 +2174,54 @@ static struct  hv_driver netvsc_drv = {
 	.remove = netvsc_remove,
 };
 
+/*
+ * On Hyper-V, every VF interface is matched with a corresponding
+ * synthetic interface. The synthetic interface is presented first
+ * to the guest. When the corresponding VF instance is registered,
+ * we will take care of switching the data path.
+ */
+static int netvsc_netdev_event(struct notifier_block *this,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	/* Skip our own events */
+	if (event_dev->netdev_ops == &device_ops)
+		return NOTIFY_DONE;
+
+	/* Avoid non-Ethernet type devices */
+	if (event_dev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	/* Avoid Vlan dev with same MAC registering as VF */
+	if (is_vlan_dev(event_dev))
+		return NOTIFY_DONE;
+
+	/* Avoid Bonding master dev with same MAC registering as VF */
+	if ((event_dev->priv_flags & IFF_BONDING) &&
+	    (event_dev->flags & IFF_MASTER))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		return netvsc_register_vf(event_dev);
+	case NETDEV_UNREGISTER:
+		return netvsc_unregister_vf(event_dev);
+	case NETDEV_UP:
+	case NETDEV_DOWN:
+		return netvsc_vf_changed(event_dev);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block netvsc_netdev_notifier = {
+	.notifier_call = netvsc_netdev_event,
+};
+
 static void __exit netvsc_drv_exit(void)
 {
+	unregister_netdevice_notifier(&netvsc_netdev_notifier);
 	vmbus_driver_unregister(&netvsc_drv);
 }
 
@@ -2135,6 +2240,7 @@ static int __init netvsc_drv_init(void)
 	if (ret)
 		return ret;
 
+	register_netdevice_notifier(&netvsc_netdev_notifier);
 	return 0;
 }
 

From 7bf7bb37f16a80465ee3bd7c6c966f96f5a075a6 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 11 Jun 2018 12:44:55 -0700
Subject: [PATCH 44/81] hv_netvsc: fix network namespace issues with VF support

When finding the parent netvsc device, the search needs to be across
all netvsc device instances (independent of network namespace).

Find parent device of VF using upper_dev_get routine which
searches only adjacent list.

Fixes: e8ff40d4bff1 ("hv_netvsc: improve VF device matching")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>

netns aware byref
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h |  2 ++
 drivers/net/hyperv/netvsc_drv.c | 43 +++++++++++++++------------------
 2 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 9246e4562830..d31c0cd329a1 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -901,6 +901,8 @@ struct net_device_context {
 	struct hv_device *device_ctx;
 	/* netvsc_device */
 	struct netvsc_device __rcu *nvdev;
+	/* list of netvsc net_devices */
+	struct list_head list;
 	/* reconfigure work */
 	struct delayed_work dwork;
 	/* last reconfig time */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 3ec79eb183ad..309696b5cd14 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -67,6 +67,8 @@ static int debug = -1;
 module_param(debug, int, 0444);
 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
 
+static LIST_HEAD(netvsc_dev_list);
+
 static void netvsc_change_rx_flags(struct net_device *net, int change)
 {
 	struct net_device_context *ndev_ctx = netdev_priv(net);
@@ -1781,13 +1783,10 @@ out_unlock:
 
 static struct net_device *get_netvsc_bymac(const u8 *mac)
 {
-	struct net_device *dev;
+	struct net_device_context *ndev_ctx;
 
-	ASSERT_RTNL();
-
-	for_each_netdev(&init_net, dev) {
-		if (dev->netdev_ops != &device_ops)
-			continue;	/* not a netvsc device */
+	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
+		struct net_device *dev = hv_get_drvdata(ndev_ctx->device_ctx);
 
 		if (ether_addr_equal(mac, dev->perm_addr))
 			return dev;
@@ -1798,25 +1797,18 @@ static struct net_device *get_netvsc_bymac(const u8 *mac)
 
 static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
 {
+	struct net_device_context *net_device_ctx;
 	struct net_device *dev;
 
-	ASSERT_RTNL();
+	dev = netdev_master_upper_dev_get(vf_netdev);
+	if (!dev || dev->netdev_ops != &device_ops)
+		return NULL;	/* not a netvsc device */
 
-	for_each_netdev(&init_net, dev) {
-		struct net_device_context *net_device_ctx;
+	net_device_ctx = netdev_priv(dev);
+	if (!rtnl_dereference(net_device_ctx->nvdev))
+		return NULL;	/* device is removed */
 
-		if (dev->netdev_ops != &device_ops)
-			continue;	/* not a netvsc device */
-
-		net_device_ctx = netdev_priv(dev);
-		if (!rtnl_dereference(net_device_ctx->nvdev))
-			continue;	/* device is removed */
-
-		if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
-			return dev;	/* a match */
-	}
-
-	return NULL;
+	return dev;
 }
 
 /* Called when VF is injecting data into network stack.
@@ -2093,15 +2085,19 @@ static int netvsc_probe(struct hv_device *dev,
 	else
 		net->max_mtu = ETH_DATA_LEN;
 
-	ret = register_netdev(net);
+	rtnl_lock();
+	ret = register_netdevice(net);
 	if (ret != 0) {
 		pr_err("Unable to register netdev.\n");
 		goto register_failed;
 	}
 
-	return ret;
+	list_add(&net_device_ctx->list, &netvsc_dev_list);
+	rtnl_unlock();
+	return 0;
 
 register_failed:
+	rtnl_unlock();
 	rndis_filter_device_remove(dev, nvdev);
 rndis_failed:
 	free_percpu(net_device_ctx->vf_stats);
@@ -2147,6 +2143,7 @@ static int netvsc_remove(struct hv_device *dev)
 		rndis_filter_device_remove(dev, nvdev);
 
 	unregister_netdevice(net);
+	list_del(&ndev_ctx->list);
 
 	rtnl_unlock();
 	rcu_read_unlock();

From c0a41b887ce614279c51964509e8d715979ce1f2 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 11 Jun 2018 12:44:56 -0700
Subject: [PATCH 45/81] hv_netvsc: move VF to same namespace as netvsc device

When VF is added, the paravirtual device is already present
and may have been moved to another network namespace. For example,
sometimes the management interface is put in another net namespace
in some environments.

The VF should get moved to where the netvsc device is when the
VF is discovered. The user can move it later (if desired).

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 309696b5cd14..fe2256bf1d13 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1928,6 +1928,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
+	int ret;
 
 	if (vf_netdev->addr_len != ETH_ALEN)
 		return NOTIFY_DONE;
@@ -1946,11 +1947,29 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
 		return NOTIFY_DONE;
 
-	if (netvsc_vf_join(vf_netdev, ndev) != 0)
+	/* if syntihetic interface is a different namespace,
+	 * then move the VF to that namespace; join will be
+	 * done again in that context.
+	 */
+	if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
+		ret = dev_change_net_namespace(vf_netdev,
+					       dev_net(ndev), "eth%d");
+		if (ret)
+			netdev_err(vf_netdev,
+				   "could not move to same namespace as %s: %d\n",
+				   ndev->name, ret);
+		else
+			netdev_info(vf_netdev,
+				    "VF moved to namespace with: %s\n",
+				    ndev->name);
 		return NOTIFY_DONE;
+	}
 
 	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
 
+	if (netvsc_vf_join(vf_netdev, ndev) != 0)
+		return NOTIFY_DONE;
+
 	dev_hold(vf_netdev);
 	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
 	return NOTIFY_OK;

From 909f1edc49953dbc0bc0512e1300691b9c2f432d Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 11 Jun 2018 13:19:03 +0200
Subject: [PATCH 46/81] net: phy: mdio-gpio: Cut surplus includes

The GPIO MDIO driver now needs only <linux/gpio/consumer.h>
so cut the legacy <linux/gpio.h> and <linux/of_gpio.h>
includes that are no longer used.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 4e4c8daf44c3..33265747bf39 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -26,10 +26,7 @@
 #include <linux/platform_device.h>
 #include <linux/mdio-bitbang.h>
 #include <linux/mdio-gpio.h>
-#include <linux/gpio.h>
 #include <linux/gpio/consumer.h>
-
-#include <linux/of_gpio.h>
 #include <linux/of_mdio.h>
 
 struct mdio_gpio_info {

From 469998c861faaa9f228701557fe8454f75a12e5c Mon Sep 17 00:00:00 2001
From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
Date: Fri, 8 Jun 2018 02:27:59 -0700
Subject: [PATCH 47/81] net: thunderx: prevent concurrent data re-writing by
 nicvf_set_rx_mode

For each network interface linux network stack issue ndo_set_rx_mode call
in order to configure MAC address filters (e.g. for multicast filtering).
Currently ThunderX NICVF driver has only one ordered workqueue to process
such requests for all VFs.

And because of that it is possible that subsequent call to
ndo_set_rx_mode would corrupt data which is currently in use
by nicvf_set_rx_mode_task. Which in turn could cause following issue:
[...]
[   48.978341] Unable to handle kernel paging request at virtual address 1fffff0000000000
[   48.986275] Mem abort info:
[   48.989058]   Exception class = DABT (current EL), IL = 32 bits
[   48.994965]   SET = 0, FnV = 0
[   48.998020]   EA = 0, S1PTW = 0
[   49.001152] Data abort info:
[   49.004022]   ISV = 0, ISS = 0x00000004
[   49.007869]   CM = 0, WnR = 0
[   49.010826] [1fffff0000000000] address between user and kernel address ranges
[   49.017963] Internal error: Oops: 96000004 [#1] SMP
[...]
[   49.072138] task: ffff800fdd675400 task.stack: ffff000026440000
[   49.078051] PC is at prefetch_freepointer.isra.37+0x28/0x3c
[   49.083613] LR is at kmem_cache_alloc_trace+0xc8/0x1fc
[...]
[   49.272684] [<ffff0000082738f0>] prefetch_freepointer.isra.37+0x28/0x3c
[   49.279286] [<ffff000008276bc8>] kmem_cache_alloc_trace+0xc8/0x1fc
[   49.285455] [<ffff0000082c0c0c>] alloc_fdtable+0x78/0x134
[   49.290841] [<ffff0000082c15c0>] dup_fd+0x254/0x2f4
[   49.295709] [<ffff0000080d1954>] copy_process.isra.38.part.39+0x64c/0x1168
[   49.302572] [<ffff0000080d264c>] _do_fork+0xfc/0x3b0
[   49.307524] [<ffff0000080d29e8>] SyS_clone+0x44/0x50
[...]

This patch is to prevent such concurrent data write with spinlock.

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/thunder/nic.h     |  2 +
 .../net/ethernet/cavium/thunder/nicvf_main.c  | 50 +++++++++++++------
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 448d1fafc827..f4d81765221e 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -325,6 +325,8 @@ struct nicvf {
 	struct tasklet_struct	qs_err_task;
 	struct work_struct	reset_task;
 	struct nicvf_work       rx_mode_work;
+	/* spinlock to protect workqueue arguments from concurrent access */
+	spinlock_t              rx_mode_wq_lock;
 
 	/* PTP timestamp */
 	struct cavium_ptp	*ptp_clock;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 7135db45927e..135766c4296b 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1923,17 +1923,12 @@ static int nicvf_ioctl(struct net_device *netdev, struct ifreq *req, int cmd)
 	}
 }
 
-static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
+static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs,
+				     struct nicvf *nic)
 {
-	struct nicvf_work *vf_work = container_of(work_arg, struct nicvf_work,
-						  work.work);
-	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
 	union nic_mbx mbx = {};
 	int idx;
 
-	if (!vf_work)
-		return;
-
 	/* From the inside of VM code flow we have only 128 bits memory
 	 * available to send message to host's PF, so send all mc addrs
 	 * one by one, starting from flush command in case if kernel
@@ -1944,7 +1939,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
 	mbx.xcast.msg = NIC_MBOX_MSG_RESET_XCAST;
 	nicvf_send_msg_to_pf(nic, &mbx);
 
-	if (vf_work->mode & BGX_XCAST_MCAST_FILTER) {
+	if (mode & BGX_XCAST_MCAST_FILTER) {
 		/* once enabling filtering, we need to signal to PF to add
 		 * its' own LMAC to the filter to accept packets for it.
 		 */
@@ -1954,23 +1949,46 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
 	}
 
 	/* check if we have any specific MACs to be added to PF DMAC filter */
-	if (vf_work->mc) {
+	if (mc_addrs) {
 		/* now go through kernel list of MACs and add them one by one */
-		for (idx = 0; idx < vf_work->mc->count; idx++) {
+		for (idx = 0; idx < mc_addrs->count; idx++) {
 			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
-			mbx.xcast.data.mac = vf_work->mc->mc[idx];
+			mbx.xcast.data.mac = mc_addrs->mc[idx];
 			nicvf_send_msg_to_pf(nic, &mbx);
 		}
-		kfree(vf_work->mc);
+		kfree(mc_addrs);
 	}
 
 	/* and finally set rx mode for PF accordingly */
 	mbx.xcast.msg = NIC_MBOX_MSG_SET_XCAST;
-	mbx.xcast.data.mode = vf_work->mode;
+	mbx.xcast.data.mode = mode;
 
 	nicvf_send_msg_to_pf(nic, &mbx);
 }
 
+static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
+{
+	struct nicvf_work *vf_work = container_of(work_arg, struct nicvf_work,
+						  work.work);
+	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
+	u8 mode;
+	struct xcast_addr_list *mc;
+
+	if (!vf_work)
+		return;
+
+	/* Save message data locally to prevent them from
+	 * being overwritten by next ndo_set_rx_mode call().
+	 */
+	spin_lock(&nic->rx_mode_wq_lock);
+	mode = vf_work->mode;
+	mc = vf_work->mc;
+	vf_work->mc = NULL;
+	spin_unlock(&nic->rx_mode_wq_lock);
+
+	__nicvf_set_rx_mode_task(mode, mc, nic);
+}
+
 static void nicvf_set_rx_mode(struct net_device *netdev)
 {
 	struct nicvf *nic = netdev_priv(netdev);
@@ -2004,9 +2022,12 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
 			}
 		}
 	}
+	spin_lock(&nic->rx_mode_wq_lock);
+	kfree(nic->rx_mode_work.mc);
 	nic->rx_mode_work.mc = mc_list;
 	nic->rx_mode_work.mode = mode;
-	queue_delayed_work(nicvf_rx_mode_wq, &nic->rx_mode_work.work, 2 * HZ);
+	queue_delayed_work(nicvf_rx_mode_wq, &nic->rx_mode_work.work, 0);
+	spin_unlock(&nic->rx_mode_wq_lock);
 }
 
 static const struct net_device_ops nicvf_netdev_ops = {
@@ -2163,6 +2184,7 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	INIT_WORK(&nic->reset_task, nicvf_reset_task);
 
 	INIT_DELAYED_WORK(&nic->rx_mode_work.work, nicvf_set_rx_mode_task);
+	spin_lock_init(&nic->rx_mode_wq_lock);
 
 	err = register_netdev(netdev);
 	if (err) {

From 31962c8c78b3cb480e28120a20b45811b76e207d Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 11 Jun 2018 16:02:36 +0200
Subject: [PATCH 48/81] tc-testing: ife: fix wrong teardown command in test
 b7b8

fix failures in the 'teardown' stage of test b7b8, probably a leftover of
commit 7c5995b33d6e ("tc-testing: fixed copy-pasting error in ife tests")

Fixes: a56e6bcd34b55 ("tc-testing: updated ife test cases")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/tc-testing/tc-tests/actions/ife.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
index de97e4ff705c..637ea0219617 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
@@ -568,7 +568,7 @@
         "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*use tcindex 65535.*index 1",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {

From 760a6ed6b6f29c48f97ff5a94ba0dbc639a2e677 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Mon, 11 Jun 2018 19:52:27 +0200
Subject: [PATCH 49/81] net: stmmac: dwmac-meson8b: Fix an error handling path
 in 'meson8b_dwmac_probe()'

If 'of_device_get_match_data()' fails, we need to release some resources as
done in the other error handling path of this function.

Fixes: efacb568c962 ("net: stmmac: dwmac-meson: extend phy mode setting")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
index 4ff231df7322..c5979569fd60 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
@@ -334,9 +334,10 @@ static int meson8b_dwmac_probe(struct platform_device *pdev)
 
 	dwmac->data = (const struct meson8b_dwmac_data *)
 		of_device_get_match_data(&pdev->dev);
-	if (!dwmac->data)
-		return -EINVAL;
-
+	if (!dwmac->data) {
+		ret = -EINVAL;
+		goto err_remove_config_dt;
+	}
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
 	dwmac->regs = devm_ioremap_resource(&pdev->dev, res);
 	if (IS_ERR(dwmac->regs)) {

From c0129a0614428e5e4350fa963eecd1fbe19e57e9 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 11 Jun 2018 14:07:14 -0700
Subject: [PATCH 50/81] smc: convert to ->poll_mask

smc->clcsock is an internal TCP socket, after TCP socket
converts to ->poll_mask, ->poll doesn't exist any more.
So just convert smc socket to ->poll_mask too.

Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask")
Reported-by: syzbot+f5066e369b2d5fff630f@syzkaller.appspotmail.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 973b4471b532..da7f02edcd37 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1273,8 +1273,7 @@ static __poll_t smc_accept_poll(struct sock *parent)
 	return mask;
 }
 
-static __poll_t smc_poll(struct file *file, struct socket *sock,
-			     poll_table *wait)
+static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
@@ -1290,7 +1289,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
 		release_sock(sk);
-		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+		mask = smc->clcsock->ops->poll_mask(smc->clcsock, events);
 		lock_sock(sk);
 		sk->sk_err = smc->clcsock->sk->sk_err;
 		if (sk->sk_err) {
@@ -1308,11 +1307,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 			}
 		}
 	} else {
-		if (sk->sk_state != SMC_CLOSED) {
-			release_sock(sk);
-			sock_poll_wait(file, sk_sleep(sk), wait);
-			lock_sock(sk);
-		}
 		if (sk->sk_err)
 			mask |= EPOLLERR;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
@@ -1625,7 +1619,7 @@ static const struct proto_ops smc_sock_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= smc_accept,
 	.getname	= smc_getname,
-	.poll		= smc_poll,
+	.poll_mask	= smc_poll_mask,
 	.ioctl		= smc_ioctl,
 	.listen		= smc_listen,
 	.shutdown	= smc_shutdown,

From 57f230ab04d2910a06d17d988f1c4d7586a59113 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Tue, 12 Jun 2018 08:57:53 +0200
Subject: [PATCH 51/81] xen/netfront: raise max number of slots in
 xennet_get_responses()

The max number of slots used in xennet_get_responses() is set to
MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD).

In old kernel-xen MAX_SKB_FRAGS was 18, while nowadays it is 17. This
difference is resulting in frequent messages "too many slots" and a
reduced network throughput for some workloads (factor 10 below that of
a kernel-xen based guest).

Replacing MAX_SKB_FRAGS by XEN_NETIF_NR_SLOTS_MIN for calculation of
the max number of slots to use solves that problem (tests showed no
more messages "too many slots" and throughput was as high as with the
kernel-xen based guest system).

Replace MAX_SKB_FRAGS-2 by XEN_NETIF_NR_SLOTS_MIN-1 in
netfront_tx_slot_available() for making it clearer what is really being
tested without actually modifying the tested value.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netfront.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 679da1abd73c..922ce0abf5cf 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -239,7 +239,7 @@ static void rx_refill_timeout(struct timer_list *t)
 static int netfront_tx_slot_available(struct netfront_queue *queue)
 {
 	return (queue->tx.req_prod_pvt - queue->tx.rsp_cons) <
-		(NET_TX_RING_SIZE - MAX_SKB_FRAGS - 2);
+		(NET_TX_RING_SIZE - XEN_NETIF_NR_SLOTS_MIN - 1);
 }
 
 static void xennet_maybe_wake_tx(struct netfront_queue *queue)
@@ -790,7 +790,7 @@ static int xennet_get_responses(struct netfront_queue *queue,
 	RING_IDX cons = queue->rx.rsp_cons;
 	struct sk_buff *skb = xennet_get_rx_skb(queue, cons);
 	grant_ref_t ref = xennet_get_rx_ref(queue, cons);
-	int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
+	int max = XEN_NETIF_NR_SLOTS_MIN + (rx->status <= RX_COPY_THRESHOLD);
 	int slots = 1;
 	int err = 0;
 	unsigned long ret;

From 995191220056300c51ed870a5d5321f91f3eef89 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 14 Jun 2018 07:37:02 +0800
Subject: [PATCH 52/81] sctp: define sctp_packet_gso_append to build GSO frames

Now sctp GSO uses skb_gro_receive() to append the data into head
skb frag_list. However it actually only needs very few code from
skb_gro_receive(). Besides, NAPI_GRO_CB has to be set while most
of its members are not needed here.

This patch is to add sctp_packet_gso_append() to build GSO frames
instead of skb_gro_receive(), and it would avoid many unnecessary
checks and make the code clearer.

Note that sctp will use page frags instead of frag_list to build
GSO frames in another patch. But it may take time, as sctp's GSO
frames may have different size. skb_segment() can only split it
into the frags with the same size, which would break the border
of sctp chunks.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  5 +++++
 net/sctp/output.c          | 28 ++++++++++++++++++----------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index ebf809eed33a..dbe1b911a24d 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1133,6 +1133,11 @@ struct sctp_input_cb {
 };
 #define SCTP_INPUT_CB(__skb)	((struct sctp_input_cb *)&((__skb)->cb[0]))
 
+struct sctp_output_cb {
+	struct sk_buff *last;
+};
+#define SCTP_OUTPUT_CB(__skb)	((struct sctp_output_cb *)&((__skb)->cb[0]))
+
 static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb)
 {
 	const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index e672dee302c7..7f849b01ec8e 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -409,6 +409,21 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
 	refcount_inc(&sk->sk_wmem_alloc);
 }
 
+static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb)
+{
+	if (SCTP_OUTPUT_CB(head)->last == head)
+		skb_shinfo(head)->frag_list = skb;
+	else
+		SCTP_OUTPUT_CB(head)->last->next = skb;
+	SCTP_OUTPUT_CB(head)->last = skb;
+
+	head->truesize += skb->truesize;
+	head->data_len += skb->len;
+	head->len += skb->len;
+
+	__skb_header_release(skb);
+}
+
 static int sctp_packet_pack(struct sctp_packet *packet,
 			    struct sk_buff *head, int gso, gfp_t gfp)
 {
@@ -422,7 +437,7 @@ static int sctp_packet_pack(struct sctp_packet *packet,
 
 	if (gso) {
 		skb_shinfo(head)->gso_type = sk->sk_gso_type;
-		NAPI_GRO_CB(head)->last = head;
+		SCTP_OUTPUT_CB(head)->last = head;
 	} else {
 		nskb = head;
 		pkt_size = packet->size;
@@ -503,15 +518,8 @@ merge:
 					 &packet->chunk_list);
 		}
 
-		if (gso) {
-			if (skb_gro_receive(&head, nskb)) {
-				kfree_skb(nskb);
-				return 0;
-			}
-			if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
-					 sk->sk_gso_max_segs))
-				return 0;
-		}
+		if (gso)
+			sctp_packet_gso_append(head, nskb);
 
 		pkt_count++;
 	} while (!list_empty(&packet->chunk_list));

From bdf767cae3dddcb50a9ca09d01bb79df3e384f7b Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Mon, 11 Jun 2018 21:03:45 +0800
Subject: [PATCH 53/81] net: qcom/emac: Add missing of_node_put()

Add missing of_node_put() call for device node returned by
of_parse_phandle().

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Timur Tabi <timur@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
index e78e5db39458..c694e3428dfc 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
@@ -384,6 +384,7 @@ int emac_sgmii_config(struct platform_device *pdev, struct emac_adapter *adpt)
 		}
 
 		sgmii_pdev = of_find_device_by_node(np);
+		of_node_put(np);
 		if (!sgmii_pdev) {
 			dev_err(&pdev->dev, "invalid internal-phy property\n");
 			return -ENODEV;

From 4fd44a98ffe0d048246efef67ed640fdf2098a62 Mon Sep 17 00:00:00 2001
From: Frank van der Linden <fllinden@amazon.com>
Date: Tue, 12 Jun 2018 23:09:37 +0000
Subject: [PATCH 54/81] tcp: verify the checksum of the first data segment in a
 new connection

commit 079096f103fa ("tcp/dccp: install syn_recv requests into ehash
table") introduced an optimization for the handling of child sockets
created for a new TCP connection.

But this optimization passes any data associated with the last ACK of the
connection handshake up the stack without verifying its checksum, because it
calls tcp_child_process(), which in turn calls tcp_rcv_state_process()
directly.  These lower-level processing functions do not do any checksum
verification.

Insert a tcp_checksum_complete call in the TCP_NEW_SYN_RECEIVE path to
fix this.

Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
Signed-off-by: Frank van der Linden <fllinden@amazon.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 4 ++++
 net/ipv6/tcp_ipv6.c | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fed3f1c66167..bea17f1e8302 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1730,6 +1730,10 @@ process:
 			reqsk_put(req);
 			goto discard_it;
 		}
+		if (tcp_checksum_complete(skb)) {
+			reqsk_put(req);
+			goto csum_error;
+		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
 			inet_csk_reqsk_queue_drop_and_put(sk, req);
 			goto lookup;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b620d9b72e59..7efa9fd7e109 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1479,6 +1479,10 @@ process:
 			reqsk_put(req);
 			goto discard_it;
 		}
+		if (tcp_checksum_complete(skb)) {
+			reqsk_put(req);
+			goto csum_error;
+		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
 			inet_csk_reqsk_queue_drop_and_put(sk, req);
 			goto lookup;

From 087fca595a0a30804fd7896e77ba11aa46e5d708 Mon Sep 17 00:00:00 2001
From: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Date: Wed, 13 Jun 2018 12:05:16 +0530
Subject: [PATCH 55/81] net: emaclite: Fix position of lp->mii_bus assignment

To ensure MDIO bus is not double freed in remove() path
assign lp->mii_bus after MDIO bus registration.

Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/xilinx_emaclite.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
index 69e31ceccfae..37989ce543ba 100644
--- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c
+++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
@@ -863,14 +863,14 @@ static int xemaclite_mdio_setup(struct net_local *lp, struct device *dev)
 	bus->write = xemaclite_mdio_write;
 	bus->parent = dev;
 
-	lp->mii_bus = bus;
-
 	rc = of_mdiobus_register(bus, np);
 	if (rc) {
 		dev_err(dev, "Failed to register mdio bus.\n");
 		goto err_register;
 	}
 
+	lp->mii_bus = bus;
+
 	return 0;
 
 err_register:

From 27cad008406600822ab638980412ceea740e7fc8 Mon Sep 17 00:00:00 2001
From: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Date: Wed, 13 Jun 2018 12:05:17 +0530
Subject: [PATCH 56/81] net: emaclite: Fix MDIO bus unregister bug

Since 'has_mdio' flag is not used,sequence insmod->rmmod-> insmod
leads to failure as MDIO unregister doesn't happen in .remove().
Fix it by checking MII bus pointer instead.

Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/xilinx_emaclite.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
index 37989ce543ba..06eb6c886388 100644
--- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c
+++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
@@ -1191,7 +1191,7 @@ static int xemaclite_of_remove(struct platform_device *of_dev)
 	struct net_local *lp = netdev_priv(ndev);
 
 	/* Un-register the mii_bus, if configured */
-	if (lp->has_mdio) {
+	if (lp->mii_bus) {
 		mdiobus_unregister(lp->mii_bus);
 		mdiobus_free(lp->mii_bus);
 		lp->mii_bus = NULL;

From bd45cbf5451dcbba16c19aafd6dd99bc3e1e9644 Mon Sep 17 00:00:00 2001
From: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Date: Wed, 13 Jun 2018 12:05:18 +0530
Subject: [PATCH 57/81] net: emaclite: Remove unused 'has_mdio' flag.

Remove unused 'has_mdio' flag.

Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/xilinx_emaclite.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
index 06eb6c886388..ec4608e8ab1b 100644
--- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c
+++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
@@ -123,7 +123,6 @@
  * @phy_node:		pointer to the PHY device node
  * @mii_bus:		pointer to the MII bus
  * @last_link:		last link status
- * @has_mdio:		indicates whether MDIO is included in the HW
  */
 struct net_local {
 
@@ -144,7 +143,6 @@ struct net_local {
 	struct mii_bus *mii_bus;
 
 	int last_link;
-	bool has_mdio;
 };
 
 

From 560c5bddba72cc4fd9b77731b64b7937fde3b340 Mon Sep 17 00:00:00 2001
From: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Date: Wed, 13 Jun 2018 12:05:19 +0530
Subject: [PATCH 58/81] net: emaclite: Remove xemaclite_mdio_setup return check

Errors are already reported in xemaclite_mdio_setup so avoid
reporting it again.

Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/xilinx_emaclite.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
index ec4608e8ab1b..2a0c06e0f730 100644
--- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c
+++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
@@ -1143,9 +1143,7 @@ static int xemaclite_of_probe(struct platform_device *ofdev)
 	xemaclite_update_address(lp, ndev->dev_addr);
 
 	lp->phy_node = of_parse_phandle(ofdev->dev.of_node, "phy-handle", 0);
-	rc = xemaclite_mdio_setup(lp, &ofdev->dev);
-	if (rc)
-		dev_warn(&ofdev->dev, "error registering MDIO bus\n");
+	xemaclite_mdio_setup(lp, &ofdev->dev);
 
 	dev_info(dev, "MAC address is now %pM\n", ndev->dev_addr);
 

From 90904ff5f958a215cc3d26f957a46e80fa178470 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Wed, 13 Jun 2018 15:09:18 +0200
Subject: [PATCH 59/81] l2tp: fix pseudo-wire type for sessions created by
 pppol2tp_connect()

Define cfg.pw_type so that the new session is created with its .pwtype
field properly set (L2TP_PWTYPE_PPP).

Not setting the pseudo-wire type had several annoying effects:

  * Invalid value returned in the L2TP_ATTR_PW_TYPE attribute when
    dumping sessions with the netlink API.

  * Impossibility to delete the session using the netlink API (because
    l2tp_nl_cmd_session_delete() gets the deletion callback function
    from an array indexed by the session's pseudo-wire type).

Also, there are several cases where we should check a session's
pseudo-wire type. For example, pppol2tp_connect() should refuse to
connect a session that is not PPPoL2TP, but that requires the session's
.pwtype field to be properly set.

Fixes: f7faffa3ff8e ("l2tp: Add L2TPv3 protocol support")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index b56cb1df4fc0..270a0a999eaf 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -751,6 +751,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 		/* Default MTU must allow space for UDP/L2TP/PPP headers */
 		cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
 		cfg.mru = cfg.mtu;
+		cfg.pw_type = L2TP_PWTYPE_PPP;
 
 		session = l2tp_session_create(sizeof(struct pppol2tp_session),
 					      tunnel, session_id,

From 7ac6ab1f8a38ba7f8d97f95475bb6a2575db4658 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Wed, 13 Jun 2018 15:09:19 +0200
Subject: [PATCH 60/81] l2tp: only accept PPP sessions in pppol2tp_connect()

l2tp_session_priv() returns a struct pppol2tp_session pointer only for
PPPoL2TP sessions. In particular, if the session is an L2TP_PWTYPE_ETH
pseudo-wire, l2tp_session_priv() returns a pointer to an l2tp_eth_sess
structure, which is much smaller than struct pppol2tp_session. This
leads to invalid memory dereference when trying to lock ps->sk_lock.

Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 270a0a999eaf..8b3b6947a07d 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -734,6 +734,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 	session = l2tp_session_get(sock_net(sk), tunnel, session_id);
 	if (session) {
 		drop_refcnt = true;
+
+		if (session->pwtype != L2TP_PWTYPE_PPP) {
+			error = -EPROTOTYPE;
+			goto end;
+		}
+
 		ps = l2tp_session_priv(session);
 
 		/* Using a pre-existing session is fine as long as it hasn't

From 3e1bc8bf974e2d4e7beb842a4c801c2542eff3bd Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Wed, 13 Jun 2018 15:09:20 +0200
Subject: [PATCH 61/81] l2tp: prevent pppol2tp_connect() from creating kernel
 sockets

If 'fd' is negative, l2tp_tunnel_create() creates a tunnel socket using
the configuration passed in 'tcfg'. Currently, pppol2tp_connect() sets
the relevant fields to zero, tricking l2tp_tunnel_create() into setting
up an unusable kernel socket.

We can't set 'tcfg' with the required fields because there's no way to
get them from the current connect() parameters. So let's restrict
kernel sockets creation to the netlink API, which is the original use
case.

Fixes: 789a4a2c61d8 ("l2tp: Add support for static unmanaged L2TPv3 tunnels")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 8b3b6947a07d..1b24f76ae210 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -701,6 +701,15 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 				.encap = L2TP_ENCAPTYPE_UDP,
 				.debug = 0,
 			};
+
+			/* Prevent l2tp_tunnel_register() from trying to set up
+			 * a kernel socket.
+			 */
+			if (fd < 0) {
+				error = -EBADF;
+				goto end;
+			}
+
 			error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel);
 			if (error < 0)
 				goto end;

From bda06be2158c7aa7e41b15500c4d3840369c19a6 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Wed, 13 Jun 2018 15:09:21 +0200
Subject: [PATCH 62/81] l2tp: clean up stale tunnel or session in
 pppol2tp_connect's error path

pppol2tp_connect() may create a tunnel or a session. Remove them in
case of error.

Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 1b24f76ae210..f429fed06a1e 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -612,6 +612,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 	u32 session_id, peer_session_id;
 	bool drop_refcnt = false;
 	bool drop_tunnel = false;
+	bool new_session = false;
+	bool new_tunnel = false;
 	int ver = 2;
 	int fd;
 
@@ -722,6 +724,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 				goto end;
 			}
 			drop_tunnel = true;
+			new_tunnel = true;
 		}
 	} else {
 		/* Error if we can't find the tunnel */
@@ -788,6 +791,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 			goto end;
 		}
 		drop_refcnt = true;
+		new_session = true;
 	}
 
 	/* Special case: if source & dest session_id == 0x0000, this
@@ -834,6 +838,12 @@ out_no_ppp:
 		  session->name);
 
 end:
+	if (error) {
+		if (new_session)
+			l2tp_session_delete(session);
+		if (new_tunnel)
+			l2tp_tunnel_delete(tunnel);
+	}
 	if (drop_refcnt)
 		l2tp_session_dec_refcount(session);
 	if (drop_tunnel)

From f1693c63ab133d16994cc50f773982b5905af264 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Date: Thu, 14 Jun 2018 11:52:34 -0700
Subject: [PATCH 63/81] rds: avoid unenecessary cong_update in loop transport

Loop transport which is self loopback, remote port congestion
update isn't relevant. Infact the xmit path already ignores it.
Receive path needs to do the same.

Reported-by: syzbot+4c20b3866171ce8441d2@syzkaller.appspotmail.com
Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/loop.c | 1 +
 net/rds/rds.h  | 5 +++++
 net/rds/recv.c | 5 +++++
 3 files changed, 11 insertions(+)

diff --git a/net/rds/loop.c b/net/rds/loop.c
index f2bf78de5688..dac6218a460e 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -193,4 +193,5 @@ struct rds_transport rds_loop_transport = {
 	.inc_copy_to_user	= rds_message_inc_copy_to_user,
 	.inc_free		= rds_loop_inc_free,
 	.t_name			= "loopback",
+	.t_type			= RDS_TRANS_LOOP,
 };
diff --git a/net/rds/rds.h b/net/rds/rds.h
index b04c333d9d1c..f2272fb8cd45 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -479,6 +479,11 @@ struct rds_notifier {
 	int			n_status;
 };
 
+/* Available as part of RDS core, so doesn't need to participate
+ * in get_preferred transport etc
+ */
+#define	RDS_TRANS_LOOP	3
+
 /**
  * struct rds_transport -  transport specific behavioural hooks
  *
diff --git a/net/rds/recv.c b/net/rds/recv.c
index dc67458b52f0..192ac6f78ded 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -103,6 +103,11 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
 		rds_stats_add(s_recv_bytes_added_to_socket, delta);
 	else
 		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
+
+	/* loop transport doesn't send/recv congestion updates */
+	if (rs->rs_transport->t_type == RDS_TRANS_LOOP)
+		return;
+
 	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
 
 	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "

From 06bdf2803cae82c66c666b932f21b7c01d7b2ef9 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Thu, 14 Jun 2018 18:29:09 -0700
Subject: [PATCH 64/81] hv_netvsc: Fix the variable sizes in ipsecv2 and rsc
 offload

These fields in struct ndis_ipsecv2_offload and struct ndis_rsc_offload
are one byte according to the specs. This patch defines them with the
right size. These structs are not in use right now, but will be used soon.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index d31c0cd329a1..1a924b867b07 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -1277,17 +1277,17 @@ struct ndis_lsov2_offload {
 
 struct ndis_ipsecv2_offload {
 	u32	encap;
-	u16	ip6;
-	u16	ip4opt;
-	u16	ip6ext;
-	u16	ah;
-	u16	esp;
-	u16	ah_esp;
-	u16	xport;
-	u16	tun;
-	u16	xport_tun;
-	u16	lso;
-	u16	extseq;
+	u8	ip6;
+	u8	ip4opt;
+	u8	ip6ext;
+	u8	ah;
+	u8	esp;
+	u8	ah_esp;
+	u8	xport;
+	u8	tun;
+	u8	xport_tun;
+	u8	lso;
+	u8	extseq;
 	u32	udp_esp;
 	u32	auth;
 	u32	crypto;
@@ -1295,8 +1295,8 @@ struct ndis_ipsecv2_offload {
 };
 
 struct ndis_rsc_offload {
-	u16	ip4;
-	u16	ip6;
+	u8	ip4;
+	u8	ip6;
 };
 
 struct ndis_encap_offload {

From badbc27df3a934e0025be238754f9ca6a852c006 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Fri, 8 Jun 2018 10:04:47 +0300
Subject: [PATCH 65/81] nl80211: fix some kernel doc tag mistakes

There is a bunch of tags marking constants with &, which means struct
or enum name.  Replace them with %, which is the correct tag for
constants.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/uapi/linux/nl80211.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 28b36545de24..27e4e441caac 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -981,18 +981,18 @@
  *	only the %NL80211_ATTR_IE data is used and updated with this command.
  *
  * @NL80211_CMD_SET_PMK: For offloaded 4-Way handshake, set the PMK or PMK-R0
- *	for the given authenticator address (specified with &NL80211_ATTR_MAC).
- *	When &NL80211_ATTR_PMKR0_NAME is set, &NL80211_ATTR_PMK specifies the
+ *	for the given authenticator address (specified with %NL80211_ATTR_MAC).
+ *	When %NL80211_ATTR_PMKR0_NAME is set, %NL80211_ATTR_PMK specifies the
  *	PMK-R0, otherwise it specifies the PMK.
  * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously
  *	configured PMK for the authenticator address identified by
- *	&NL80211_ATTR_MAC.
+ *	%NL80211_ATTR_MAC.
  * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way
  *	handshake was completed successfully by the driver. The BSSID is
- *	specified with &NL80211_ATTR_MAC. Drivers that support 4 way handshake
+ *	specified with %NL80211_ATTR_MAC. Drivers that support 4 way handshake
  *	offload should send this event after indicating 802.11 association with
- *	&NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed
- *	&NL80211_CMD_DISCONNECT should be indicated instead.
+ *	%NL80211_CMD_CONNECT or %NL80211_CMD_ROAM. If the 4 way handshake failed
+ *	%NL80211_CMD_DISCONNECT should be indicated instead.
  *
  * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request
  *	and RX notification.  This command is used both as a request to transmit
@@ -1029,9 +1029,9 @@
  *	initiated the connection through the connect request.
  *
  * @NL80211_CMD_STA_OPMODE_CHANGED: An event that notify station's
- *	ht opmode or vht opmode changes using any of &NL80211_ATTR_SMPS_MODE,
- *	&NL80211_ATTR_CHANNEL_WIDTH,&NL80211_ATTR_NSS attributes with its
- *	address(specified in &NL80211_ATTR_MAC).
+ *	ht opmode or vht opmode changes using any of %NL80211_ATTR_SMPS_MODE,
+ *	%NL80211_ATTR_CHANNEL_WIDTH,%NL80211_ATTR_NSS attributes with its
+ *	address(specified in %NL80211_ATTR_MAC).
  *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
@@ -2218,7 +2218,7 @@ enum nl80211_commands {
  * @NL80211_ATTR_EXTERNAL_AUTH_ACTION: Identify the requested external
  *     authentication operation (u32 attribute with an
  *     &enum nl80211_external_auth_action value). This is used with the
- *     &NL80211_CMD_EXTERNAL_AUTH request event.
+ *     %NL80211_CMD_EXTERNAL_AUTH request event.
  * @NL80211_ATTR_EXTERNAL_AUTH_SUPPORT: Flag attribute indicating that the user
  *     space supports external authentication. This attribute shall be used
  *     only with %NL80211_CMD_CONNECT request. The driver may offload
@@ -3491,7 +3491,7 @@ enum nl80211_sched_scan_match_attr {
  * @NL80211_RRF_AUTO_BW: maximum available bandwidth should be calculated
  *	base on contiguous rules and wider channels will be allowed to cross
  *	multiple contiguous/overlapping frequency ranges.
- * @NL80211_RRF_IR_CONCURRENT: See &NL80211_FREQUENCY_ATTR_IR_CONCURRENT
+ * @NL80211_RRF_IR_CONCURRENT: See %NL80211_FREQUENCY_ATTR_IR_CONCURRENT
  * @NL80211_RRF_NO_HT40MINUS: channels can't be used in HT40- operation
  * @NL80211_RRF_NO_HT40PLUS: channels can't be used in HT40+ operation
  * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed
@@ -5643,11 +5643,11 @@ enum nl80211_nan_func_attributes {
  * @NL80211_NAN_SRF_INCLUDE: present if the include bit of the SRF set.
  *	This is a flag.
  * @NL80211_NAN_SRF_BF: Bloom Filter. Present if and only if
- *	&NL80211_NAN_SRF_MAC_ADDRS isn't present. This attribute is binary.
+ *	%NL80211_NAN_SRF_MAC_ADDRS isn't present. This attribute is binary.
  * @NL80211_NAN_SRF_BF_IDX: index of the Bloom Filter. Mandatory if
- *	&NL80211_NAN_SRF_BF is present. This is a u8.
+ *	%NL80211_NAN_SRF_BF is present. This is a u8.
  * @NL80211_NAN_SRF_MAC_ADDRS: list of MAC addresses for the SRF. Present if
- *	and only if &NL80211_NAN_SRF_BF isn't present. This is a nested
+ *	and only if %NL80211_NAN_SRF_BF isn't present. This is a nested
  *	attribute. Each nested attribute is a MAC address.
  * @NUM_NL80211_NAN_SRF_ATTR: internal
  * @NL80211_NAN_SRF_ATTR_MAX: highest NAN SRF attribute

From 3c12d0486856b9eb89c2a9ac336713cba90813e3 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Wed, 6 Jun 2018 10:53:55 +0200
Subject: [PATCH 66/81] cfg80211: initialize sinfo in cfg80211_get_station

Most of the implementations behind cfg80211_get_station will not initialize
sinfo to zero before manipulating it. For example, the member "filled",
which indicates the filled in parts of this struct, is often only modified
by enabling certain bits in the bitfield while keeping the remaining bits
in their original state. A caller without a preinitialized sinfo.filled can
then no longer decide which parts of sinfo were filled in by
cfg80211_get_station (or actually the underlying implementations).

cfg80211_get_station must therefore take care that sinfo is initialized to
zero. Otherwise, the caller may tries to read information which was not
filled in and which must therefore also be considered uninitialized. In
batadv_v_elp_get_throughput's case, an invalid "random" expected throughput
may be stored for this neighbor and thus the B.A.T.M.A.N V algorithm may
switch to non-optimal neighbors for certain destinations.

Fixes: 7406353d43c8 ("cfg80211: implement cfg80211_get_station cfg80211 API")
Reported-by: Thomas Lauer <holminateur@gmail.com>
Reported-by: Marcel Schmidt <ff.z-casparistrasse@mailbox.org>
Cc: b.a.t.m.a.n@lists.open-mesh.org
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/wireless/util.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/wireless/util.c b/net/wireless/util.c
index b5bb1c309914..3c654cd7ba56 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1746,6 +1746,8 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
 	if (!rdev->ops->get_station)
 		return -EOPNOTSUPP;
 
+	memset(sinfo, 0, sizeof(*sinfo));
+
 	return rdev_get_station(rdev, dev, mac_addr, sinfo);
 }
 EXPORT_SYMBOL(cfg80211_get_station);

From 3f61b7a30a6a8fd917d7570cb00a26a054d84ab4 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 29 May 2018 12:04:51 +0200
Subject: [PATCH 67/81] mac80211_hwsim: fix module init error paths

We didn't free the workqueue on any errors, nor did we
correctly check for rhashtable allocation errors, nor
did we free the hashtable on error.

Reported-by: Colin King <colin.king@canonical.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 drivers/net/wireless/mac80211_hwsim.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 9825bfd42abc..18e819d964f1 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -3572,11 +3572,14 @@ static int __init init_mac80211_hwsim(void)
 	hwsim_wq = alloc_workqueue("hwsim_wq", 0, 0);
 	if (!hwsim_wq)
 		return -ENOMEM;
-	rhashtable_init(&hwsim_radios_rht, &hwsim_rht_params);
+
+	err = rhashtable_init(&hwsim_radios_rht, &hwsim_rht_params);
+	if (err)
+		goto out_free_wq;
 
 	err = register_pernet_device(&hwsim_net_ops);
 	if (err)
-		return err;
+		goto out_free_rht;
 
 	err = platform_driver_register(&mac80211_hwsim_driver);
 	if (err)
@@ -3701,6 +3704,10 @@ out_unregister_driver:
 	platform_driver_unregister(&mac80211_hwsim_driver);
 out_unregister_pernet:
 	unregister_pernet_device(&hwsim_net_ops);
+out_free_rht:
+	rhashtable_destroy(&hwsim_radios_rht);
+out_free_wq:
+	destroy_workqueue(hwsim_wq);
 	return err;
 }
 module_init(init_mac80211_hwsim);

From dc8b274f0952f604d72b10698cde6887321a669f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@toke.dk>
Date: Fri, 25 May 2018 14:29:21 +0200
Subject: [PATCH 68/81] mac80211: Move up init of TXQs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On init, ieee80211_if_add() dumps the interface. Since that now includes a
dump of the TXQ state, we need to initialise that before the dump happens.
So move up the TXQ initialisation to to before the call to
ieee80211_if_add().

Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace")
Reported-by: Niklas Cassel <niklas.cassel@linaro.org>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Tested-by: Niklas Cassel <niklas.cassel@linaro.org>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/mac80211/main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 4d2e797e3f16..722f3d9fb416 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1098,6 +1098,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 
 	ieee80211_led_init(local);
 
+	result = ieee80211_txq_setup_flows(local);
+	if (result)
+		goto fail_flows;
+
 	rtnl_lock();
 
 	result = ieee80211_init_rate_ctrl_alg(local,
@@ -1120,10 +1124,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 
 	rtnl_unlock();
 
-	result = ieee80211_txq_setup_flows(local);
-	if (result)
-		goto fail_flows;
-
 #ifdef CONFIG_INET
 	local->ifa_notifier.notifier_call = ieee80211_ifa_changed;
 	result = register_inetaddr_notifier(&local->ifa_notifier);
@@ -1149,8 +1149,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 #if defined(CONFIG_INET) || defined(CONFIG_IPV6)
  fail_ifa:
 #endif
-	ieee80211_txq_teardown_flows(local);
- fail_flows:
 	rtnl_lock();
 	rate_control_deinitialize(local);
 	ieee80211_remove_interfaces(local);
@@ -1158,6 +1156,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	rtnl_unlock();
 	ieee80211_led_exit(local);
 	ieee80211_wep_free(local);
+	ieee80211_txq_teardown_flows(local);
+ fail_flows:
 	destroy_workqueue(local->workqueue);
  fail_workqueue:
 	wiphy_unregister(local->hw.wiphy);

From bf2b61a6838f19cbc33f6732715012c483fa3795 Mon Sep 17 00:00:00 2001
From: Dedy Lansky <dlansky@codeaurora.org>
Date: Fri, 15 Jun 2018 13:05:01 +0200
Subject: [PATCH 69/81] cfg80211: fix rcu in cfg80211_unregister_wdev

Callers of cfg80211_unregister_wdev can free the wdev object
immediately after this function returns. This may crash the kernel
because this wdev object is still in use by other threads.
Add synchronize_rcu() after list_del_rcu to make sure wdev object can
be safely freed.

Signed-off-by: Dedy Lansky <dlansky@codeaurora.org>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 net/wireless/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/wireless/core.c b/net/wireless/core.c
index 5fe35aafdd9c..48e8097339ab 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1012,6 +1012,7 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev)
 	nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE);
 
 	list_del_rcu(&wdev->list);
+	synchronize_rcu();
 	rdev->devlist_generation++;
 
 	switch (wdev->iftype) {

From ab188e8f4aad9845589ed050bde9514550a23ea5 Mon Sep 17 00:00:00 2001
From: Elad Nachman <eladv6@gmail.com>
Date: Fri, 15 Jun 2018 09:57:39 +0300
Subject: [PATCH 70/81] stmmac: added support for 802.1ad vlan stripping

stmmac reception handler calls stmmac_rx_vlan() to strip the vlan before
calling napi_gro_receive().

The function assumes VLAN tagged frames are always tagged with
802.1Q protocol, and assigns ETH_P_8021Q to the skb by hard-coding
the parameter on call to __vlan_hwaccel_put_tag() .

This causes packets not to be passed to the VLAN slave if it was created
with 802.1AD protocol
(ip link add link eth0 eth0.100 type vlan proto 802.1ad id 100).

This fix passes the protocol from the VLAN header into
__vlan_hwaccel_put_tag() instead of using the hard-coded value of
ETH_P_8021Q.

NETIF_F_HW_VLAN_STAG_RX check was added and the strip action is now
dependent on the correct combination of features and the detected vlan tag.

NETIF_F_HW_VLAN_STAG_RX feature was added to be in line with the driver
actual abilities.

Signed-off-by: Elad Nachman <eladn@gilat.com>
Reviewed-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/stmicro/stmmac/stmmac_main.c | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 11fb7c777d89..5e6d4fe2f4ef 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3182,17 +3182,22 @@ dma_map_err:
 
 static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
 {
-	struct ethhdr *ehdr;
+	struct vlan_ethhdr *veth;
+	__be16 vlan_proto;
 	u16 vlanid;
 
-	if ((dev->features & NETIF_F_HW_VLAN_CTAG_RX) ==
-	    NETIF_F_HW_VLAN_CTAG_RX &&
-	    !__vlan_get_tag(skb, &vlanid)) {
+	veth = (struct vlan_ethhdr *)skb->data;
+	vlan_proto = veth->h_vlan_proto;
+
+	if ((vlan_proto == htons(ETH_P_8021Q) &&
+	     dev->features & NETIF_F_HW_VLAN_CTAG_RX) ||
+	    (vlan_proto == htons(ETH_P_8021AD) &&
+	     dev->features & NETIF_F_HW_VLAN_STAG_RX)) {
 		/* pop the vlan tag */
-		ehdr = (struct ethhdr *)skb->data;
-		memmove(skb->data + VLAN_HLEN, ehdr, ETH_ALEN * 2);
+		vlanid = ntohs(veth->h_vlan_TCI);
+		memmove(skb->data + VLAN_HLEN, veth, ETH_ALEN * 2);
 		skb_pull(skb, VLAN_HLEN);
-		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
+		__vlan_hwaccel_put_tag(skb, vlan_proto, vlanid);
 	}
 }
 
@@ -4235,7 +4240,7 @@ int stmmac_dvr_probe(struct device *device,
 	ndev->watchdog_timeo = msecs_to_jiffies(watchdog);
 #ifdef STMMAC_VLAN_TAG_USED
 	/* Both mac100 and gmac support receive VLAN tag detection */
-	ndev->features |= NETIF_F_HW_VLAN_CTAG_RX;
+	ndev->features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX;
 #endif
 	priv->msg_enable = netif_msg_init(debug, default_msg_level);
 

From 6eba08c3626bca42b3cb0c9d43ac37ab11b4be1a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 15 Jun 2018 16:23:35 +0300
Subject: [PATCH 71/81] ipv6: Only emit append events for appended routes

Current code will emit an append event in the FIB notification chain for
any route added with NLM_F_APPEND set, even if the route was not
appended to any existing route.

This is inconsistent with IPv4 where such an event is only emitted when
the new route is appended after an existing one.

Align IPv6 behavior with IPv4, thereby allowing listeners to more easily
handle these events.

Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 7aa4c41a3bd9..39d1d487eca2 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -934,6 +934,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 {
 	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
+	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
 	struct fib6_info *iter = NULL, *match = NULL;
 	struct fib6_info __rcu **ins;
 	int replace = (info->nlh &&
@@ -1013,6 +1014,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 				       "Can not append to a REJECT route");
 			return -EINVAL;
 		}
+		event = FIB_EVENT_ENTRY_APPEND;
 		rt->fib6_nsiblings = match->fib6_nsiblings;
 		list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
 		match->fib6_nsiblings++;
@@ -1034,15 +1036,12 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 	 *	insert node
 	 */
 	if (!replace) {
-		enum fib_event_type event;
-
 		if (!add)
 			pr_warn("NLM_F_CREATE should be set when creating new route\n");
 
 add:
 		nlflags |= NLM_F_CREATE;
 
-		event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
 		err = call_fib6_entry_notifiers(info->nl_net, event, rt,
 						extack);
 		if (err)

From 53b562df8c203e189fc69f7af0d8668e8dec5a8a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 15 Jun 2018 16:23:36 +0300
Subject: [PATCH 72/81] mlxsw: spectrum_router: Allow appending to dev-only
 routes

Commit f34436a43092 ("net/ipv6: Simplify route replace and appending
into multipath route") changed the IPv6 route append logic so that
dev-only routes can be appended and not only gatewayed routes.

Align mlxsw with the new behaviour.

Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/mellanox/mlxsw/spectrum_router.c | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 77b2adb29341..c8956ab224ea 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -4771,11 +4771,11 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
 
 static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
-				 const struct fib6_info *nrt, bool replace)
+				 const struct fib6_info *nrt, bool append)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 
-	if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace)
+	if (!append)
 		return NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
@@ -4790,8 +4790,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			break;
 		if (rt->fib6_metric < nrt->fib6_metric)
 			continue;
-		if (rt->fib6_metric == nrt->fib6_metric &&
-		    mlxsw_sp_fib6_rt_can_mp(rt))
+		if (rt->fib6_metric == nrt->fib6_metric)
 			return fib6_entry;
 		if (rt->fib6_metric > nrt->fib6_metric)
 			break;
@@ -5316,7 +5315,8 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp,
 }
 
 static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
-				    struct fib6_info *rt, bool replace)
+				    struct fib6_info *rt, bool replace,
+				    bool append)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 	struct mlxsw_sp_fib_node *fib_node;
@@ -5342,7 +5342,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 	/* Before creating a new entry, try to append route to an existing
 	 * multipath entry.
 	 */
-	fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace);
+	fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, append);
 	if (fib6_entry) {
 		err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt);
 		if (err)
@@ -5350,6 +5350,14 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 		return 0;
 	}
 
+	/* We received an append event, yet did not find any route to
+	 * append to.
+	 */
+	if (WARN_ON(append)) {
+		err = -EINVAL;
+		goto err_fib6_entry_append;
+	}
+
 	fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt);
 	if (IS_ERR(fib6_entry)) {
 		err = PTR_ERR(fib6_entry);
@@ -5367,6 +5375,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 err_fib6_node_entry_link:
 	mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry);
 err_fib6_entry_create:
+err_fib6_entry_append:
 err_fib6_entry_nexthop_add:
 	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
 	return err;
@@ -5717,7 +5726,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 	struct mlxsw_sp_fib_event_work *fib_work =
 		container_of(work, struct mlxsw_sp_fib_event_work, work);
 	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
-	bool replace;
+	bool replace, append;
 	int err;
 
 	rtnl_lock();
@@ -5728,8 +5737,10 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 	case FIB_EVENT_ENTRY_APPEND: /* fall through */
 	case FIB_EVENT_ENTRY_ADD:
 		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
+		append = fib_work->event == FIB_EVENT_ENTRY_APPEND;
 		err = mlxsw_sp_router_fib6_add(mlxsw_sp,
-					       fib_work->fen6_info.rt, replace);
+					       fib_work->fen6_info.rt, replace,
+					       append);
 		if (err)
 			mlxsw_sp_router_fib_abort(mlxsw_sp);
 		mlxsw_sp_rt6_release(fib_work->fen6_info.rt);

From ce45bded6435aa7d1e34be3b47e9c04c72f85742 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 15 Jun 2018 16:23:37 +0300
Subject: [PATCH 73/81] mlxsw: spectrum_router: Align with new route replace
 logic

Commit f34436a43092 ("net/ipv6: Simplify route replace and appending
into multipath route") changed the IPv6 route replace logic so that the
first matching route (i.e., same metric) is replaced.

Have mlxsw replace the first matching route as well.

Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/mellanox/mlxsw/spectrum_router.c | 21 +++++--------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index c8956ab224ea..6aaaf3d9ba31 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -4756,12 +4756,6 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
 	kfree(mlxsw_sp_rt6);
 }
 
-static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
-{
-	/* RTF_CACHE routes are ignored */
-	return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY;
-}
-
 static struct fib6_info *
 mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
 {
@@ -5169,7 +5163,7 @@ static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			      const struct fib6_info *nrt, bool replace)
 {
-	struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL;
+	struct mlxsw_sp_fib6_entry *fib6_entry;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
 		struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
@@ -5178,18 +5172,13 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			continue;
 		if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id)
 			break;
-		if (replace && rt->fib6_metric == nrt->fib6_metric) {
-			if (mlxsw_sp_fib6_rt_can_mp(rt) ==
-			    mlxsw_sp_fib6_rt_can_mp(nrt))
-				return fib6_entry;
-			if (mlxsw_sp_fib6_rt_can_mp(nrt))
-				fallback = fallback ?: fib6_entry;
-		}
+		if (replace && rt->fib6_metric == nrt->fib6_metric)
+			return fib6_entry;
 		if (rt->fib6_metric > nrt->fib6_metric)
-			return fallback ?: fib6_entry;
+			return fib6_entry;
 	}
 
-	return fallback;
+	return NULL;
 }
 
 static int

From 9e25826ffc942e985b8595b2f1cf2065d3880514 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 15 Jun 2018 16:23:38 +0300
Subject: [PATCH 74/81] mlxsw: spectrum_switchdev: Fix port_vlan refcounting

Switchdev notifications for addition of SWITCHDEV_OBJ_ID_PORT_VLAN are
distributed not only on clean addition, but also when flags on an
existing VLAN are changed. mlxsw_sp_bridge_port_vlan_add() calls
mlxsw_sp_port_vlan_get() to get at the port_vlan in question, which
implicitly references the object. This then leads to discrepancies in
reference counting when the VLAN is removed. spectrum.c warns about the
problem when the module is removed:

[13578.493090] WARNING: CPU: 0 PID: 2454 at drivers/net/ethernet/mellanox/mlxsw/spectrum.c:2973 mlxsw_sp_port_remove+0xfd/0x110 [mlxsw_spectrum]
[...]
[13578.627106] Call Trace:
[13578.629617]  mlxsw_sp_fini+0x2a/0xe0 [mlxsw_spectrum]
[13578.634748]  mlxsw_core_bus_device_unregister+0x3e/0x130 [mlxsw_core]
[13578.641290]  mlxsw_pci_remove+0x13/0x40 [mlxsw_pci]
[13578.646238]  pci_device_remove+0x31/0xb0
[13578.650244]  device_release_driver_internal+0x14f/0x220
[13578.655562]  driver_detach+0x32/0x70
[13578.659183]  bus_remove_driver+0x47/0xa0
[13578.663134]  pci_unregister_driver+0x1e/0x80
[13578.667486]  mlxsw_sp_module_exit+0xc/0x3fa [mlxsw_spectrum]
[13578.673207]  __x64_sys_delete_module+0x13b/0x1e0
[13578.677888]  ? exit_to_usermode_loop+0x78/0x80
[13578.682374]  do_syscall_64+0x39/0xe0
[13578.685976]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fix by putting the port_vlan when mlxsw_sp_port_vlan_bridge_join()
determines it's a flag-only change.

Fixes: b3529af6bb0d ("spectrum: Reference count VLAN entries")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index e97652c40d13..eea5666a86b2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1018,8 +1018,10 @@ mlxsw_sp_port_vlan_bridge_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
 	int err;
 
 	/* No need to continue if only VLAN flags were changed */
-	if (mlxsw_sp_port_vlan->bridge_port)
+	if (mlxsw_sp_port_vlan->bridge_port) {
+		mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan);
 		return 0;
+	}
 
 	err = mlxsw_sp_port_vlan_fid_join(mlxsw_sp_port_vlan, bridge_port);
 	if (err)

From de9bada5d389903f4faf33980e6a95a2911c7e6d Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 15 Jun 2018 15:39:17 +0200
Subject: [PATCH 75/81] l2tp: reject creation of non-PPP sessions on L2TPv2
 tunnels

The /proc/net/pppol2tp handlers (pppol2tp_seq_*()) iterate over all
L2TPv2 tunnels, and rightfully expect that only PPP sessions can be
found there. However, l2tp_netlink accepts creating Ethernet sessions
regardless of the underlying tunnel version.

This confuses pppol2tp_seq_session_show(), which expects that
l2tp_session_priv() returns a pppol2tp_session structure. When the
session is an Ethernet pseudo-wire, a struct l2tp_eth_sess is returned
instead. This leads to invalid memory access when
pppol2tp_session_get_sock() later tries to dereference ps->sk.

Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_netlink.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 6616c9fd292f..5b9900889e31 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -553,6 +553,12 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 		goto out_tunnel;
 	}
 
+	/* L2TPv2 only accepts PPP pseudo-wires */
+	if (tunnel->version == 2 && cfg.pw_type != L2TP_PWTYPE_PPP) {
+		ret = -EPROTONOSUPPORT;
+		goto out_tunnel;
+	}
+
 	if (tunnel->version > 2) {
 		if (info->attrs[L2TP_ATTR_DATA_SEQ])
 			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);

From ecd012e45ab5fd76ed57546865897ce35920f56b Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 15 Jun 2018 15:39:19 +0200
Subject: [PATCH 76/81] l2tp: filter out non-PPP sessions in
 pppol2tp_tunnel_ioctl()

pppol2tp_tunnel_ioctl() can act on an L2TPv3 tunnel, in which case
'session' may be an Ethernet pseudo-wire.

However, pppol2tp_session_ioctl() expects a PPP pseudo-wire, as it
assumes l2tp_session_priv() points to a pppol2tp_session structure. For
an Ethernet pseudo-wire l2tp_session_priv() points to an l2tp_eth_sess
structure instead, making pppol2tp_session_ioctl() access invalid
memory.

Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index f429fed06a1e..55188382845c 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1201,7 +1201,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
 				l2tp_session_get(sock_net(sk), tunnel,
 						 stats.session_id);
 
-			if (session) {
+			if (session && session->pwtype == L2TP_PWTYPE_PPP) {
 				err = pppol2tp_session_ioctl(session, cmd,
 							     arg);
 				l2tp_session_dec_refcount(session);

From a447da7d00410278c90d3576782a43f8b675d7be Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 15 Jun 2018 03:07:45 +0200
Subject: [PATCH 77/81] tls: fix use-after-free in tls_push_record

syzkaller managed to trigger a use-after-free in tls like the
following:

  BUG: KASAN: use-after-free in tls_push_record.constprop.15+0x6a2/0x810 [tls]
  Write of size 1 at addr ffff88037aa08000 by task a.out/2317

  CPU: 3 PID: 2317 Comm: a.out Not tainted 4.17.0+ #144
  Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016
  Call Trace:
   dump_stack+0x71/0xab
   print_address_description+0x6a/0x280
   kasan_report+0x258/0x380
   ? tls_push_record.constprop.15+0x6a2/0x810 [tls]
   tls_push_record.constprop.15+0x6a2/0x810 [tls]
   tls_sw_push_pending_record+0x2e/0x40 [tls]
   tls_sk_proto_close+0x3fe/0x710 [tls]
   ? tcp_check_oom+0x4c0/0x4c0
   ? tls_write_space+0x260/0x260 [tls]
   ? kmem_cache_free+0x88/0x1f0
   inet_release+0xd6/0x1b0
   __sock_release+0xc0/0x240
   sock_close+0x11/0x20
   __fput+0x22d/0x660
   task_work_run+0x114/0x1a0
   do_exit+0x71a/0x2780
   ? mm_update_next_owner+0x650/0x650
   ? handle_mm_fault+0x2f5/0x5f0
   ? __do_page_fault+0x44f/0xa50
   ? mm_fault_error+0x2d0/0x2d0
   do_group_exit+0xde/0x300
   __x64_sys_exit_group+0x3a/0x50
   do_syscall_64+0x9a/0x300
   ? page_fault+0x8/0x30
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

This happened through fault injection where aead_req allocation in
tls_do_encryption() eventually failed and we returned -ENOMEM from
the function. Turns out that the use-after-free is triggered from
tls_sw_sendmsg() in the second tls_push_record(). The error then
triggers a jump to waiting for memory in sk_stream_wait_memory()
resp. returning immediately in case of MSG_DONTWAIT. What follows is
the trim_both_sgl(sk, orig_size), which drops elements from the sg
list added via tls_sw_sendmsg(). Now the use-after-free gets triggered
when the socket is being closed, where tls_sk_proto_close() callback
is invoked. The tls_complete_pending_work() will figure that there's
a pending closed tls record to be flushed and thus calls into the
tls_push_pending_closed_record() from there. ctx->push_pending_record()
is called from the latter, which is the tls_sw_push_pending_record()
from sw path. This again calls into tls_push_record(). And here the
tls_fill_prepend() will panic since the buffer address has been freed
earlier via trim_both_sgl(). One way to fix it is to move the aead
request allocation out of tls_do_encryption() early into tls_push_record().
This means we don't prep the tls header and advance state to the
TLS_PENDING_CLOSED_RECORD before allocation which could potentially
fail happened. That fixes the issue on my side.

Fixes: 3c4d7559159b ("tls: kernel TLS support")
Reported-by: syzbot+5c74af81c547738e1684@syzkaller.appspotmail.com
Reported-by: syzbot+709f2810a6a05f11d4d3@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 34895b7c132d..2945a3bd538c 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -191,18 +191,12 @@ static void tls_free_both_sg(struct sock *sk)
 }
 
 static int tls_do_encryption(struct tls_context *tls_ctx,
-			     struct tls_sw_context_tx *ctx, size_t data_len,
-			     gfp_t flags)
+			     struct tls_sw_context_tx *ctx,
+			     struct aead_request *aead_req,
+			     size_t data_len)
 {
-	unsigned int req_size = sizeof(struct aead_request) +
-		crypto_aead_reqsize(ctx->aead_send);
-	struct aead_request *aead_req;
 	int rc;
 
-	aead_req = kzalloc(req_size, flags);
-	if (!aead_req)
-		return -ENOMEM;
-
 	ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
 	ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
 
@@ -219,7 +213,6 @@ static int tls_do_encryption(struct tls_context *tls_ctx,
 	ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size;
 	ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
 
-	kfree(aead_req);
 	return rc;
 }
 
@@ -228,8 +221,14 @@ static int tls_push_record(struct sock *sk, int flags,
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct aead_request *req;
 	int rc;
 
+	req = kzalloc(sizeof(struct aead_request) +
+		      crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation);
+	if (!req)
+		return -ENOMEM;
+
 	sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1);
 	sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1);
 
@@ -245,15 +244,14 @@ static int tls_push_record(struct sock *sk, int flags,
 	tls_ctx->pending_open_record_frags = 0;
 	set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags);
 
-	rc = tls_do_encryption(tls_ctx, ctx, ctx->sg_plaintext_size,
-			       sk->sk_allocation);
+	rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size);
 	if (rc < 0) {
 		/* If we are called from write_space and
 		 * we fail, we need to set this SOCK_NOSPACE
 		 * to trigger another write_space in the future.
 		 */
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		return rc;
+		goto out_req;
 	}
 
 	free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem,
@@ -268,6 +266,8 @@ static int tls_push_record(struct sock *sk, int flags,
 		tls_err_abort(sk, EBADMSG);
 
 	tls_advance_record_sn(sk, &tls_ctx->tx);
+out_req:
+	kfree(req);
 	return rc;
 }
 

From 06030dbaf3b6c5801dcdb7fe4fbab3b91c8da84a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 15 Jun 2018 03:07:46 +0200
Subject: [PATCH 78/81] tls: fix waitall behavior in tls_sw_recvmsg

Current behavior in tls_sw_recvmsg() is to wait for incoming tls
messages and copy up to exactly len bytes of data that the user
provided. This is problematic in the sense that i) if no packet
is currently queued in strparser we keep waiting until one has been
processed and pushed into tls receive layer for tls_wait_data() to
wake up and push the decrypted bits to user space. Given after
tls decryption, we're back at streaming data, use sock_rcvlowat()
hint from tcp socket instead. Retain current behavior with MSG_WAITALL
flag and otherwise use the hint target for breaking the loop and
returning to application. This is done if currently no ctx->recv_pkt
is ready, otherwise continue to process it from our strparser
backlog.

Fixes: c46234ebb4d1 ("tls: RX path for ktls")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 2945a3bd538c..f127fac88acf 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -754,7 +754,7 @@ int tls_sw_recvmsg(struct sock *sk,
 	struct sk_buff *skb;
 	ssize_t copied = 0;
 	bool cmsg = false;
-	int err = 0;
+	int target, err = 0;
 	long timeo;
 
 	flags |= nonblock;
@@ -764,6 +764,7 @@ int tls_sw_recvmsg(struct sock *sk,
 
 	lock_sock(sk);
 
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 	do {
 		bool zc = false;
@@ -856,6 +857,9 @@ fallback_to_reg_recv:
 					goto recv_end;
 			}
 		}
+		/* If we have a new message from strparser, continue now. */
+		if (copied >= target && !ctx->recv_pkt)
+			break;
 	} while (len);
 
 recv_end:

From 7c099773b08634df9db0f5be40f0fcc06baa2e1b Mon Sep 17 00:00:00 2001
From: Zhouyang Jia <jiazhouyang09@gmail.com>
Date: Fri, 15 Jun 2018 11:06:17 +0800
Subject: [PATCH 79/81] net: cxgb3: add error handling for sysfs_create_group

When sysfs_create_group fails, the lack of error-handling code may
cause unexpected results.

This patch adds error-handling code after calling sysfs_create_group.

Signed-off-by: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
index 2edfdbdaae48..7b795edd9d3a 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -3362,10 +3362,17 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	err = sysfs_create_group(&adapter->port[0]->dev.kobj,
 				 &cxgb3_attr_group);
+	if (err) {
+		dev_err(&pdev->dev, "cannot create sysfs group\n");
+		goto out_close_led;
+	}
 
 	print_port_info(adapter, ai);
 	return 0;
 
+out_close_led:
+	t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, 0);
+
 out_free_dev:
 	iounmap(adapter->regs);
 	for (i = ai->nports0 + ai->nports1 - 1; i >= 0; --i)

From f6a6f203d507aae3a06a8de79c6f0ecc4658b81c Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 12 Jun 2018 21:26:10 -0700
Subject: [PATCH 80/81] neighbour: skip NTF_EXT_LEARNED entries during forced
 gc

Commit 9ce33e46531d ("neighbour: support for NTF_EXT_LEARNED flag")
added support for NTF_EXT_LEARNED for neighbour entries.
NTF_EXT_LEARNED entries are neigh entries managed by control
plane (eg: Ethernet VPN implementation in FRR routing suite).
Periodic gc already excludes these entries. This patch extends
it to forced gc which the earlier patch missed.

Fixes: 9ce33e46531d ("neighbour: support for NTF_EXT_LEARNED flag")
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a7a9c3d738ba..8e3fda9e725c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -119,13 +119,14 @@ unsigned long neigh_rand_reach_time(unsigned long base)
 EXPORT_SYMBOL(neigh_rand_reach_time);
 
 
-static bool neigh_del(struct neighbour *n, __u8 state,
+static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,
 		      struct neighbour __rcu **np, struct neigh_table *tbl)
 {
 	bool retval = false;
 
 	write_lock(&n->lock);
-	if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) {
+	if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) &&
+	    !(n->flags & flags)) {
 		struct neighbour *neigh;
 
 		neigh = rcu_dereference_protected(n->next,
@@ -157,7 +158,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
 	while ((n = rcu_dereference_protected(*np,
 					      lockdep_is_held(&tbl->lock)))) {
 		if (n == ndel)
-			return neigh_del(n, 0, np, tbl);
+			return neigh_del(n, 0, 0, np, tbl);
 		np = &n->next;
 	}
 	return false;
@@ -185,7 +186,8 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 			 * - nobody refers to it.
 			 * - it is not permanent
 			 */
-			if (neigh_del(n, NUD_PERMANENT, np, tbl)) {
+			if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np,
+				      tbl)) {
 				shrunk = 1;
 				continue;
 			}

From 7cfde0af731c14664e3882c7ba77ace1059f2c5e Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 15 Jun 2018 16:17:27 +0100
Subject: [PATCH 81/81] net: stmmac: Run HWIF Quirks after getting HW caps

Currently we were running HWIF quirks before getting HW capabilities.
This is not right because some HWIF callbacks depend on HW caps.

Lets save the quirks callback and use it in a later stage.

This fixes Altera socfpga.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Fixes: 5f0456b43140 ("net: stmmac: Implement logic to automatically select HW Interface")
Reported-by: Dinh Nguyen <dinh.linux@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Dinh Nguyen <dinh.linux@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/hwif.c        | 9 ++-------
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      | 1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 +++++++
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 14770fc8865e..1f50e83cafb2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -252,13 +252,8 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 				return ret;
 		}
 
-		/* Run quirks, if needed */
-		if (entry->quirks) {
-			ret = entry->quirks(priv);
-			if (ret)
-				return ret;
-		}
-
+		/* Save quirks, if needed for posterior use */
+		priv->hwif_quirks = entry->quirks;
 		return 0;
 	}
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 025efbf6145c..76649adf8fb0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -129,6 +129,7 @@ struct stmmac_priv {
 	struct net_device *dev;
 	struct device *device;
 	struct mac_device_info *hw;
+	int (*hwif_quirks)(struct stmmac_priv *priv);
 	struct mutex lock;
 
 	/* RX Queue */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 5e6d4fe2f4ef..e79b0d7b388a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4135,6 +4135,13 @@ static int stmmac_hw_init(struct stmmac_priv *priv)
 	if (priv->dma_cap.tsoen)
 		dev_info(priv->device, "TSO supported\n");
 
+	/* Run HW quirks, if any */
+	if (priv->hwif_quirks) {
+		ret = priv->hwif_quirks(priv);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }