From c7b725be84985532161bcb4fbecd056326998a77 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 15 Aug 2017 18:38:42 -0700
Subject: [PATCH 01/27] net: igmp: Use ingress interface rather than vrf device

Anuradha reported that statically added groups for interfaces enslaved
to a VRF device were not persisting. The problem is that igmp queries
and reports need to use the data in the in_dev for the real ingress
device rather than the VRF device. Update igmp_rcv accordingly.

Fixes: e58e41596811 ("net: Enable support for VRF with ipv4 multicast")
Reported-by: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 498706b072fb..caf2f1101d02 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb)
 {
 	/* This basically follows the spec line by line -- see RFC1112 */
 	struct igmphdr *ih;
-	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
 	int len = skb->len;
 	bool dropped = true;
 
+	if (netif_is_l3_master(dev)) {
+		dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
+		if (!dev)
+			goto drop;
+	}
+
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev)
 		goto drop;
 

From 494bea39f3201776cdfddc232705f54a0bd210c4 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Wed, 16 Aug 2017 13:30:07 +0800
Subject: [PATCH 02/27] openvswitch: fix skb_panic due to the incorrect actions
 attrlen

For sw_flow_actions, the actions_len only represents the kernel part's
size, and when we dump the actions to the userspace, we will do the
convertions, so it's true size may become bigger than the actions_len.

But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
to alloc the skbuff, so the user_skb's size may become insufficient and
oops will happen like this:
  skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
  ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
  ------------[ cut here ]------------
  kernel BUG at net/core/skbuff.c:129!
  [...]
  Call Trace:
   <IRQ>
   [<ffffffff8148be82>] skb_put+0x43/0x44
   [<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
   [<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
   [<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
   [<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
   [<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
   [<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
   [<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
   [<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
   [<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
   [<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
  [...]

Also we can find that the actions_len is much little than the orig_len:
  crash> struct sw_flow_actions 0xffff8812f539d000
  struct sw_flow_actions {
    rcu = {
      next = 0xffff8812f5398800,
      func = 0xffffe3b00035db32
    },
    orig_len = 1384,
    actions_len = 592,
    actions = 0xffff8812f539d01c
  }

So as a quick fix, use the orig_len instead of the actions_len to alloc
the user_skb.

Last, this oops happened on our system running a relative old kernel, but
the same risk still exists on the mainline, since we use the wrong
actions_len from the beginning.

Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace")
Cc: Neil McKee <neil.mckee@inmon.com>
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/actions.c  | 1 +
 net/openvswitch/datapath.c | 7 ++++---
 net/openvswitch/datapath.h | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index e4610676299b..a54a556fcdb5 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1337,6 +1337,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		goto out;
 	}
 
+	OVS_CB(skb)->acts_origlen = acts->orig_len;
 	err = do_execute_actions(dp, skb, key,
 				 acts->actions, acts->actions_len);
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 45fe8c8a884d..6b44fe405282 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -381,7 +381,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 }
 
 static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
-			      unsigned int hdrlen)
+			      unsigned int hdrlen, int actions_attrlen)
 {
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
@@ -398,7 +398,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
 
 	/* OVS_PACKET_ATTR_ACTIONS */
 	if (upcall_info->actions_len)
-		size += nla_total_size(upcall_info->actions_len);
+		size += nla_total_size(actions_attrlen);
 
 	/* OVS_PACKET_ATTR_MRU */
 	if (upcall_info->mru)
@@ -465,7 +465,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	else
 		hlen = skb->len;
 
-	len = upcall_msg_size(upcall_info, hlen - cutlen);
+	len = upcall_msg_size(upcall_info, hlen - cutlen,
+			      OVS_CB(skb)->acts_origlen);
 	user_skb = genlmsg_new(len, GFP_ATOMIC);
 	if (!user_skb) {
 		err = -ENOMEM;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 5d8dcd88815f..480600649d0b 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -99,11 +99,13 @@ struct datapath {
  * when a packet is received by OVS.
  * @mru: The maximum received fragement size; 0 if the packet is not
  * fragmented.
+ * @acts_origlen: The netlink size of the flow actions applied to this skb.
  * @cutlen: The number of bytes from the packet end to be removed.
  */
 struct ovs_skb_cb {
 	struct vport		*input_vport;
 	u16			mru;
+	u16			acts_origlen;
 	u32			cutlen;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)

From 120e9dabaf551c6dc03d3a10a1f026376cb1811c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 07:03:15 -0700
Subject: [PATCH 03/27] dccp: defer ccid_hc_tx_delete() at dismantle time

syszkaller team reported another problem in DCCP [1]

Problem here is that the structure holding RTO timer
(ccid2_hc_tx_rto_expire() handler) is freed too soon.

We can not use del_timer_sync() to cancel the timer
since this timer wants to grab socket lock (that would risk a dead lock)

Solution is to defer the freeing of memory when all references to
the socket were released. Socket timers do own a reference, so this
should fix the issue.

[1]

==================================================================
BUG: KASAN: use-after-free in ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
Read of size 4 at addr ffff8801d2660540 by task kworker/u4:7/3365

CPU: 1 PID: 3365 Comm: kworker/u4:7 Not tainted 4.13.0-rc4+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events_unbound call_usermodehelper_exec_work
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429
 ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
 call_timer_fn+0x233/0x830 kernel/time/timer.c:1268
 expire_timers kernel/time/timer.c:1307 [inline]
 __run_timers+0x7fd/0xb90 kernel/time/timer.c:1601
 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284
 invoke_softirq kernel/softirq.c:364 [inline]
 irq_exit+0x1cc/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:638 [inline]
 smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:1044
 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:702
RIP: 0010:arch_local_irq_enable arch/x86/include/asm/paravirt.h:824 [inline]
RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:267 [inline]
RIP: 0010:_raw_write_unlock_irq+0x56/0x70 kernel/locking/spinlock.c:343
RSP: 0018:ffff8801cd50eaa8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff10
RAX: dffffc0000000000 RBX: ffffffff85a090c0 RCX: 0000000000000006
RDX: 1ffffffff0b595f3 RSI: 1ffff1003962f989 RDI: ffffffff85acaf98
RBP: ffff8801cd50eab0 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801cc96ea60
R13: dffffc0000000000 R14: ffff8801cc96e4c0 R15: ffff8801cc96e4c0
 </IRQ>
 release_task+0xe9e/0x1a40 kernel/exit.c:220
 wait_task_zombie kernel/exit.c:1162 [inline]
 wait_consider_task+0x29b8/0x33c0 kernel/exit.c:1389
 do_wait_thread kernel/exit.c:1452 [inline]
 do_wait+0x441/0xa90 kernel/exit.c:1523
 kernel_wait4+0x1f5/0x370 kernel/exit.c:1665
 SYSC_wait4+0x134/0x140 kernel/exit.c:1677
 SyS_wait4+0x2c/0x40 kernel/exit.c:1673
 call_usermodehelper_exec_sync kernel/kmod.c:286 [inline]
 call_usermodehelper_exec_work+0x1a0/0x2c0 kernel/kmod.c:323
 process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2097
 worker_thread+0x223/0x1860 kernel/workqueue.c:2231
 kthread+0x35e/0x430 kernel/kthread.c:231
 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:425

Allocated by task 21267:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc+0x127/0x750 mm/slab.c:3561
 ccid_new+0x20e/0x390 net/dccp/ccid.c:151
 dccp_hdlr_ccid+0x27/0x140 net/dccp/feat.c:44
 __dccp_feat_activate+0x142/0x2a0 net/dccp/feat.c:344
 dccp_feat_activate_values+0x34e/0xa90 net/dccp/feat.c:1538
 dccp_rcv_request_sent_state_process net/dccp/input.c:472 [inline]
 dccp_rcv_state_process+0xed1/0x1620 net/dccp/input.c:677
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __release_sock+0x124/0x360 net/core/sock.c:2269
 release_sock+0xa4/0x2a0 net/core/sock.c:2784
 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline]
 __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643
 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682
 SYSC_connect+0x204/0x470 net/socket.c:1642
 SyS_connect+0x24/0x30 net/socket.c:1623
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 3049:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 ccid_hc_tx_delete+0xc5/0x100 net/dccp/ccid.c:190
 dccp_destroy_sock+0x1d1/0x2b0 net/dccp/proto.c:225
 inet_csk_destroy_sock+0x166/0x3f0 net/ipv4/inet_connection_sock.c:833
 dccp_done+0xb7/0xd0 net/dccp/proto.c:145
 dccp_time_wait+0x13d/0x300 net/dccp/minisocks.c:72
 dccp_rcv_reset+0x1d1/0x5b0 net/dccp/input.c:160
 dccp_rcv_state_process+0x8fc/0x1620 net/dccp/input.c:663
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __sk_receive_skb+0x33e/0xc00 net/core/sock.c:521
 dccp_v4_rcv+0xef1/0x1c00 net/dccp/ipv4.c:871
 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:477 [inline]
 ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488
 __netif_receive_skb_core+0x19af/0x33d0 net/core/dev.c:4417
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4455
 process_backlog+0x203/0x740 net/core/dev.c:5130
 napi_poll net/core/dev.c:5527 [inline]
 net_rx_action+0x792/0x1910 net/core/dev.c:5593
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284

The buggy address belongs to the object at ffff8801d2660100
 which belongs to the cache ccid2_hc_tx_sock of size 1240
The buggy address is located 1088 bytes inside of
 1240-byte region [ffff8801d2660100, ffff8801d26605d8)
The buggy address belongs to the page:
page:ffffea0007499800 count:1 mapcount:0 mapping:ffff8801d2660100 index:0x0 compound_mapcount: 0
flags: 0x200000000008100(slab|head)
raw: 0200000000008100 ffff8801d2660100 0000000000000000 0000000100000005
raw: ffffea00075271a0 ffffea0007538820 ffff8801d3aef9c0 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d2660400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8801d2660480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801d2660500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                           ^
 ffff8801d2660580: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc
 ffff8801d2660600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/proto.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 86bc40ba6ba5..b68168fcc06a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,6 +24,7 @@
 #include <net/checksum.h>
 
 #include <net/inet_sock.h>
+#include <net/inet_common.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
 
@@ -170,6 +171,15 @@ const char *dccp_packet_name(const int type)
 
 EXPORT_SYMBOL_GPL(dccp_packet_name);
 
+static void dccp_sk_destruct(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_tx_ccid = NULL;
+	inet_sock_destruct(sk);
+}
+
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -179,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
 	sk->sk_state		= DCCP_CLOSED;
 	sk->sk_write_space	= dccp_write_space;
+	sk->sk_destruct		= dccp_sk_destruct;
 	icsk->icsk_sync_mss	= dccp_sync_mss;
 	dp->dccps_mss_cache	= 536;
 	dp->dccps_rate_last	= jiffies;
@@ -219,8 +230,7 @@ void dccp_destroy_sock(struct sock *sk)
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
 	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+	dp->dccps_hc_rx_ccid = NULL;
 
 	/* clean up feature negotiation state */
 	dccp_feat_list_purge(&dp->dccps_featneg);

From 81fbfe8adaf38d4f5a98c19bebfd41c5d6acaee8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 10:36:47 -0700
Subject: [PATCH 04/27] ptr_ring: use kmalloc_array()

As found by syzkaller, malicious users can set whatever tx_queue_len
on a tun device and eventually crash the kernel.

Lets remove the ALIGN(XXX, SMP_CACHE_BYTES) thing since a small
ring buffer is not fast anyway.

Fixes: 2e0ab8ca83c1 ("ptr_ring: array based FIFO for pointers")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h  | 9 +++++----
 include/linux/skb_array.h | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index d8c97ec8a8e6..37b4bb2545b3 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -436,9 +436,9 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
 	__PTR_RING_PEEK_CALL_v; \
 })
 
-static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp)
+static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
-	return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp);
+	return kcalloc(size, sizeof(void *), gfp);
 }
 
 static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
@@ -582,7 +582,8 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
  * In particular if you consume ring in interrupt or BH context, you must
  * disable interrupts/BH when doing so.
  */
-static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
+static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
+					   unsigned int nrings,
 					   int size,
 					   gfp_t gfp, void (*destroy)(void *))
 {
@@ -590,7 +591,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
 	void ***queues;
 	int i;
 
-	queues = kmalloc(nrings * sizeof *queues, gfp);
+	queues = kmalloc_array(nrings, sizeof(*queues), gfp);
 	if (!queues)
 		goto noqueues;
 
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 35226cd4efb0..8621ffdeecbf 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -193,7 +193,8 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 }
 
 static inline int skb_array_resize_multiple(struct skb_array **rings,
-					    int nrings, int size, gfp_t gfp)
+					    int nrings, unsigned int size,
+					    gfp_t gfp)
 {
 	BUILD_BUG_ON(offsetof(struct skb_array, ring));
 	return ptr_ring_resize_multiple((struct ptr_ring **)rings,

From c780a049f9bf442314335372c9abc4548bfe3e44 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 11:09:12 -0700
Subject: [PATCH 05/27] ipv4: better IP_MAX_MTU enforcement

While working on yet another syzkaller report, I found
that our IP_MAX_MTU enforcements were not properly done.

gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and
final result can be bigger than IP_MAX_MTU :/

This is a problem because device mtu can be changed on other cpus or
threads.

While this patch does not fix the issue I am working on, it is
probably worth addressing it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 4 ++--
 net/ipv4/route.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 821cedcc8e73..0cf7f5a65fe6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -352,7 +352,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	    !forwarding)
 		return dst_mtu(dst);
 
-	return min(dst->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
 }
 
 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
@@ -364,7 +364,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
 	}
 
-	return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
 }
 
 u32 ip_idents_reserve(u32 hash, int segs);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7effa62beed3..fe877a4a72b1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1267,7 +1267,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	if (mtu)
 		return mtu;
 
-	mtu = dst->dev->mtu;
+	mtu = READ_ONCE(dst->dev->mtu);
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
 		if (rt->rt_uses_gateway && mtu > 576)

From 014cd0a368dc6351c65d51e4ee34f8573a4a1543 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 17 Aug 2017 20:30:39 +1000
Subject: [PATCH 06/27] bpf: Update sysctl documentation to list all supported
 architectures

The sysctl documentation states that the JIT is only available on
x86_64, which is no longer correct.

Update the list, and break it out to indicate which architectures
support the cBPF JIT (via HAVE_CBPF_JIT) or the eBPF JIT
(HAVE_EBPF_JIT).

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 14db18c970b1..b9c3c6078010 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -36,8 +36,23 @@ bpf_jit_enable
 --------------
 
 This enables Berkeley Packet Filter Just in Time compiler.
-Currently supported on x86_64 architecture, bpf_jit provides a framework
-to speed packet filtering, the one used by tcpdump/libpcap for example.
+
+There are two flavors of JIT, the new eBPF JIT supported on:
+  - x86_64
+  - arm64
+  - ppc64
+  - sparc64
+  - mips64
+
+And the older cBPF JIT supported on:
+  - arm
+  - mips
+  - ppc
+  - sparc
+
+The BPF JIT provides a framework to speed packet filtering, the one used by
+tcpdump/libpcap for example.
+
 Values :
 	0 - disable the JIT (default value)
 	1 - enable the JIT

From acc8b31665b4cc17b35c4fa445427f7e2f6dc86b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 18 Aug 2017 10:10:43 +0200
Subject: [PATCH 07/27] net: sched: fix p_filter_chain check in tcf_chain_flush

The dereference before check is wrong and leads to an oops when
p_filter_chain is NULL. The check needs to be done on the pointer to
prevent NULL dereference.

Fixes: f93e1cdcf42c ("net/sched: fix filter flushing")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 39da0c5801c9..9fd44c221347 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -205,7 +205,7 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 {
 	struct tcf_proto *tp;
 
-	if (*chain->p_filter_chain)
+	if (chain->p_filter_chain)
 		RCU_INIT_POINTER(*chain->p_filter_chain, NULL);
 	while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
 		RCU_INIT_POINTER(chain->filter_chain, tp->next);

From eac2c68d663effb077210218788952b5a0c1f60e Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 18 Aug 2017 12:11:50 +0100
Subject: [PATCH 08/27] nfp: fix infinite loop on umapping cleanup

The while loop that performs the dma page unmapping never decrements
index counter f and hence loops forever. Fix this with a pre-decrement
on f.

Detected by CoverityScan, CID#1357309 ("Infinite loop")

Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 4631ca8b8eb2..9f77ce038a4a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -908,8 +908,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev)
 	return NETDEV_TX_OK;
 
 err_unmap:
-	--f;
-	while (f >= 0) {
+	while (--f >= 0) {
 		frag = &skb_shinfo(skb)->frags[f];
 		dma_unmap_page(dp->dev, tx_ring->txbufs[wr_idx].dma_addr,
 			       skb_frag_size(frag), DMA_TO_DEVICE);

From a120d9ab65354727559b9db75ded8071b7ef19e2 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 18 Aug 2017 14:12:06 +0100
Subject: [PATCH 09/27] netxen: fix incorrect loop counter decrement

The loop counter k is currently being decremented from zero which
is incorrect. Fix this by incrementing k instead

Detected by CoverityScan, CID#401847 ("Infinite loop")

Fixes: 83f18a557c6d ("netxen_nic: fw dump support")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
index 66ff15d08bad..0a66389c06c2 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
@@ -2311,7 +2311,7 @@ netxen_md_rdqueue(struct netxen_adapter *adapter,
 				 loop_cnt++) {
 		NX_WR_DUMP_REG(select_addr, adapter->ahw.pci_base0, queue_id);
 		read_addr = queueEntry->read_addr;
-		for (k = 0; k < read_cnt; k--) {
+		for (k = 0; k < read_cnt; k++) {
 			NX_RD_DUMP_REG(read_addr, adapter->ahw.pci_base0,
 							&read_value);
 			*data_buff++ = read_value;

From 2110ba58303f0c2a03360c5f81fbe67ed312e7b9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 18 Aug 2017 17:11:06 +0200
Subject: [PATCH 10/27] bpf, doc: improve sysctl knob description

Current context speaking of tcpdump filters is out of date these
days, so lets improve the sysctl description for the BPF knobs
a bit.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 37 ++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index b9c3c6078010..d7c2b88b92ae 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -35,23 +35,32 @@ Table : Subdirectories in /proc/sys/net
 bpf_jit_enable
 --------------
 
-This enables Berkeley Packet Filter Just in Time compiler.
-
-There are two flavors of JIT, the new eBPF JIT supported on:
+This enables the BPF Just in Time (JIT) compiler. BPF is a flexible
+and efficient infrastructure allowing to execute bytecode at various
+hook points. It is used in a number of Linux kernel subsystems such
+as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints)
+and security (e.g. seccomp). LLVM has a BPF back end that can compile
+restricted C into a sequence of BPF instructions. After program load
+through bpf(2) and passing a verifier in the kernel, a JIT will then
+translate these BPF proglets into native CPU instructions. There are
+two flavors of JITs, the newer eBPF JIT currently supported on:
   - x86_64
   - arm64
   - ppc64
   - sparc64
   - mips64
 
-And the older cBPF JIT supported on:
+And the older cBPF JIT supported on the following archs:
   - arm
   - mips
   - ppc
   - sparc
 
-The BPF JIT provides a framework to speed packet filtering, the one used by
-tcpdump/libpcap for example.
+eBPF JITs are a superset of cBPF JITs, meaning the kernel will
+migrate cBPF instructions into eBPF instructions and then JIT
+compile them transparently. Older cBPF JITs can only translate
+tcpdump filters, seccomp rules, etc, but not mentioned eBPF
+programs loaded through bpf(2).
 
 Values :
 	0 - disable the JIT (default value)
@@ -61,9 +70,9 @@ Values :
 bpf_jit_harden
 --------------
 
-This enables hardening for the Berkeley Packet Filter Just in Time compiler.
-Supported are eBPF JIT backends. Enabling hardening trades off performance,
-but can mitigate JIT spraying.
+This enables hardening for the BPF JIT compiler. Supported are eBPF
+JIT backends. Enabling hardening trades off performance, but can
+mitigate JIT spraying.
 Values :
 	0 - disable JIT hardening (default value)
 	1 - enable JIT hardening for unprivileged users only
@@ -72,11 +81,11 @@ Values :
 bpf_jit_kallsyms
 ----------------
 
-When Berkeley Packet Filter Just in Time compiler is enabled, then compiled
-images are unknown addresses to the kernel, meaning they neither show up in
-traces nor in /proc/kallsyms. This enables export of these addresses, which
-can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature
-is disabled.
+When BPF JIT compiler is enabled, then compiled images are unknown
+addresses to the kernel, meaning they neither show up in traces nor
+in /proc/kallsyms. This enables export of these addresses, which can
+be used for debugging/tracing. If bpf_jit_harden is enabled, this
+feature is disabled.
 Values :
 	0 - disable JIT kallsyms export (default value)
 	1 - enable JIT kallsyms export for privileged users only

From a0917e0bc6efc05834c0c1eafebd579a9c75e6e9 Mon Sep 17 00:00:00 2001
From: Matthew Dawson <matthew@mjdsystems.ca>
Date: Fri, 18 Aug 2017 15:04:54 -0400
Subject: [PATCH 11/27] datagram: When peeking datagrams with offset < 0 don't
 skip empty skbs

Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
headers from UDP packets before queueing"), when udp packets are being
peeked the requested extra offset is always 0 as there is no need to skip
the udp header.  However, when the offset is 0 and the next skb is
of length 0, it is only returned once.  The behaviour can be seen with
the following python script:

from socket import *;
f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
f.bind(('::', 0));
addr=('::1', f.getsockname()[1]);
g.sendto(b'', addr)
g.sendto(b'b', addr)
print(f.recvfrom(10, MSG_PEEK));
print(f.recvfrom(10, MSG_PEEK));

Where the expected output should be the empty string twice.

Instead, make sk_peek_offset return negative values, and pass those values
to __skb_try_recv_datagram/__skb_try_recv_from_queue.  If the passed offset
to __skb_try_recv_from_queue is negative, the checked skb is never skipped.
__skb_try_recv_from_queue will then ensure the offset is reset back to 0
if a peek is requested without an offset, unless no packets are found.

Also simplify the if condition in __skb_try_recv_from_queue.  If _off is
greater then 0, and off is greater then or equal to skb->len, then
(_off || skb->len) must always be true assuming skb->len >= 0 is always
true.

Also remove a redundant check around a call to sk_peek_offset in af_unix.c,
as it double checked if MSG_PEEK was set in the flags.

V2:
 - Moved the negative fixup into __skb_try_recv_from_queue, and remove now
redundant checks
 - Fix peeking in udp{,v6}_recvmsg to report the right value when the
offset is 0

V3:
 - Marked new branch in __skb_try_recv_from_queue as unlikely.

Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h  |  4 +---
 net/core/datagram.c | 12 +++++++++---
 net/ipv4/udp.c      |  3 ++-
 net/ipv6/udp.c      |  3 ++-
 net/unix/af_unix.c  |  5 +----
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 7c0632c7e870..aeeec62992ca 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -507,9 +507,7 @@ int sk_set_peek_off(struct sock *sk, int val);
 static inline int sk_peek_offset(struct sock *sk, int flags)
 {
 	if (unlikely(flags & MSG_PEEK)) {
-		s32 off = READ_ONCE(sk->sk_peek_off);
-		if (off >= 0)
-			return off;
+		return READ_ONCE(sk->sk_peek_off);
 	}
 
 	return 0;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647bd91b3..a21ca8dee5ea 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
 					  int *peeked, int *off, int *err,
 					  struct sk_buff **last)
 {
+	bool peek_at_off = false;
 	struct sk_buff *skb;
-	int _off = *off;
+	int _off = 0;
+
+	if (unlikely(flags & MSG_PEEK && *off >= 0)) {
+		peek_at_off = true;
+		_off = *off;
+	}
 
 	*last = queue->prev;
 	skb_queue_walk(queue, skb) {
 		if (flags & MSG_PEEK) {
-			if (_off >= skb->len && (skb->len || _off ||
-						 skb->peeked)) {
+			if (peek_at_off && _off >= skb->len &&
+			    (_off || skb->peeked)) {
 				_off -= skb->len;
 				continue;
 			}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a7c804f73990..cd1d044a7fa5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
-	peeking = off = sk_peek_offset(sk, flags);
+	peeking = flags & MSG_PEEK;
+	off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 578142b7ca3e..20039c8501eb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
-	peeking = off = sk_peek_offset(sk, flags);
+	peeking = flags & MSG_PEEK;
+	off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7b52a380d710..be8982b4f8c0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	 */
 	mutex_lock(&u->iolock);
 
-	if (flags & MSG_PEEK)
-		skip = sk_peek_offset(sk, flags);
-	else
-		skip = 0;
+	skip = max(sk_peek_offset(sk, flags), 0);
 
 	do {
 		int chunk;

From ff244c6b29b176f3f448bc75e55df297225e1b3a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Aug 2017 13:39:56 -0700
Subject: [PATCH 12/27] tun: handle register_netdevice() failures properly

syzkaller reported a double free [1], caused by the fact
that tun driver was not updated properly when priv_destructor
was added.

When/if register_netdevice() fails, priv_destructor() must have been
called already.

[1]
BUG: KASAN: double-free or invalid-free in selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023

CPU: 0 PID: 2919 Comm: syzkaller227220 Not tainted 4.13.0-rc4+ #23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x7f/0x260 mm/kasan/report.c:252
 kasan_report_double_free+0x55/0x80 mm/kasan/report.c:333
 kasan_slab_free+0xa0/0xc0 mm/kasan/kasan.c:514
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xd3/0x260 mm/slab.c:3820
 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023
 security_tun_dev_free_security+0x48/0x80 security/security.c:1512
 tun_set_iff drivers/net/tun.c:1884 [inline]
 __tun_chr_ioctl+0x2ce6/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x443ff9
RSP: 002b:00007ffc34271f68 EFLAGS: 00000217 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000004002e0 RCX: 0000000000443ff9
RDX: 0000000020533000 RSI: 00000000400454ca RDI: 0000000000000003
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000217 R12: 0000000000401ce0
R13: 0000000000401d70 R14: 0000000000000000 R15: 0000000000000000

Allocated by task 2919:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xaa/0xd0 mm/kasan/kasan.c:551
 kmem_cache_alloc_trace+0x101/0x6f0 mm/slab.c:3627
 kmalloc include/linux/slab.h:493 [inline]
 kzalloc include/linux/slab.h:666 [inline]
 selinux_tun_dev_alloc_security+0x49/0x170 security/selinux/hooks.c:5012
 security_tun_dev_alloc_security+0x6d/0xa0 security/security.c:1506
 tun_set_iff drivers/net/tun.c:1839 [inline]
 __tun_chr_ioctl+0x1730/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 2919:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x6e/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xd3/0x260 mm/slab.c:3820
 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023
 security_tun_dev_free_security+0x48/0x80 security/security.c:1512
 tun_free_netdev+0x13b/0x1b0 drivers/net/tun.c:1563
 register_netdevice+0x8d0/0xee0 net/core/dev.c:7605
 tun_set_iff drivers/net/tun.c:1859 [inline]
 __tun_chr_ioctl+0x1caf/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

The buggy address belongs to the object at ffff8801d2843b40
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 0 bytes inside of
 32-byte region [ffff8801d2843b40, ffff8801d2843b60)
The buggy address belongs to the page:
page:ffffea000660cea8 count:1 mapcount:0 mapping:ffff8801d2843000 index:0xffff8801d2843fc1
flags: 0x200000000000100(slab)
raw: 0200000000000100 ffff8801d2843000 ffff8801d2843fc1 000000010000003f
raw: ffffea0006626a40 ffffea00066141a0 ffff8801dbc00100
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d2843a00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8801d2843a80: 00 00 00 fc fc fc fc fc fb fb fb fb fc fc fc fc
>ffff8801d2843b00: 00 00 00 00 fc fc fc fc fb fb fb fb fc fc fc fc
                                           ^
 ffff8801d2843b80: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8801d2843c00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc

==================================================================

Fixes: cf124db566e6 ("net: Fix inconsistent teardown and release of private netdev state.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 32ad87345f57..0a2c0a42283f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1879,6 +1879,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 err_detach:
 	tun_detach_all(dev);
+	/* register_netdevice() already called tun_free_netdev() */
+	goto err_free_dev;
+
 err_free_flow:
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);

From 5bfd37b4de5c98e86b12bd13be5aa46c7484a125 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 09:41:54 -0700
Subject: [PATCH 13/27] tipc: fix use-after-free

syszkaller reported use-after-free in tipc [1]

When msg->rep skb is freed, set the pointer to NULL,
so that caller does not free it again.

[1]

==================================================================
BUG: KASAN: use-after-free in skb_push+0xd4/0xe0 net/core/skbuff.c:1466
Read of size 8 at addr ffff8801c6e71e90 by task syz-executor5/4115

CPU: 1 PID: 4115 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #32
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
 skb_push+0xd4/0xe0 net/core/skbuff.c:1466
 tipc_nl_compat_recv+0x833/0x18f0 net/tipc/netlink_compat.c:1209
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4512e9
RSP: 002b:00007f3bc8184c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004512e9
RDX: 0000000000000020 RSI: 0000000020fdb000 RDI: 0000000000000006
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b5e76
R13: 00007f3bc8184b48 R14: 00000000004b5e86 R15: 0000000000000000

Allocated by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc_node+0x13d/0x750 mm/slab.c:3651
 __alloc_skb+0xf1/0x740 net/core/skbuff.c:219
 alloc_skb include/linux/skbuff.h:903 [inline]
 tipc_tlv_alloc+0x26/0xb0 net/tipc/netlink_compat.c:148
 tipc_nl_compat_dumpit+0xf2/0x3c0 net/tipc/netlink_compat.c:248
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 kfree_skbmem+0x1a1/0x1d0 net/core/skbuff.c:622
 __kfree_skb net/core/skbuff.c:682 [inline]
 kfree_skb+0x165/0x4c0 net/core/skbuff.c:699
 tipc_nl_compat_dumpit+0x36a/0x3c0 net/tipc/netlink_compat.c:260
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

The buggy address belongs to the object at ffff8801c6e71dc0
 which belongs to the cache skbuff_head_cache of size 224
The buggy address is located 208 bytes inside of
 224-byte region [ffff8801c6e71dc0, ffff8801c6e71ea0)
The buggy address belongs to the page:
page:ffffea00071b9c40 count:1 mapcount:0 mapping:ffff8801c6e71000 index:0x0
flags: 0x200000000000100(slab)
raw: 0200000000000100 ffff8801c6e71000 0000000000000000 000000010000000c
raw: ffffea0007224a20 ffff8801d98caf48 ffff8801d9e79040 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801c6e71d80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
 ffff8801c6e71e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801c6e71e80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
                         ^
 ffff8801c6e71f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff8801c6e71f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov  <dvyukov@google.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/netlink_compat.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 9bfe886ab330..750949dfc1d7 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -258,13 +258,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
 	arg = nlmsg_new(0, GFP_KERNEL);
 	if (!arg) {
 		kfree_skb(msg->rep);
+		msg->rep = NULL;
 		return -ENOMEM;
 	}
 
 	err = __tipc_nl_compat_dumpit(cmd, msg, arg);
-	if (err)
+	if (err) {
 		kfree_skb(msg->rep);
-
+		msg->rep = NULL;
+	}
 	kfree_skb(arg);
 
 	return err;

From 15339e441ec46fbc3bf3486bb1ae4845b0f1bb8d Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 16 Aug 2017 20:16:40 +0200
Subject: [PATCH 14/27] sctp: fully initialize the IPv6 address in
 sctp_v6_to_addr()

KMSAN reported use of uninitialized sctp_addr->v4.sin_addr.s_addr and
sctp_addr->v6.sin6_scope_id in sctp_v6_cmp_addr() (see below).
Make sure all fields of an IPv6 address are initialized, which
guarantees that the IPv4 fields are also initialized.

==================================================================
 BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0
 net/sctp/ipv6.c:517
 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
 01/01/2011
 Call Trace:
  dump_stack+0x172/0x1c0 lib/dump_stack.c:42
  is_logbuf_locked mm/kmsan/kmsan.c:59 [inline]
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938
  native_save_fl arch/x86/include/asm/irqflags.h:18 [inline]
  arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline]
  arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline]
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467
  sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517
  sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
  sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651
  sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871
  inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg net/socket.c:643 [inline]
  SYSC_sendto+0x608/0x710 net/socket.c:1696
  SyS_sendto+0x8a/0xb0 net/socket.c:1664
  entry_SYSCALL_64_fastpath+0x13/0x94
 RIP: 0033:0x44b479
 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479
 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006
 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c
 R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff
 R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000
 origin description: ----dst_saddr@sctp_v6_get_dst
 local variable created at:
  sk_fullsock include/net/sock.h:2321 [inline]
  inet6_sk include/linux/ipv6.h:309 [inline]
  sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
==================================================================
 BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0
 net/sctp/ipv6.c:517
 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
 01/01/2011
 Call Trace:
  dump_stack+0x172/0x1c0 lib/dump_stack.c:42
  is_logbuf_locked mm/kmsan/kmsan.c:59 [inline]
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938
  native_save_fl arch/x86/include/asm/irqflags.h:18 [inline]
  arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline]
  arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline]
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467
  sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517
  sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
  sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651
  sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871
  inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg net/socket.c:643 [inline]
  SYSC_sendto+0x608/0x710 net/socket.c:1696
  SyS_sendto+0x8a/0xb0 net/socket.c:1664
  entry_SYSCALL_64_fastpath+0x13/0x94
 RIP: 0033:0x44b479
 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479
 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006
 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c
 R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff
 R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000
 origin description: ----dst_saddr@sctp_v6_get_dst
 local variable created at:
  sk_fullsock include/net/sock.h:2321 [inline]
  inet6_sk include/linux/ipv6.h:309 [inline]
  sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
==================================================================

Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 2a186b201ad2..a4b6ffb61495 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -512,7 +512,9 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
 {
 	addr->sa.sa_family = AF_INET6;
 	addr->v6.sin6_port = port;
+	addr->v6.sin6_flowinfo = 0;
 	addr->v6.sin6_addr = *saddr;
+	addr->v6.sin6_scope_id = 0;
 }
 
 /* Compare addresses exactly.

From 383143f31d7d3525a1dbff733d52fff917f82f15 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Wed, 16 Aug 2017 11:18:09 -0700
Subject: [PATCH 15/27] ipv6: reset fn->rr_ptr when replacing route

syzcaller reported the following use-after-free issue in rt6_select():
BUG: KASAN: use-after-free in rt6_select net/ipv6/route.c:755 [inline] at addr ffff8800bc6994e8
BUG: KASAN: use-after-free in ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 at addr ffff8800bc6994e8
Read of size 4 by task syz-executor1/439628
CPU: 0 PID: 439628 Comm: syz-executor1 Not tainted 4.3.5+ #8
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
 0000000000000000 ffff88018fe435b0 ffffffff81ca384d ffff8801d3588c00
 ffff8800bc699380 ffff8800bc699500 dffffc0000000000 ffff8801d40a47c0
 ffff88018fe435d8 ffffffff81735751 ffff88018fe43660 ffff8800bc699380
Call Trace:
 [<ffffffff81ca384d>] __dump_stack lib/dump_stack.c:15 [inline]
 [<ffffffff81ca384d>] dump_stack+0xc1/0x124 lib/dump_stack.c:51
sctp: [Deprecated]: syz-executor0 (pid 439615) Use of struct sctp_assoc_value in delayed_ack socket option.
Use struct sctp_sack_info instead
 [<ffffffff81735751>] kasan_object_err+0x21/0x70 mm/kasan/report.c:158
 [<ffffffff817359c4>] print_address_description mm/kasan/report.c:196 [inline]
 [<ffffffff817359c4>] kasan_report_error+0x1b4/0x4a0 mm/kasan/report.c:285
 [<ffffffff81735d93>] kasan_report mm/kasan/report.c:305 [inline]
 [<ffffffff81735d93>] __asan_report_load4_noabort+0x43/0x50 mm/kasan/report.c:325
 [<ffffffff82a28e39>] rt6_select net/ipv6/route.c:755 [inline]
 [<ffffffff82a28e39>] ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084
 [<ffffffff82a28fb1>] ip6_pol_route_output+0x81/0xb0 net/ipv6/route.c:1203
 [<ffffffff82ab0a50>] fib6_rule_action+0x1f0/0x680 net/ipv6/fib6_rules.c:95
 [<ffffffff8265cbb6>] fib_rules_lookup+0x2a6/0x7a0 net/core/fib_rules.c:223
 [<ffffffff82ab1430>] fib6_rule_lookup+0xd0/0x250 net/ipv6/fib6_rules.c:41
 [<ffffffff82a22006>] ip6_route_output+0x1d6/0x2c0 net/ipv6/route.c:1224
 [<ffffffff829e83d2>] ip6_dst_lookup_tail+0x4d2/0x890 net/ipv6/ip6_output.c:943
 [<ffffffff829e889a>] ip6_dst_lookup_flow+0x9a/0x250 net/ipv6/ip6_output.c:1079
 [<ffffffff82a9f7d8>] ip6_datagram_dst_update+0x538/0xd40 net/ipv6/datagram.c:91
 [<ffffffff82aa0978>] __ip6_datagram_connect net/ipv6/datagram.c:251 [inline]
 [<ffffffff82aa0978>] ip6_datagram_connect+0x518/0xe50 net/ipv6/datagram.c:272
 [<ffffffff82aa1313>] ip6_datagram_connect_v6_only+0x63/0x90 net/ipv6/datagram.c:284
 [<ffffffff8292f790>] inet_dgram_connect+0x170/0x1f0 net/ipv4/af_inet.c:564
 [<ffffffff82565547>] SYSC_connect+0x1a7/0x2f0 net/socket.c:1582
 [<ffffffff8256a649>] SyS_connect+0x29/0x30 net/socket.c:1563
 [<ffffffff82c72032>] entry_SYSCALL_64_fastpath+0x12/0x17
Object at ffff8800bc699380, in cache ip6_dst_cache size: 384

The root cause of it is that in fib6_add_rt2node(), when it replaces an
existing route with the new one, it does not update fn->rr_ptr.
This commit resets fn->rr_ptr to NULL when it points to a route which is
replaced in fib6_add_rt2node().

Fixes: 27596472473a ("ipv6: fix ECMP route replacement")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ebb299cf72b7..e6f878067c68 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -914,6 +914,8 @@ add:
 		}
 		nsiblings = iter->rt6i_nsiblings;
 		fib6_purge_rt(iter, fn, info->nl_net);
+		if (fn->rr_ptr == iter)
+			fn->rr_ptr = NULL;
 		rt6_release(iter);
 
 		if (nsiblings) {
@@ -926,6 +928,8 @@ add:
 				if (rt6_qualify_for_ecmp(iter)) {
 					*ins = iter->dst.rt6_next;
 					fib6_purge_rt(iter, fn, info->nl_net);
+					if (fn->rr_ptr == iter)
+						fn->rr_ptr = NULL;
 					rt6_release(iter);
 					nsiblings--;
 				} else {

From bc3aae2bbac46dd894c89db5d5e98f7f0ef9e205 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 16 Aug 2017 12:38:52 -0700
Subject: [PATCH 16/27] net: check and errout if res->fi is NULL when
 RTM_F_FIB_MATCH is set

Syzkaller hit 'general protection fault in fib_dump_info' bug on
commit 4.13-rc5..

Guilty file: net/ipv4/fib_semantics.c

kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 0 PID: 2808 Comm: syz-executor0 Not tainted 4.13.0-rc5 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
Ubuntu-1.8.2-1ubuntu1 04/01/2014
task: ffff880078562700 task.stack: ffff880078110000
RIP: 0010:fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314
RSP: 0018:ffff880078117010 EFLAGS: 00010206
RAX: dffffc0000000000 RBX: 00000000000000fe RCX: 0000000000000002
RDX: 0000000000000006 RSI: ffff880078117084 RDI: 0000000000000030
RBP: ffff880078117268 R08: 000000000000000c R09: ffff8800780d80c8
R10: 0000000058d629b4 R11: 0000000067fce681 R12: 0000000000000000
R13: ffff8800784bd540 R14: ffff8800780d80b5 R15: ffff8800780d80a4
FS:  00000000022fa940(0000) GS:ffff88007fc00000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004387d0 CR3: 0000000079135000 CR4: 00000000000006f0
Call Trace:
  inet_rtm_getroute+0xc89/0x1f50 net/ipv4/route.c:2766
  rtnetlink_rcv_msg+0x288/0x680 net/core/rtnetlink.c:4217
  netlink_rcv_skb+0x340/0x470 net/netlink/af_netlink.c:2397
  rtnetlink_rcv+0x28/0x30 net/core/rtnetlink.c:4223
  netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
  netlink_unicast+0x4c4/0x6e0 net/netlink/af_netlink.c:1291
  netlink_sendmsg+0x8c4/0xca0 net/netlink/af_netlink.c:1854
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg+0xca/0x110 net/socket.c:643
  ___sys_sendmsg+0x779/0x8d0 net/socket.c:2035
  __sys_sendmsg+0xd1/0x170 net/socket.c:2069
  SYSC_sendmsg net/socket.c:2080 [inline]
  SyS_sendmsg+0x2d/0x50 net/socket.c:2076
  entry_SYSCALL_64_fastpath+0x1a/0xa5
  RIP: 0033:0x4512e9
  RSP: 002b:00007ffc75584cc8 EFLAGS: 00000216 ORIG_RAX:
  000000000000002e
  RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00000000004512e9
  RDX: 0000000000000000 RSI: 0000000020f2cfc8 RDI: 0000000000000003
  RBP: 000000000000000e R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000216 R12: fffffffffffffffe
  R13: 0000000000718000 R14: 0000000020c44ff0 R15: 0000000000000000
  Code: 00 0f b6 8d ec fd ff ff 48 8b 85 f0 fd ff ff 88 48 17 48 8b 45
  28 48 8d 78 30 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03
  <0f>
  b6 04 02 84 c0 74 08 3c 03 0f 8e cb 0c 00 00 48 8b 45 28 44
  RIP: fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314 RSP:
  ffff880078117010
---[ end trace 254a7af28348f88b ]---

This patch adds a res->fi NULL check.

example run:
$ip route get 0.0.0.0 iif virt1-0
broadcast 0.0.0.0 dev lo
    cache <local,brd> iif virt1-0

$ip route get 0.0.0.0 iif virt1-0 fibmatch
RTNETLINK answers: No route to host

Reported-by: idaifish <idaifish@gmail.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Fixes: b61798130f1b ("net: ipv4: RTM_GETROUTE: return matched fib result when requested")
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fe877a4a72b1..2331de20ca50 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2763,14 +2763,21 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
 		table_id = rt->rt_table_id;
 
-	if (rtm->rtm_flags & RTM_F_FIB_MATCH)
+	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+		if (!res.fi) {
+			err = fib_props[res.type].error;
+			if (!err)
+				err = -EHOSTUNREACH;
+			goto errout_free;
+		}
 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
 				    rt->rt_type, res.prefix, res.prefixlen,
 				    fl4.flowi4_tos, res.fi, 0);
-	else
+	} else {
 		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+	}
 	if (err < 0)
 		goto errout_free;
 

From cdbeb633ca71a02b7b63bfeb94994bf4e1a0b894 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 16 Aug 2017 17:53:36 -0400
Subject: [PATCH 17/27] tcp: when rearming RTO, if RTO time is in past then
 fire RTO ASAP

In some situations tcp_send_loss_probe() can realize that it's unable
to send a loss probe (TLP), and falls back to calling tcp_rearm_rto()
to schedule an RTO timer. In such cases, sometimes tcp_rearm_rto()
realizes that the RTO was eligible to fire immediately or at some
point in the past (delta_us <= 0). Previously in such cases
tcp_rearm_rto() was scheduling such "overdue" RTOs to happen at now +
icsk_rto, which caused needless delays of hundreds of milliseconds
(and non-linear behavior that made reproducible testing
difficult). This commit changes the logic to schedule "overdue" RTOs
ASAP, rather than at now + icsk_rto.

Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)")
Suggested-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53de1424c13c..bab7f0493098 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3009,8 +3009,7 @@ void tcp_rearm_rto(struct sock *sk)
 			/* delta_us may not be positive if the socket is locked
 			 * when the retrans timer fires and is rescheduled.
 			 */
-			if (delta_us > 0)
-				rto = usecs_to_jiffies(delta_us);
+			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
 		}
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
 					  TCP_RTO_MAX);

From b6f6d56c91f5261c55edef2df300698c4486b669 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 17 Aug 2017 13:06:14 +0200
Subject: [PATCH 18/27] PCI: Allow PCI express root ports to find themselves

If the pci_find_pcie_root_port() function is called on a root port
itself, return the root port rather than NULL.

This effectively reverts commit 0e405232871d6 ("PCI: fix oops when
try to find Root Port for a PCI device") which added an extra check
that would now be redundant.

Fixes: a99b646afa8a ("PCI: Disable PCIe Relaxed Ordering if unsupported")
Fixes: c56d4450eb68 ("PCI: Turn off Request Attributes to avoid Chelsio T5 Completion erratum")
Signed-off-by: Thierry Reding <treding@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Shawn Lin <shawn.lin@rock-chips.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/pci/pci.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index da5570cf5c6a..fdf65a6c13f6 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -514,7 +514,7 @@ EXPORT_SYMBOL(pci_find_resource);
  */
 struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev)
 {
-	struct pci_dev *bridge, *highest_pcie_bridge = NULL;
+	struct pci_dev *bridge, *highest_pcie_bridge = dev;
 
 	bridge = pci_upstream_bridge(dev);
 	while (bridge && pci_is_pcie(bridge)) {
@@ -522,11 +522,10 @@ struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev)
 		bridge = pci_upstream_bridge(bridge);
 	}
 
-	if (highest_pcie_bridge &&
-	    pci_pcie_type(highest_pcie_bridge) == PCI_EXP_TYPE_ROOT_PORT)
-		return highest_pcie_bridge;
+	if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT)
+		return NULL;
 
-	return NULL;
+	return highest_pcie_bridge;
 }
 EXPORT_SYMBOL(pci_find_pcie_root_port);
 

From ca3d89a3ebe79367bd41b6b8ba37664478ae2dba Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 17 Aug 2017 18:29:52 +0300
Subject: [PATCH 19/27] net/mlx4_core: Enable 4K UAR if SRIOV module parameter
 is not enabled

enable_4k_uar module parameter was added in patch cited below to
address the backward compatibility issue in SRIOV when the VM has
system's PAGE_SIZE uar implementation and the Hypervisor has 4k uar
implementation.

The above compatibility issue does not exist in the non SRIOV case.
In this patch, we always enable 4k uar implementation if SRIOV
is not enabled on mlx4's supported cards.

Fixes: 76e39ccf9c36 ("net/mlx4_core: Fix backward compatibility on VFs")
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 09b9bc17bce9..5fe5cdc51357 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -432,7 +432,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		/* Virtual PCI function needs to determine UAR page size from
 		 * firmware. Only master PCI function can set the uar page size
 		 */
-		if (enable_4k_uar)
+		if (enable_4k_uar || !dev->persist->num_vfs)
 			dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT;
 		else
 			dev->uar_page_shift = PAGE_SHIFT;
@@ -2277,7 +2277,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 
 		dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
 
-		if (enable_4k_uar) {
+		if (enable_4k_uar || !dev->persist->num_vfs) {
 			init_hca.log_uar_sz = ilog2(dev->caps.num_uars) +
 						    PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT;
 			init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12;

From b024d949a3c24255a7ef1a470420eb478949aa4c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 17 Aug 2017 23:14:58 +0100
Subject: [PATCH 20/27] irda: do not leak initialized list.dev to userspace

list.dev has not been initialized and so the copy_to_user is copying
data from the stack back to user space which is a potential
information leak. Fix this ensuring all of list is initialized to
zero.

Detected by CoverityScan, CID#1357894 ("Uninitialized scalar variable")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/irda/af_irda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 2e6990f8b80b..23fa7c8b09a5 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -2213,7 +2213,7 @@ static int irda_getsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	struct irda_sock *self = irda_sk(sk);
-	struct irda_device_list list;
+	struct irda_device_list list = { 0 };
 	struct irda_device_info *discoveries;
 	struct irda_ias_set *	ias_opt;	/* IAS get/query params */
 	struct ias_object *	ias_obj;	/* Object in IAS */

From 9a19bad70cf16b0cdf3576efda7deb490e7aa529 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 18 Aug 2017 00:19:42 +0100
Subject: [PATCH 21/27] rxrpc: Fix oops when discarding a preallocated service
 call

rxrpc_service_prealloc_one() doesn't set the socket pointer on any new call
it preallocates, but does add it to the rxrpc net namespace call list.
This, however, causes rxrpc_put_call() to oops when the call is discarded
when the socket is closed.  rxrpc_put_call() needs the socket to be able to
reach the namespace so that it can use a lock held therein.

Fix this by setting a call's socket pointer immediately before discarding
it.

This can be triggered by unloading the kafs module, resulting in an oops
like the following:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000030
IP: rxrpc_put_call+0x1e2/0x32d
PGD 0
P4D 0
Oops: 0000 [#1] SMP
Modules linked in: kafs(E-)
CPU: 3 PID: 3037 Comm: rmmod Tainted: G            E   4.12.0-fscache+ #213
Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
task: ffff8803fc92e2c0 task.stack: ffff8803fef74000
RIP: 0010:rxrpc_put_call+0x1e2/0x32d
RSP: 0018:ffff8803fef77e08 EFLAGS: 00010282
RAX: 0000000000000000 RBX: ffff8803fab99ac0 RCX: 000000000000000f
RDX: ffffffff81c50a40 RSI: 000000000000000c RDI: ffff8803fc92ea88
RBP: ffff8803fef77e30 R08: ffff8803fc87b941 R09: ffffffff82946d20
R10: ffff8803fef77d10 R11: 00000000000076fc R12: 0000000000000005
R13: ffff8803fab99c20 R14: 0000000000000001 R15: ffffffff816c6aee
FS:  00007f915a059700(0000) GS:ffff88041fb80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000030 CR3: 00000003fef39000 CR4: 00000000001406e0
Call Trace:
 rxrpc_discard_prealloc+0x325/0x341
 rxrpc_listen+0xf9/0x146
 kernel_listen+0xb/0xd
 afs_close_socket+0x3e/0x173 [kafs]
 afs_exit+0x1f/0x57 [kafs]
 SyS_delete_module+0x10f/0x19a
 do_syscall_64+0x8a/0x149
 entry_SYSCALL64_slow_path+0x25/0x25

Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/call_accept.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index dd30d74824b0..ec3383f97d4c 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -223,6 +223,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
 	tail = b->call_backlog_tail;
 	while (CIRC_CNT(head, tail, size) > 0) {
 		struct rxrpc_call *call = b->call_backlog[tail];
+		call->socket = rx;
 		if (rx->discard_new_call) {
 			_debug("discard %lx", call->user_call_ID);
 			rx->discard_new_call(call, call->user_call_ID);

From 4f8a881acc9d1adaf1e552349a0b1df28933a04c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 18 Aug 2017 11:01:36 +0800
Subject: [PATCH 22/27] net: sched: fix NULL pointer dereference when action
 calls some targets

As we know in some target's checkentry it may dereference par.entryinfo
to check entry stuff inside. But when sched action calls xt_check_target,
par.entryinfo is set with NULL. It would cause kernel panic when calling
some targets.

It can be reproduce with:
  # tc qd add dev eth1 ingress handle ffff:
  # tc filter add dev eth1 parent ffff: u32 match u32 0 0 action xt \
    -j ECN --ecn-tcp-remove

It could also crash kernel when using target CLUSTERIP or TPROXY.

By now there's no proper value for par.entryinfo in ipt_init_target,
but it can not be set with NULL. This patch is to void all these
panics by setting it with an ipt_entry obj with all members = 0.

Note that this issue has been there since the very beginning.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ipt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d516ba8178b8..541707802a23 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -41,6 +41,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t,
 {
 	struct xt_tgchk_param par;
 	struct xt_target *target;
+	struct ipt_entry e = {};
 	int ret = 0;
 
 	target = xt_request_find_target(AF_INET, t->u.user.name,
@@ -52,6 +53,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t,
 	memset(&par, 0, sizeof(par));
 	par.net       = net;
 	par.table     = table;
+	par.entryinfo = &e;
 	par.target    = target;
 	par.targinfo  = t->data;
 	par.hook_mask = hook;

From d4dd2d75a26ef07cadc2949efeea9fabc2a5c299 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 21 Aug 2017 00:26:03 +0200
Subject: [PATCH 23/27] bpf, doc: also add s390x as arch to sysctl description

Looks like this was accidentally missed, so still add s390x
as supported eBPF JIT arch to bpf_jit_enable.

Fixes: 014cd0a368dc ("bpf: Update sysctl documentation to list all supported architectures")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index d7c2b88b92ae..28596e03220b 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -49,6 +49,7 @@ two flavors of JITs, the newer eBPF JIT currently supported on:
   - ppc64
   - sparc64
   - mips64
+  - s390x
 
 And the older cBPF JIT supported on the following archs:
   - arm

From 5a78449810b06c3bc5fcd002d52e1a64f9bb397e Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Mon, 21 Aug 2017 08:52:54 +1200
Subject: [PATCH 24/27] switchdev: documentation: minor typo fixes

Two typos in switchdev.txt

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/switchdev.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt
index 3e7b946dea27..5e40e1f68873 100644
--- a/Documentation/networking/switchdev.txt
+++ b/Documentation/networking/switchdev.txt
@@ -228,7 +228,7 @@ Learning on the device port should be enabled, as well as learning_sync:
 	bridge link set dev DEV learning on self
 	bridge link set dev DEV learning_sync on self
 
-Learning_sync attribute enables syncing of the learned/forgotton FDB entry to
+Learning_sync attribute enables syncing of the learned/forgotten FDB entry to
 the bridge's FDB.  It's possible, but not optimal, to enable learning on the
 device port and on the bridge port, and disable learning_sync.
 
@@ -245,7 +245,7 @@ the responsibility of the port driver/device to age out these entries.  If the
 port device supports ageing, when the FDB entry expires, it will notify the
 driver which in turn will notify the bridge with SWITCHDEV_FDB_DEL.  If the
 device does not support ageing, the driver can simulate ageing using a
-garbage collection timer to monitor FBD entries.  Expired entries will be
+garbage collection timer to monitor FDB entries.  Expired entries will be
 notified to the bridge using SWITCHDEV_FDB_DEL.  See rocker driver for
 example of driver running ageing timer.
 

From 49bf4b36fdee4db3c8bc0507a8413a93a8c711cf Mon Sep 17 00:00:00 2001
From: Eric Leblond <eric@regit.org>
Date: Sun, 20 Aug 2017 21:48:14 +0200
Subject: [PATCH 25/27] tools lib bpf: improve warning

Signed-off-by: Eric Leblond <eric@regit.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/lib/bpf/libbpf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1a2c07eb7795..8c67a90dbd82 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -879,7 +879,8 @@ bpf_object__create_maps(struct bpf_object *obj)
 			size_t j;
 			int err = *pfd;
 
-			pr_warning("failed to create map: %s\n",
+			pr_warning("failed to create map (name: '%s'): %s\n",
+				   obj->maps[i].name,
 				   strerror(errno));
 			for (j = 0; j < i; j++)
 				zclose(obj->maps[j].fd);

From 68a66d149a8c78ec6720f268597302883e48e9fa Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Sat, 19 Aug 2017 15:37:07 +0300
Subject: [PATCH 26/27] net_sched: fix order of queue length updates in
 qdisc_replace()

This important to call qdisc_tree_reduce_backlog() after changing queue
length. Parent qdisc should deactivate class in ->qlen_notify() called from
qdisc_tree_reduce_backlog() but this happens only if qdisc->q.qlen in zero.

Missed class deactivations leads to crashes/warnings at picking packets
from empty qdisc and corrupting state at reactivating this class in future.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Fixes: 86a7996cc8a0 ("net_sched: introduce qdisc_replace() helper")
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 1c123e2b2415..67f815e5d525 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -806,8 +806,11 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
 	old = *pold;
 	*pold = new;
 	if (old != NULL) {
-		qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog);
+		unsigned int qlen = old->q.qlen;
+		unsigned int backlog = old->qstats.backlog;
+
 		qdisc_reset(old);
+		qdisc_tree_reduce_backlog(old, qlen, backlog);
 	}
 	sch_tree_unlock(sch);
 

From 348a4002729ccab8b888b38cbc099efa2f2a2036 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Fri, 18 Aug 2017 17:14:49 -0700
Subject: [PATCH 27/27] ipv6: repair fib6 tree in failure case

In fib6_add(), it is possible that fib6_add_1() picks an intermediate
node and sets the node's fn->leaf to NULL in order to add this new
route. However, if fib6_add_rt2node() fails to add the new
route for some reason, fn->leaf will be left as NULL and could
potentially cause crash when fn->leaf is accessed in fib6_locate().
This patch makes sure fib6_repair_tree() is called to properly repair
fn->leaf in the above failure case.

Here is the syzkaller reported general protection fault in fib6_locate:
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 0 PID: 40937 Comm: syz-executor3 Not tainted
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
task: ffff8801d7d64100 ti: ffff8801d01a0000 task.ti: ffff8801d01a0000
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] __ipv6_prefix_equal64_half include/net/ipv6.h:475 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] ipv6_prefix_equal include/net/ipv6.h:492 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] fib6_locate_1 net/ipv6/ip6_fib.c:1210 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] fib6_locate+0x281/0x3c0 net/ipv6/ip6_fib.c:1233
RSP: 0018:ffff8801d01a36a8  EFLAGS: 00010202
RAX: 0000000000000020 RBX: ffff8801bc790e00 RCX: ffffc90002983000
RDX: 0000000000001219 RSI: ffff8801d01a37a0 RDI: 0000000000000100
RBP: ffff8801d01a36f0 R08: 00000000000000ff R09: 0000000000000000
R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001
R13: dffffc0000000000 R14: ffff8801d01a37a0 R15: 0000000000000000
FS:  00007f6afd68c700(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004c6340 CR3: 00000000ba41f000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Stack:
 ffff8801d01a37a8 ffff8801d01a3780 ffffed003a0346f5 0000000c82a23ea0
 ffff8800b7bd7700 ffff8801d01a3780 ffff8800b6a1c940 ffffffff82a23ea0
 ffff8801d01a3920 ffff8801d01a3748 ffffffff82a223d6 ffff8801d7d64988
Call Trace:
 [<ffffffff82a223d6>] ip6_route_del+0x106/0x570 net/ipv6/route.c:2109
 [<ffffffff82a23f9d>] inet6_rtm_delroute+0xfd/0x100 net/ipv6/route.c:3075
 [<ffffffff82621359>] rtnetlink_rcv_msg+0x549/0x7a0 net/core/rtnetlink.c:3450
 [<ffffffff8274c1d1>] netlink_rcv_skb+0x141/0x370 net/netlink/af_netlink.c:2281
 [<ffffffff82613ddf>] rtnetlink_rcv+0x2f/0x40 net/core/rtnetlink.c:3456
 [<ffffffff8274ad38>] netlink_unicast_kernel net/netlink/af_netlink.c:1206 [inline]
 [<ffffffff8274ad38>] netlink_unicast+0x518/0x750 net/netlink/af_netlink.c:1232
 [<ffffffff8274b83e>] netlink_sendmsg+0x8ce/0xc30 net/netlink/af_netlink.c:1778
 [<ffffffff82564aff>] sock_sendmsg_nosec net/socket.c:609 [inline]
 [<ffffffff82564aff>] sock_sendmsg+0xcf/0x110 net/socket.c:619
 [<ffffffff82564d62>] sock_write_iter+0x222/0x3a0 net/socket.c:834
 [<ffffffff8178523d>] new_sync_write+0x1dd/0x2b0 fs/read_write.c:478
 [<ffffffff817853f4>] __vfs_write+0xe4/0x110 fs/read_write.c:491
 [<ffffffff81786c38>] vfs_write+0x178/0x4b0 fs/read_write.c:538
 [<ffffffff817892a9>] SYSC_write fs/read_write.c:585 [inline]
 [<ffffffff817892a9>] SyS_write+0xd9/0x1b0 fs/read_write.c:577
 [<ffffffff82c71e32>] entry_SYSCALL_64_fastpath+0x12/0x17

Note: there is no "Fixes" tag as this seems to be a bug introduced
very early.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e6f878067c68..5cc0ea038198 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1018,7 +1018,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			/* Create subtree root node */
 			sfn = node_alloc();
 			if (!sfn)
-				goto st_failure;
+				goto failure;
 
 			sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
 			atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
@@ -1035,12 +1035,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			if (IS_ERR(sn)) {
 				/* If it is failed, discard just allocated
-				   root, and then (in st_failure) stale node
+				   root, and then (in failure) stale node
 				   in main tree.
 				 */
 				node_free(sfn);
 				err = PTR_ERR(sn);
-				goto st_failure;
+				goto failure;
 			}
 
 			/* Now link new subtree to main tree */
@@ -1055,7 +1055,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			if (IS_ERR(sn)) {
 				err = PTR_ERR(sn);
-				goto st_failure;
+				goto failure;
 			}
 		}
 
@@ -1096,18 +1096,17 @@ out:
 			atomic_inc(&pn->leaf->rt6i_ref);
 		}
 #endif
-		/* Always release dst as dst->__refcnt is guaranteed
-		 * to be taken before entering this function
-		 */
-		dst_release_immediate(&rt->dst);
+		goto failure;
 	}
 	return err;
 
-#ifdef CONFIG_IPV6_SUBTREES
-	/* Subtree creation failed, probably main tree node
-	   is orphan. If it is, shoot it.
+failure:
+	/* fn->leaf could be NULL if fn is an intermediate node and we
+	 * failed to add the new route to it in both subtree creation
+	 * failure and fib6_add_rt2node() failure case.
+	 * In both cases, fib6_repair_tree() should be called to fix
+	 * fn->leaf.
 	 */
-st_failure:
 	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
 		fib6_repair_tree(info->nl_net, fn);
 	/* Always release dst as dst->__refcnt is guaranteed
@@ -1115,7 +1114,6 @@ st_failure:
 	 */
 	dst_release_immediate(&rt->dst);
 	return err;
-#endif
 }
 
 /*