From 4288ea006c73e37c2a4f60dfaef20dd167b8df31 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:33:21 +0100 Subject: [PATCH 01/26] bpf: harden flags check in cgroup_storage_update_elem() cgroup_storage_update_elem() shouldn't accept any flags argument values except BPF_ANY and BPF_EXIST to guarantee the backward compatibility, had a new flag value been added. Fixes: de9cbbaadba5 ("bpf: introduce cgroup storage maps") Signed-off-by: Roman Gushchin Reported-by: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..94126cbffc88 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -129,7 +129,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags & BPF_NOEXIST) + if (flags != BPF_ANY && flags != BPF_EXIST) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, From 848e616e66d4592fe9afc40743d3504deb7632b4 Mon Sep 17 00:00:00 2001 From: Stefan Seyfried Date: Sun, 30 Sep 2018 12:53:00 +0200 Subject: [PATCH 02/26] cfg80211: fix wext-compat memory leak cfg80211_wext_giwrate and sinfo.pertid might allocate sinfo.pertid via rdev_get_station(), but never release it. Fix that. Fixes: 8689c051a201 ("cfg80211: dynamically allocate per-tid stats for station info") Signed-off-by: Stefan Seyfried [johannes: fix error path, use cfg80211_sinfo_release_content(), add Fixes] Signed-off-by: Johannes Berg --- net/wireless/wext-compat.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 167f7025ac98..06943d9c9835 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1278,12 +1278,16 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) - return -EOPNOTSUPP; + if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { + err = -EOPNOTSUPP; + goto free; + } rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); - return 0; +free: + cfg80211_sinfo_release_content(&sinfo); + return err; } /* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */ @@ -1293,7 +1297,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use static structs */ static struct iw_statistics wstats; - static struct station_info sinfo; + static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) @@ -1352,6 +1356,8 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; + cfg80211_sinfo_release_content(&sinfo); + return &wstats; } From 211710ca74adf790b46ab3867fcce8047b573cd1 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 29 Sep 2018 16:01:58 +0200 Subject: [PATCH 03/26] mac80211: fix setting IEEE80211_KEY_FLAG_RX_MGMT for AP mode keys key->sta is only valid after ieee80211_key_link, which is called later in this function. Because of that, the IEEE80211_KEY_FLAG_RX_MGMT is never set when management frame protection is enabled. Fixes: e548c49e6dc6b ("mac80211: add key flag for management keys") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d25da0e66da1..5d22eda8a6b1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -427,7 +427,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: /* Keys without a station are used for TX only */ - if (key->sta && test_sta_flag(key->sta, WLAN_STA_MFP)) + if (sta && test_sta_flag(sta, WLAN_STA_MFP)) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_ADHOC: From 1db58529454742f67ebd96e3588315e880b72837 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 27 Sep 2018 17:05:04 -0600 Subject: [PATCH 04/26] cfg80211: fix use-after-free in reg_process_hint() reg_process_hint_country_ie() can free regulatory_request and return REG_REQ_ALREADY_SET. We shouldn't use regulatory_request after it's called. KASAN error was observed when this happens. BUG: KASAN: use-after-free in reg_process_hint+0x839/0x8aa [cfg80211] Read of size 4 at addr ffff8800c430d434 by task kworker/1:3/89 Workqueue: events reg_todo [cfg80211] Call Trace: dump_stack+0xc1/0x10c ? _atomic_dec_and_lock+0x1ad/0x1ad ? _raw_spin_lock_irqsave+0xa0/0xd2 print_address_description+0x86/0x26f ? reg_process_hint+0x839/0x8aa [cfg80211] kasan_report+0x241/0x29b reg_process_hint+0x839/0x8aa [cfg80211] reg_todo+0x204/0x5b9 [cfg80211] process_one_work+0x55f/0x8d0 ? worker_detach_from_pool+0x1b5/0x1b5 ? _raw_spin_unlock_irq+0x65/0xdd ? _raw_spin_unlock_irqrestore+0xf3/0xf3 worker_thread+0x5dd/0x841 ? kthread_parkme+0x1d/0x1d kthread+0x270/0x285 ? pr_cont_work+0xe3/0xe3 ? rcu_read_unlock_sched_notrace+0xca/0xca ret_from_fork+0x22/0x40 Allocated by task 2718: set_track+0x63/0xfa __kmalloc+0x119/0x1ac regulatory_hint_country_ie+0x38/0x329 [cfg80211] __cfg80211_connect_result+0x854/0xadd [cfg80211] cfg80211_rx_assoc_resp+0x3bc/0x4f0 [cfg80211] smsc95xx v1.0.6 ieee80211_sta_rx_queued_mgmt+0x1803/0x7ed5 [mac80211] ieee80211_iface_work+0x411/0x696 [mac80211] process_one_work+0x55f/0x8d0 worker_thread+0x5dd/0x841 kthread+0x270/0x285 ret_from_fork+0x22/0x40 Freed by task 89: set_track+0x63/0xfa kasan_slab_free+0x6a/0x87 kfree+0xdc/0x470 reg_process_hint+0x31e/0x8aa [cfg80211] reg_todo+0x204/0x5b9 [cfg80211] process_one_work+0x55f/0x8d0 worker_thread+0x5dd/0x841 kthread+0x270/0x285 ret_from_fork+0x22/0x40 Signed-off-by: Yu Zhao Signed-off-by: Johannes Berg --- net/wireless/reg.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 765dedb12361..24cfa2776f50 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2661,11 +2661,12 @@ static void reg_process_hint(struct regulatory_request *reg_request) { struct wiphy *wiphy = NULL; enum reg_request_treatment treatment; + enum nl80211_reg_initiator initiator = reg_request->initiator; if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); - switch (reg_request->initiator) { + switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: treatment = reg_process_hint_core(reg_request); break; @@ -2683,7 +2684,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) treatment = reg_process_hint_country_ie(wiphy, reg_request); break; default: - WARN(1, "invalid initiator %d\n", reg_request->initiator); + WARN(1, "invalid initiator %d\n", initiator); goto out_free; } @@ -2698,7 +2699,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - wiphy_update_regulatory(wiphy, reg_request->initiator); + wiphy_update_regulatory(wiphy, initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_check_channels(); } From b0584ea66d73919cbf5878a3420a837f06ab8396 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 2 Oct 2018 02:41:53 +0000 Subject: [PATCH 05/26] bpf: don't accept cgroup local storage with zero value size Explicitly forbid creating cgroup local storage maps with zero value size, as it makes no sense and might even cause a panic. Reported-by: syzbot+18628320d3b14a5c459c@syzkaller.appspotmail.com Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 94126cbffc88..830d7f095748 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -195,6 +195,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); From f3c84a8e3e922afdcbc55f04df8fdf8a548f5a21 Mon Sep 17 00:00:00 2001 From: Nir Dotan Date: Thu, 4 Oct 2018 15:48:02 +0000 Subject: [PATCH 06/26] mlxsw: pci: Derive event type from event queue number Due to a hardware issue in Spectrum-2, the field event_type of the event queue element (EQE) has become reserved. It was used to distinguish between command interface completion events and completion events. Use queue number to determine event type, as command interface completion events are always received on EQ0 and mlxsw driver maps completion events to EQ1. Fixes: c3ab435466d5 ("mlxsw: spectrum: Extend to support Spectrum-2 ASIC") Signed-off-by: Nir Dotan Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 4d271fb3de3d..5890fdfd62c3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -718,14 +718,17 @@ static void mlxsw_pci_eq_tasklet(unsigned long data) memset(&active_cqns, 0, sizeof(active_cqns)); while ((eqe = mlxsw_pci_eq_sw_eqe_get(q))) { - u8 event_type = mlxsw_pci_eqe_event_type_get(eqe); - switch (event_type) { - case MLXSW_PCI_EQE_EVENT_TYPE_CMD: + /* Command interface completion events are always received on + * queue MLXSW_PCI_EQ_ASYNC_NUM (EQ0) and completion events + * are mapped to queue MLXSW_PCI_EQ_COMP_NUM (EQ1). + */ + switch (q->num) { + case MLXSW_PCI_EQ_ASYNC_NUM: mlxsw_pci_eq_cmd_event(mlxsw_pci, eqe); q->u.eq.ev_cmd_count++; break; - case MLXSW_PCI_EQE_EVENT_TYPE_COMP: + case MLXSW_PCI_EQ_COMP_NUM: cqn = mlxsw_pci_eqe_cqn_get(eqe); set_bit(cqn, active_cqns); cq_handle = true; From c360867ec46a4ec5cb19e5c329d65dff522cc69d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 4 Oct 2018 15:48:03 +0000 Subject: [PATCH 07/26] mlxsw: spectrum: Delete RIF when VLAN device is removed In commit 602b74eda813 ("mlxsw: spectrum_switchdev: Do not leak RIFs when removing bridge") I handled the case where RIFs created for VLAN devices were not properly cleaned up when their real device (a bridge) was removed. However, I forgot to handle the case of the VLAN device itself being removed. Do so now when the VLAN device is being unlinked from its real device. Fixes: 99f44bb3527b ("mlxsw: spectrum: Enable L3 interfaces on top of bridge devices") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reported-by: Artem Shvorin Tested-by: Artem Shvorin Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index b492152c8881..30bb2c533cec 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4845,6 +4845,8 @@ static int mlxsw_sp_netdevice_bridge_event(struct net_device *br_dev, upper_dev = info->upper_dev; if (info->linking) break; + if (is_vlan_dev(upper_dev)) + mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, upper_dev); if (netif_is_macvlan(upper_dev)) mlxsw_sp_rif_macvlan_del(mlxsw_sp, upper_dev); break; From f7b2a56e1f3dcbdb4cf09b2b63e859ffe0e09df8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 28 Sep 2018 17:04:30 -0600 Subject: [PATCH 08/26] net/usb: cancel pending work when unbinding smsc75xx Cancel pending work before freeing smsc75xx private data structure during binding. This fixes the following crash in the driver: BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 IP: mutex_lock+0x2b/0x3f Workqueue: events smsc75xx_deferred_multicast_write [smsc75xx] task: ffff8caa83e85700 task.stack: ffff948b80518000 RIP: 0010:mutex_lock+0x2b/0x3f Call Trace: smsc75xx_deferred_multicast_write+0x40/0x1af [smsc75xx] process_one_work+0x18d/0x2fc worker_thread+0x1a2/0x269 ? pr_cont_work+0x58/0x58 kthread+0xfa/0x10a ? pr_cont_work+0x58/0x58 ? rcu_read_unlock_sched_notrace+0x48/0x48 ret_from_fork+0x22/0x40 Signed-off-by: Yu Zhao Signed-off-by: David S. Miller --- drivers/net/usb/smsc75xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index e5a4cbb366dc..ec287c9741e8 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -1520,6 +1520,7 @@ static void smsc75xx_unbind(struct usbnet *dev, struct usb_interface *intf) { struct smsc75xx_priv *pdata = (struct smsc75xx_priv *)(dev->data[0]); if (pdata) { + cancel_work_sync(&pdata->set_multicast); netif_dbg(dev, ifdown, dev->net, "free pdata\n"); kfree(pdata); pdata = NULL; From 471b83bd8bbe4e89743683ef8ecb78f7029d8288 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 1 Oct 2018 12:21:59 +0300 Subject: [PATCH 09/26] team: Forbid enslaving team device to itself team's ndo_add_slave() acquires 'team->lock' and later tries to open the newly enslaved device via dev_open(). This emits a 'NETDEV_UP' event that causes the VLAN driver to add VLAN 0 on the team device. team's ndo_vlan_rx_add_vid() will also try to acquire 'team->lock' and deadlock. Fix this by checking early at the enslavement function that a team device is not being enslaved to itself. A similar check was added to the bond driver in commit 09a89c219baf ("bonding: disallow enslaving a bond to itself"). WARNING: possible recursive locking detected 4.18.0-rc7+ #176 Not tainted -------------------------------------------- syz-executor4/6391 is trying to acquire lock: (____ptrval____) (&team->lock){+.+.}, at: team_vlan_rx_add_vid+0x3b/0x1e0 drivers/net/team/team.c:1868 but task is already holding lock: (____ptrval____) (&team->lock){+.+.}, at: team_add_slave+0xdb/0x1c30 drivers/net/team/team.c:1947 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&team->lock); lock(&team->lock); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by syz-executor4/6391: #0: (____ptrval____) (rtnl_mutex){+.+.}, at: rtnl_lock net/core/rtnetlink.c:77 [inline] #0: (____ptrval____) (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x412/0xc30 net/core/rtnetlink.c:4662 #1: (____ptrval____) (&team->lock){+.+.}, at: team_add_slave+0xdb/0x1c30 drivers/net/team/team.c:1947 stack backtrace: CPU: 1 PID: 6391 Comm: syz-executor4 Not tainted 4.18.0-rc7+ #176 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 print_deadlock_bug kernel/locking/lockdep.c:1765 [inline] check_deadlock kernel/locking/lockdep.c:1809 [inline] validate_chain kernel/locking/lockdep.c:2405 [inline] __lock_acquire.cold.64+0x1fb/0x486 kernel/locking/lockdep.c:3435 lock_acquire+0x1e4/0x540 kernel/locking/lockdep.c:3924 __mutex_lock_common kernel/locking/mutex.c:757 [inline] __mutex_lock+0x176/0x1820 kernel/locking/mutex.c:894 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:909 team_vlan_rx_add_vid+0x3b/0x1e0 drivers/net/team/team.c:1868 vlan_add_rx_filter_info+0x14a/0x1d0 net/8021q/vlan_core.c:210 __vlan_vid_add net/8021q/vlan_core.c:278 [inline] vlan_vid_add+0x63e/0x9d0 net/8021q/vlan_core.c:308 vlan_device_event.cold.12+0x2a/0x2f net/8021q/vlan.c:381 notifier_call_chain+0x180/0x390 kernel/notifier.c:93 __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1735 call_netdevice_notifiers net/core/dev.c:1753 [inline] dev_open+0x173/0x1b0 net/core/dev.c:1433 team_port_add drivers/net/team/team.c:1219 [inline] team_add_slave+0xa8b/0x1c30 drivers/net/team/team.c:1948 do_set_master+0x1c9/0x220 net/core/rtnetlink.c:2248 do_setlink+0xba4/0x3e10 net/core/rtnetlink.c:2382 rtnl_setlink+0x2a9/0x400 net/core/rtnetlink.c:2636 rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4665 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2455 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4683 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x5a0/0x760 net/netlink/af_netlink.c:1343 netlink_sendmsg+0xa18/0xfd0 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:642 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:652 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2126 __sys_sendmsg+0x11d/0x290 net/socket.c:2164 __do_sys_sendmsg net/socket.c:2173 [inline] __se_sys_sendmsg net/socket.c:2171 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2171 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x456b29 Code: fd b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f9706bf8c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f9706bf96d4 RCX: 0000000000456b29 RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000004 RBP: 00000000009300a0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000004d3548 R14: 00000000004c8227 R15: 0000000000000000 Fixes: 87002b03baab ("net: introduce vlan_vid_[add/del] and use them instead of direct [add/kill]_vid ndo calls") Signed-off-by: Ido Schimmel Reported-and-tested-by: syzbot+bd051aba086537515cdb@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- drivers/net/team/team.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 6a047d30e8c6..d887016e54b6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1167,6 +1167,12 @@ static int team_port_add(struct team *team, struct net_device *port_dev, return -EBUSY; } + if (dev == port_dev) { + NL_SET_ERR_MSG(extack, "Cannot enslave team device to itself"); + netdev_err(dev, "Cannot enslave team device to itself\n"); + return -EINVAL; + } + if (port_dev->features & NETIF_F_VLAN_CHALLENGED && vlan_uses_dev(dev)) { NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up"); From dbe80d446c859873820eedfff4abc61c71f1927b Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 5 Oct 2018 00:26:00 -0400 Subject: [PATCH 10/26] bnxt_en: Fix VNIC reservations on the PF. The enables bit for VNIC was set wrong when calling the HWRM_FUNC_CFG firmware call to reserve VNICs. This has the effect that the firmware will keep a large number of VNICs for the PF, and having very few for VFs. DPDK driver running on the VFs, which requires more VNICs, may not work properly as a result. Fixes: 674f50a5b026 ("bnxt_en: Implement new method to reserve rings.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0478e562abac..2564a92dcb02 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4650,7 +4650,7 @@ __bnxt_hwrm_reserve_pf_rings(struct bnxt *bp, struct hwrm_func_cfg_input *req, FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; enables |= ring_grps ? FUNC_CFG_REQ_ENABLES_NUM_HW_RING_GRPS : 0; - enables |= vnics ? FUNC_VF_CFG_REQ_ENABLES_NUM_VNICS : 0; + enables |= vnics ? FUNC_CFG_REQ_ENABLES_NUM_VNICS : 0; req->num_rx_rings = cpu_to_le16(rx_rings); req->num_hw_ring_grps = cpu_to_le16(ring_grps); From 5db0e0969af6501ad45fe0494039d3b9c797822b Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 5 Oct 2018 00:26:01 -0400 Subject: [PATCH 11/26] bnxt_en: Fix enables field in HWRM_QUEUE_COS2BW_CFG request In HWRM_QUEUE_COS2BW_CFG request, enables field should have the bits set only for the queue ids which are having the valid parameters. This causes firmware to return error when the TC to hardware CoS queue mapping is not 1:1 during DCBNL ETS setup. Fixes: 2e8ef77ee0ff ("bnxt_en: Add TC to hardware QoS queue mapping logic.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c index ddc98c359488..a85d2be986af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c @@ -98,13 +98,13 @@ static int bnxt_hwrm_queue_cos2bw_cfg(struct bnxt *bp, struct ieee_ets *ets, bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_COS2BW_CFG, -1, -1); for (i = 0; i < max_tc; i++) { - u8 qidx; + u8 qidx = bp->tc_to_qidx[i]; req.enables |= cpu_to_le32( - QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID << i); + QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID << + qidx); memset(&cos2bw, 0, sizeof(cos2bw)); - qidx = bp->tc_to_qidx[i]; cos2bw.queue_id = bp->q_info[qidx].queue_id; if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_STRICT) { cos2bw.tsa = From a2bf74f4e1b82395dad2b08d2a911d9151db71c1 Mon Sep 17 00:00:00 2001 From: Venkat Duvvuru Date: Fri, 5 Oct 2018 00:26:02 -0400 Subject: [PATCH 12/26] bnxt_en: free hwrm resources, if driver probe fails. When the driver probe fails, all the resources that were allocated prior to the failure must be freed. However, hwrm dma response memory is not getting freed. This patch fixes the problem described above. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Signed-off-by: Venkat Duvvuru Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2564a92dcb02..3718984a8185 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3017,10 +3017,11 @@ static void bnxt_free_hwrm_resources(struct bnxt *bp) { struct pci_dev *pdev = bp->pdev; - dma_free_coherent(&pdev->dev, PAGE_SIZE, bp->hwrm_cmd_resp_addr, - bp->hwrm_cmd_resp_dma_addr); - - bp->hwrm_cmd_resp_addr = NULL; + if (bp->hwrm_cmd_resp_addr) { + dma_free_coherent(&pdev->dev, PAGE_SIZE, bp->hwrm_cmd_resp_addr, + bp->hwrm_cmd_resp_dma_addr); + bp->hwrm_cmd_resp_addr = NULL; + } } static int bnxt_alloc_hwrm_resources(struct bnxt *bp) @@ -9057,6 +9058,7 @@ init_err_cleanup_tc: bnxt_clear_int_mode(bp); init_err_pci_clean: + bnxt_free_hwrm_resources(bp); bnxt_cleanup_pci(bp); init_err_free: From c78fe058879bdea919d44f23e21da26f603e9166 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 5 Oct 2018 00:26:03 -0400 Subject: [PATCH 13/26] bnxt_en: get the reduced max_irqs by the ones used by RDMA When getting the max rings supported, get the reduced max_irqs by the ones used by RDMA. If the number MSIX is the limiting factor, this bug may cause the max ring count to be higher than it should be when RDMA driver is loaded and may result in ring allocation failures. Fixes: 30f529473ec9 ("bnxt_en: Do not modify max IRQ count after RDMA driver requests/frees IRQs.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3718984a8185..e2d92548226a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8622,7 +8622,7 @@ static void _bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, *max_tx = hw_resc->max_tx_rings; *max_rx = hw_resc->max_rx_rings; *max_cp = min_t(int, bnxt_get_max_func_cp_rings_for_en(bp), - hw_resc->max_irqs); + hw_resc->max_irqs - bnxt_get_ulp_msix_num(bp)); *max_cp = min_t(int, *max_cp, hw_resc->max_stat_ctxs); max_ring_grps = hw_resc->max_hw_ring_grps; if (BNXT_CHIP_TYPE_NITRO_A0(bp) && BNXT_PF(bp)) { From 17c357efe5eceebdc3971a48b3d4d61a03c1178b Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Fri, 28 Sep 2018 14:51:28 -0300 Subject: [PATCH 14/26] openvswitch: load NAT helper Load the respective NAT helper module if the flow uses it. Signed-off-by: Flavio Leitner Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 0aeb34c6389d..35ae64cbef33 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1312,6 +1312,10 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, rcu_assign_pointer(help->helper, helper); info->helper = helper; + + if (info->nat) + request_module("ip_nat_%s", name); + return 0; } From ca8931948344c485569b04821d1f6bcebccd376b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 4 Oct 2018 20:24:13 -0700 Subject: [PATCH 15/26] net: dsa: b53: Keep CPU port as tagged in all VLANs Commit c499696e7901 ("net: dsa: b53: Stop using dev->cpu_port incorrectly") was a bit too trigger happy in removing the CPU port from the VLAN membership because we rely on DSA to program the CPU port VLAN, which it does, except it does not bother itself with tagged/untagged and just usese untagged. Having the CPU port "follow" the user ports tagged/untagged is not great and does not allow for properly differentiating, so keep the CPU port tagged in all VLANs. Reported-by: Gerhard Wiesinger Fixes: c499696e7901 ("net: dsa: b53: Stop using dev->cpu_port incorrectly") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index d93c790bfbe8..ad534b90ef21 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1107,7 +1107,7 @@ void b53_vlan_add(struct dsa_switch *ds, int port, b53_get_vlan_entry(dev, vid, vl); vl->members |= BIT(port); - if (untagged) + if (untagged && !dsa_is_cpu_port(ds, port)) vl->untag |= BIT(port); else vl->untag &= ~BIT(port); @@ -1149,7 +1149,7 @@ int b53_vlan_del(struct dsa_switch *ds, int port, pvid = 0; } - if (untagged) + if (untagged && !dsa_is_cpu_port(ds, port)) vl->untag &= ~(BIT(port)); b53_set_vlan_entry(dev, vid, vl); From 9d2f67e43b73e8af7438be219b66a5de0cfa8bd9 Mon Sep 17 00:00:00 2001 From: Jianfeng Tan Date: Sat, 29 Sep 2018 15:41:27 +0000 Subject: [PATCH 16/26] net/packet: fix packet drop as of virtio gso When we use raw socket as the vhost backend, a packet from virito with gso offloading information, cannot be sent out in later validaton at xmit path, as we did not set correct skb->protocol which is further used for looking up the gso function. To fix this, we set this field according to virito hdr information. Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion") Signed-off-by: Jianfeng Tan Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 18 ++++++++++++++++++ net/packet/af_packet.c | 11 +++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 9397628a1967..cb462f9ab7dd 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -5,6 +5,24 @@ #include #include +static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, + const struct virtio_net_hdr *hdr) +{ + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_UDP: + skb->protocol = cpu_to_be16(ETH_P_IP); + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + skb->protocol = cpu_to_be16(ETH_P_IPV6); + break; + default: + return -EINVAL; + } + + return 0; +} + static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 75c92a87e7b2..d6e94dc7e290 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2715,10 +2715,12 @@ tpacket_error: } } - if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr, - vio_le())) { - tp_len = -EINVAL; - goto tpacket_error; + if (po->has_vnet_hdr) { + if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { + tp_len = -EINVAL; + goto tpacket_error; + } + virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; @@ -2915,6 +2917,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (err) goto out_free; len += sizeof(vnet_hdr); + virtio_net_hdr_set_proto(skb, &vnet_hdr); } skb_probe_transport_header(skb, reserve); From 2d52527e80c2dc0c5f43f50adf183781262ec565 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 3 Oct 2018 15:20:58 +0200 Subject: [PATCH 17/26] be2net: don't flip hw_features when VXLANs are added/deleted the be2net implementation of .ndo_tunnel_{add,del}() changes the value of NETIF_F_GSO_UDP_TUNNEL bit in 'features' and 'hw_features', but it forgets to call netdev_features_change(). Moreover, ethtool setting for that bit can potentially be reverted after a tunnel is added or removed. GSO already does software segmentation when 'hw_enc_features' is 0, even if VXLAN offload is turned on. In addition, commit 096de2f83ebc ("benet: stricter vxlan offloading check in be_features_check") avoids hardware segmentation of non-VXLAN tunneled packets, or VXLAN packets having wrong destination port. So, it's safe to avoid flipping the above feature on addition/deletion of VXLAN tunnels. Fixes: 630f4b70567f ("be2net: Export tunnel offloads only when a VxLAN tunnel is created") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 74d122616e76..534787291b44 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4002,8 +4002,6 @@ static int be_enable_vxlan_offloads(struct be_adapter *adapter) netdev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_UDP_TUNNEL; - netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; - netdev->features |= NETIF_F_GSO_UDP_TUNNEL; dev_info(dev, "Enabled VxLAN offloads for UDP port %d\n", be16_to_cpu(port)); @@ -4025,8 +4023,6 @@ static void be_disable_vxlan_offloads(struct be_adapter *adapter) adapter->vxlan_port = 0; netdev->hw_enc_features = 0; - netdev->hw_features &= ~(NETIF_F_GSO_UDP_TUNNEL); - netdev->features &= ~(NETIF_F_GSO_UDP_TUNNEL); } static void be_calculate_vf_res(struct be_adapter *adapter, u16 num_vfs, @@ -5320,6 +5316,7 @@ static void be_netdev_init(struct net_device *netdev) struct be_adapter *adapter = netdev_priv(netdev); netdev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_TUNNEL | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX; if ((be_if_cap_flags(adapter) & BE_IF_FLAGS_RSS)) From 7e4183752735deb7543e179a44f4f4b44917cd6f Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Wed, 3 Oct 2018 19:04:49 +0300 Subject: [PATCH 18/26] net: phy: phylink: fix SFP interface autodetection When connecting SFP PHY to phylink use the detected interface. Otherwise, the link fails to come up when the configured 'phy-mode' differs from the SFP detected mode. Move most of phylink_connect_phy() into __phylink_connect_phy(), and leave phylink_connect_phy() as a wrapper. phylink_sfp_connect_phy() can now pass the SFP detected PHY interface to __phylink_connect_phy(). This fixes 1GB SFP module link up on eth3 of the Macchiatobin board that is configured in the DT to "2500base-x" phy-mode. Fixes: 9525ae83959b6 ("phylink: add phylink infrastructure") Suggested-by: Russell King Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 48 +++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 3ba5cf2a8a5f..7abca86c3aa9 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -717,6 +717,30 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy) return 0; } +static int __phylink_connect_phy(struct phylink *pl, struct phy_device *phy, + phy_interface_t interface) +{ + int ret; + + if (WARN_ON(pl->link_an_mode == MLO_AN_FIXED || + (pl->link_an_mode == MLO_AN_INBAND && + phy_interface_mode_is_8023z(interface)))) + return -EINVAL; + + if (pl->phydev) + return -EBUSY; + + ret = phy_attach_direct(pl->netdev, phy, 0, interface); + if (ret) + return ret; + + ret = phylink_bringup_phy(pl, phy); + if (ret) + phy_detach(phy); + + return ret; +} + /** * phylink_connect_phy() - connect a PHY to the phylink instance * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -734,31 +758,13 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy) */ int phylink_connect_phy(struct phylink *pl, struct phy_device *phy) { - int ret; - - if (WARN_ON(pl->link_an_mode == MLO_AN_FIXED || - (pl->link_an_mode == MLO_AN_INBAND && - phy_interface_mode_is_8023z(pl->link_interface)))) - return -EINVAL; - - if (pl->phydev) - return -EBUSY; - /* Use PHY device/driver interface */ if (pl->link_interface == PHY_INTERFACE_MODE_NA) { pl->link_interface = phy->interface; pl->link_config.interface = pl->link_interface; } - ret = phy_attach_direct(pl->netdev, phy, 0, pl->link_interface); - if (ret) - return ret; - - ret = phylink_bringup_phy(pl, phy); - if (ret) - phy_detach(phy); - - return ret; + return __phylink_connect_phy(pl, phy, pl->link_interface); } EXPORT_SYMBOL_GPL(phylink_connect_phy); @@ -1672,7 +1678,9 @@ static void phylink_sfp_link_up(void *upstream) static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) { - return phylink_connect_phy(upstream, phy); + struct phylink *pl = upstream; + + return __phylink_connect_phy(upstream, phy, pl->link_config.interface); } static void phylink_sfp_disconnect_phy(void *upstream) From b799207e1e1816b09e7a5920fbb2d5fcf6edd681 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Oct 2018 18:17:59 +0200 Subject: [PATCH 19/26] bpf: 32-bit RSH verification must truncate input before the ALU op When I wrote commit 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification"), I assumed that, in order to emulate 64-bit arithmetic with 32-bit logic, it is sufficient to just truncate the output to 32 bits; and so I just moved the register size coercion that used to be at the start of the function to the end of the function. That assumption is true for almost every op, but not for 32-bit right shifts, because those can propagate information towards the least significant bit. Fix it by always truncating inputs for 32-bit ops to 32 bits. Also get rid of the coerce_reg_to_size() after the ALU op, since that has no effect. Fixes: 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification") Acked-by: Daniel Borkmann Signed-off-by: Jann Horn Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb07e74b34a2..465952a8e465 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2896,6 +2896,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + if (insn_bitness == 32) { + /* Relevant for 32-bit RSH: Information can propagate towards + * LSB, so it isn't sufficient to only truncate the output to + * 32 bits. + */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -3131,7 +3140,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->32 */ coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); } __reg_deduce_bounds(dst_reg); From 2c05d88818ab6571816b93edce4d53703870d7ae Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 5 Oct 2018 08:48:27 -0500 Subject: [PATCH 20/26] net: cxgb3_main: fix a missing-check bug In cxgb_extension_ioctl(), the command of the ioctl is firstly copied from the user-space buffer 'useraddr' to 'cmd' and checked through the switch statement. If the command is not as expected, an error code EOPNOTSUPP is returned. In the following execution, i.e., the cases of the switch statement, the whole buffer of 'useraddr' is copied again to a specific data structure, according to what kind of command is requested. However, after the second copy, there is no re-check on the newly-copied command. Given that the buffer 'useraddr' is in the user space, a malicious user can race to change the command between the two copies. By doing so, the attacker can supply malicious data to the kernel and cause undefined behavior. This patch adds a re-check in each case of the switch statement if there is a second copy in that case, to re-check whether the command obtained in the second copy is the same as the one in the first copy. If not, an error code EINVAL is returned. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index a19172dbe6be..c34ea385fe4a 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2159,6 +2159,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EPERM; if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_SET_QSET_PARAMS) + return -EINVAL; if (t.qset_idx >= SGE_QSETS) return -EINVAL; if (!in_range(t.intr_lat, 0, M_NEWTIMER) || @@ -2258,6 +2260,9 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_GET_QSET_PARAMS) + return -EINVAL; + /* Display qsets for all ports when offload enabled */ if (test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { q1 = 0; @@ -2303,6 +2308,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EBUSY; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; + if (edata.cmd != CHELSIO_SET_QSET_NUM) + return -EINVAL; if (edata.val < 1 || (edata.val > 1 && !(adapter->flags & USING_MSIX))) return -EINVAL; @@ -2343,6 +2350,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EPERM; if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_LOAD_FW) + return -EINVAL; /* Check t.len sanity ? */ fw_data = memdup_user(useraddr + sizeof(t), t.len); if (IS_ERR(fw_data)) @@ -2366,6 +2375,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EBUSY; if (copy_from_user(&m, useraddr, sizeof(m))) return -EFAULT; + if (m.cmd != CHELSIO_SETMTUTAB) + return -EINVAL; if (m.nmtus != NMTUS) return -EINVAL; if (m.mtus[0] < 81) /* accommodate SACK */ @@ -2407,6 +2418,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EBUSY; if (copy_from_user(&m, useraddr, sizeof(m))) return -EFAULT; + if (m.cmd != CHELSIO_SET_PM) + return -EINVAL; if (!is_power_of_2(m.rx_pg_sz) || !is_power_of_2(m.tx_pg_sz)) return -EINVAL; /* not power of 2 */ @@ -2440,6 +2453,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EIO; /* need the memory controllers */ if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_GET_MEM) + return -EINVAL; if ((t.addr & 7) || (t.len & 7)) return -EINVAL; if (t.mem_id == MEM_CM) @@ -2492,6 +2507,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EAGAIN; if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_SET_TRACE_FILTER) + return -EINVAL; tp = (const struct trace_params *)&t.sip; if (t.config_tx) From 33aa8da1f8a7dc050b9d68f1db761ab787621065 Mon Sep 17 00:00:00 2001 From: Shanthosh RK Date: Fri, 5 Oct 2018 20:57:48 +0530 Subject: [PATCH 21/26] net: bpfilter: Fix type cast and pointer warnings Fixes the following Sparse warnings: net/bpfilter/bpfilter_kern.c:62:21: warning: cast removes address space of expression net/bpfilter/bpfilter_kern.c:101:49: warning: Using plain integer as NULL pointer Signed-off-by: Shanthosh RK Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index f0fc182d3db7..b64e1649993b 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -59,7 +59,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.is_set = is_set; req.pid = current->pid; req.cmd = optname; - req.addr = (long)optval; + req.addr = (long __force __user)optval; req.len = optlen; mutex_lock(&bpfilter_lock); if (!info.pid) @@ -98,7 +98,7 @@ static int __init load_umh(void) pr_info("Loaded bpfilter_umh pid %d\n", info.pid); /* health check that usermode process started correctly */ - if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) { + if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { stop_umh(); return -EFAULT; } From 0781168e23a2fc8dceb989f11fc5b39b3ccacc35 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 5 Oct 2018 10:59:36 -0500 Subject: [PATCH 22/26] yam: fix a missing-check bug In yam_ioctl(), the concrete ioctl command is firstly copied from the user-space buffer 'ifr->ifr_data' to 'ioctl_cmd' and checked through the following switch statement. If the command is not as expected, an error code EINVAL is returned. In the following execution the buffer 'ifr->ifr_data' is copied again in the cases of the switch statement to specific data structures according to what kind of ioctl command is requested. However, after the second copy, no re-check is enforced on the newly-copied command. Given that the buffer 'ifr->ifr_data' is in the user space, a malicious user can race to change the command between the two copies. This way, the attacker can inject inconsistent data and cause undefined behavior. This patch adds a re-check in each case of the switch statement if there is a second copy in that case, to re-check whether the command obtained in the second copy is the same as the one in the first copy. If not, an error code EINVAL will be returned. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- drivers/net/hamradio/yam.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 16ec7af6ab7b..ba9df430fca6 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -966,6 +966,8 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) sizeof(struct yamdrv_ioctl_mcs)); if (IS_ERR(ym)) return PTR_ERR(ym); + if (ym->cmd != SIOCYAMSMCS) + return -EINVAL; if (ym->bitrate > YAM_MAXBITRATE) { kfree(ym); return -EINVAL; @@ -981,6 +983,8 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (copy_from_user(&yi, ifr->ifr_data, sizeof(struct yamdrv_ioctl_cfg))) return -EFAULT; + if (yi.cmd != SIOCYAMSCFG) + return -EINVAL; if ((yi.cfg.mask & YAM_IOBASE) && netif_running(dev)) return -EINVAL; /* Cannot change this parameter when up */ if ((yi.cfg.mask & YAM_IRQ) && netif_running(dev)) From bd961c9bc66497f0c63f4ba1d02900bb85078366 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Mon, 1 Oct 2018 22:46:40 -0300 Subject: [PATCH 23/26] rtnetlink: fix rtnl_fdb_dump() for ndmsg header Currently, rtnl_fdb_dump() assumes the family header is 'struct ifinfomsg', which is not always true -- 'struct ndmsg' is used by iproute2 ('ip neigh'). The problem is, the function bails out early if nlmsg_parse() fails, which does occur for iproute2 usage of 'struct ndmsg' because the payload length is shorter than the family header alone (as 'struct ifinfomsg' is assumed). This breaks backward compatibility with userspace -- nothing is sent back. Some examples with iproute2 and netlink library for go [1]: 1) $ bridge fdb show 33:33:00:00:00:01 dev ens3 self permanent 01:00:5e:00:00:01 dev ens3 self permanent 33:33:ff:15:98:30 dev ens3 self permanent This one works, as it uses 'struct ifinfomsg'. fdb_show() @ iproute2/bridge/fdb.c """ .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), ... if (rtnl_dump_request(&rth, RTM_GETNEIGH, [...] """ 2) $ ip --family bridge neigh RTNETLINK answers: Invalid argument Dump terminated This one fails, as it uses 'struct ndmsg'. do_show_or_flush() @ iproute2/ip/ipneigh.c """ .n.nlmsg_type = RTM_GETNEIGH, .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), """ 3) $ ./neighlist < no output > This one fails, as it uses 'struct ndmsg'-based. neighList() @ netlink/neigh_linux.go """ req := h.newNetlinkRequest(unix.RTM_GETNEIGH, [...] msg := Ndmsg{ """ The actual breakage was introduced by commit 0ff50e83b512 ("net: rtnetlink: bail out from rtnl_fdb_dump() on parse error"), because nlmsg_parse() fails if the payload length (with the _actual_ family header) is less than the family header length alone (which is assumed, in parameter 'hdrlen'). This is true in the examples above with struct ndmsg, with size and payload length shorter than struct ifinfomsg. However, that commit just intends to fix something under the assumption the family header is indeed an 'struct ifinfomsg' - by preventing access to the payload as such (via 'ifm' pointer) if the payload length is not sufficient to actually contain it. The assumption was introduced by commit 5e6d24358799 ("bridge: netlink dump interface at par with brctl"), to support iproute2's 'bridge fdb' command (not 'ip neigh') which indeed uses 'struct ifinfomsg', thus is not broken. So, in order to unbreak the 'struct ndmsg' family headers and still allow 'struct ifinfomsg' to continue to work, check for the known message sizes used with 'struct ndmsg' in iproute2 (with zero or one attribute which is not used in this function anyway) then do not parse the data as ifinfomsg. Same examples with this patch applied (or revert/before the original fix): $ bridge fdb show 33:33:00:00:00:01 dev ens3 self permanent 01:00:5e:00:00:01 dev ens3 self permanent 33:33:ff:15:98:30 dev ens3 self permanent $ ip --family bridge neigh dev ens3 lladdr 33:33:00:00:00:01 PERMANENT dev ens3 lladdr 01:00:5e:00:00:01 PERMANENT dev ens3 lladdr 33:33:ff:15:98:30 PERMANENT $ ./neighlist netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x33, 0x33, 0x0, 0x0, 0x0, 0x1}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x1, 0x0, 0x5e, 0x0, 0x0, 0x1}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x33, 0x33, 0xff, 0x15, 0x98, 0x30}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} Tested on mainline (v4.19-rc6) and net-next (3bd09b05b068). References: [1] netlink library for go (test-case) https://github.com/vishvananda/netlink $ cat ~/go/src/neighlist/main.go package main import ("fmt"; "syscall"; "github.com/vishvananda/netlink") func main() { neighs, _ := netlink.NeighList(0, syscall.AF_BRIDGE) for _, neigh := range neighs { fmt.Printf("%#v\n", neigh) } } $ export GOPATH=~/go $ go get github.com/vishvananda/netlink $ go build neighlist $ ~/go/src/neighlist/neighlist Thanks to David Ahern for suggestions to improve this patch. Fixes: 0ff50e83b512 ("net: rtnetlink: bail out from rtnl_fdb_dump() on parse error") Fixes: 5e6d24358799 ("bridge: netlink dump interface at par with brctl") Reported-by: Aidan Obley Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 448703312fed..37c7936124e6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3748,16 +3748,27 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL); - if (err < 0) { - return -EINVAL; - } else if (err == 0) { - if (tb[IFLA_MASTER]) - br_idx = nla_get_u32(tb[IFLA_MASTER]); - } + /* A hack to preserve kernel<->userspace interface. + * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. + * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. + * So, check for ndmsg with an optional u32 attribute (not used here). + * Fortunately these sizes don't conflict with the size of ifinfomsg + * with an optional attribute. + */ + if (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) && + (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) + + nla_attr_size(sizeof(u32)))) { + err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { + if (tb[IFLA_MASTER]) + br_idx = nla_get_u32(tb[IFLA_MASTER]); + } - brport_idx = ifm->ifi_index; + brport_idx = ifm->ifi_index; + } if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); From 8b4c3cdd9dd8290343ce959a132d3b334062c5b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 3 Oct 2018 15:05:36 -0700 Subject: [PATCH 24/26] net: sched: Add policy validation for tc attributes A number of TC attributes are processed without proper validation (e.g., length checks). Add a tca policy for all input attributes and use when invoking nlmsg_parse. The 2 Fixes tags below cover the latest additions. The other attributes are a string (KIND), nested attribute (OPTIONS which does seem to have validation in most cases), for dumps only or a flag. Fixes: 5bc1701881e39 ("net: sched: introduce multichain support for filters") Fixes: d47a6b0e7c492 ("net: sched: introduce ingress/egress block index attributes for qdisc") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/sched/sch_api.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 98541c6399db..85e73f48e48f 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1311,6 +1311,18 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) * Delete/get qdisc. */ +const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { + [TCA_KIND] = { .type = NLA_STRING }, + [TCA_OPTIONS] = { .type = NLA_NESTED }, + [TCA_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct tc_estimator) }, + [TCA_STAB] = { .type = NLA_NESTED }, + [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, + [TCA_CHAIN] = { .type = NLA_U32 }, + [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, + [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, +}; + static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -1327,7 +1339,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -1411,7 +1424,8 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, replay: /* Reinit, just in case something touches this. */ - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -1645,7 +1659,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, + rtm_tca_policy, NULL); if (err < 0) return err; @@ -1864,7 +1879,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; From a688caa34beb2fd2a92f1b6d33e40cde433ba160 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 4 Oct 2018 10:12:37 -0700 Subject: [PATCH 25/26] ipv6: take rcu lock in rawv6_send_hdrinc() In rawv6_send_hdrinc(), in order to avoid an extra dst_hold(), we directly assign the dst to skb and set passed in dst to NULL to avoid double free. However, in error case, we free skb and then do stats update with the dst pointer passed in. This causes use-after-free on the dst. Fix it by taking rcu read lock right before dst could get released to make sure dst does not get freed until the stats update is done. Note: we don't have this issue in ipv4 cause dst is not used for stats update in v4. Syzkaller reported following crash: BUG: KASAN: use-after-free in rawv6_send_hdrinc net/ipv6/raw.c:692 [inline] BUG: KASAN: use-after-free in rawv6_sendmsg+0x4421/0x4630 net/ipv6/raw.c:921 Read of size 8 at addr ffff8801d95ba730 by task syz-executor0/32088 CPU: 1 PID: 32088 Comm: syz-executor0 Not tainted 4.19.0-rc2+ #93 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 rawv6_send_hdrinc net/ipv6/raw.c:692 [inline] rawv6_sendmsg+0x4421/0x4630 net/ipv6/raw.c:921 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:631 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2114 __sys_sendmsg+0x11d/0x280 net/socket.c:2152 __do_sys_sendmsg net/socket.c:2161 [inline] __se_sys_sendmsg net/socket.c:2159 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2159 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457099 Code: fd b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f83756edc78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f83756ee6d4 RCX: 0000000000457099 RDX: 0000000000000000 RSI: 0000000020003840 RDI: 0000000000000004 RBP: 00000000009300a0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000004d4b30 R14: 00000000004c90b1 R15: 0000000000000000 Allocated by task 32088: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490 kmem_cache_alloc+0x12e/0x730 mm/slab.c:3554 dst_alloc+0xbb/0x1d0 net/core/dst.c:105 ip6_dst_alloc+0x35/0xa0 net/ipv6/route.c:353 ip6_rt_cache_alloc+0x247/0x7b0 net/ipv6/route.c:1186 ip6_pol_route+0x8f8/0xd90 net/ipv6/route.c:1895 ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2093 fib6_rule_lookup+0x277/0x860 net/ipv6/fib6_rules.c:122 ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2121 ip6_route_output include/net/ip6_route.h:88 [inline] ip6_dst_lookup_tail+0xe27/0x1d60 net/ipv6/ip6_output.c:951 ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1079 rawv6_sendmsg+0x12d9/0x4630 net/ipv6/raw.c:905 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:631 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2114 __sys_sendmsg+0x11d/0x280 net/socket.c:2152 __do_sys_sendmsg net/socket.c:2161 [inline] __se_sys_sendmsg net/socket.c:2159 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2159 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 5356: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kmem_cache_free+0x83/0x290 mm/slab.c:3756 dst_destroy+0x267/0x3c0 net/core/dst.c:141 dst_destroy_rcu+0x16/0x19 net/core/dst.c:154 __rcu_reclaim kernel/rcu/rcu.h:236 [inline] rcu_do_batch kernel/rcu/tree.c:2576 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2880 [inline] __rcu_process_callbacks kernel/rcu/tree.c:2847 [inline] rcu_process_callbacks+0xf23/0x2670 kernel/rcu/tree.c:2864 __do_softirq+0x30b/0xad8 kernel/softirq.c:292 Fixes: 1789a640f556 ("raw: avoid two atomics in xmit") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/raw.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 413d98bf24f4..5e0efd3954e9 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -651,8 +651,6 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; skb->tstamp = sockc->transmit_time; - skb_dst_set(skb, &rt->dst); - *dstp = NULL; skb_put(skb, length); skb_reset_network_header(skb); @@ -665,8 +663,14 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); - if (err) - goto error_fault; + if (err) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + skb_dst_set(skb, &rt->dst); + *dstp = NULL; /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -675,21 +679,28 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (unlikely(!skb)) return 0; + /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev + * in the error path. Since skb has been freed, the dst could + * have been queued for deletion. + */ + rcu_read_lock(); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); - if (err) - goto error; + if (err) { + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); + goto error_check; + } + rcu_read_unlock(); out: return 0; -error_fault: - err = -EFAULT; - kfree_skb(skb); error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); +error_check: if (err == -ENOBUFS && !np->recverr) err = 0; return err; From 35f3625c21852ad839f20c91c7d81c4c1101e207 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 5 Oct 2018 09:04:40 +0200 Subject: [PATCH 26/26] net: mvpp2: Extract the correct ethtype from the skb for tx csum offload When offloading the L3 and L4 csum computation on TX, we need to extract the l3_proto from the ethtype, independently of the presence of a vlan tag. The actual driver uses skb->protocol as-is, resulting in packets with the wrong L4 checksum being sent when there's a vlan tag in the packet header and checksum offloading is enabled. This commit makes use of vlan_protocol_get() to get the correct ethtype regardless the presence of a vlan tag. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 38cc01beea79..a74002b43b51 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -1725,7 +1725,7 @@ static void mvpp2_txq_desc_put(struct mvpp2_tx_queue *txq) } /* Set Tx descriptors fields relevant for CSUM calculation */ -static u32 mvpp2_txq_desc_csum(int l3_offs, int l3_proto, +static u32 mvpp2_txq_desc_csum(int l3_offs, __be16 l3_proto, int ip_hdr_len, int l4_proto) { u32 command; @@ -2600,14 +2600,15 @@ static u32 mvpp2_skb_tx_csum(struct mvpp2_port *port, struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_PARTIAL) { int ip_hdr_len = 0; u8 l4_proto; + __be16 l3_proto = vlan_get_protocol(skb); - if (skb->protocol == htons(ETH_P_IP)) { + if (l3_proto == htons(ETH_P_IP)) { struct iphdr *ip4h = ip_hdr(skb); /* Calculate IPv4 checksum and L4 checksum */ ip_hdr_len = ip4h->ihl; l4_proto = ip4h->protocol; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + } else if (l3_proto == htons(ETH_P_IPV6)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); /* Read l4_protocol from one of IPv6 extra headers */ @@ -2619,7 +2620,7 @@ static u32 mvpp2_skb_tx_csum(struct mvpp2_port *port, struct sk_buff *skb) } return mvpp2_txq_desc_csum(skb_network_offset(skb), - skb->protocol, ip_hdr_len, l4_proto); + l3_proto, ip_hdr_len, l4_proto); } return MVPP2_TXD_L4_CSUM_NOT | MVPP2_TXD_IP_CSUM_DISABLE;