From 1cc5954f44150bb70cac07c3cc5df7cf0dfb61ec Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 10 Apr 2018 12:57:18 +0200 Subject: [PATCH 01/22] ip_gre: clear feature flags when incompatible o_flags are set Commit dd9d598c6657 ("ip_gre: add the support for i/o_flags update via netlink") added the ability to change o_flags, but missed that the GSO/LLTX features are disabled by default, and only enabled some gre features are unused. Thus we also need to disable the GSO/LLTX features on the device when the TUNNEL_SEQ or TUNNEL_CSUM flags are set. These two examples should result in the same features being set: ip link add gre_none type gre local 192.168.0.10 remote 192.168.0.20 ttl 255 key 0 ip link set gre_none type gre seq ip link add gre_seq type gre local 192.168.0.10 remote 192.168.0.20 ttl 255 key 1 seq Fixes: dd9d598c6657 ("ip_gre: add the support for i/o_flags update via netlink") Signed-off-by: Sabrina Dubroca Reviewed-by: Xin Long Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a8772a978224..9c169bb2444d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -781,8 +781,14 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) tunnel->encap.type == TUNNEL_ENCAP_NONE) { dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } else { + dev->features &= ~NETIF_F_GSO_SOFTWARE; + dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } dev->features |= NETIF_F_LLTX; + } else { + dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; + dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE); } } From a43cced9a348901f9015f4730b70b69e7c41a9c9 Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Wed, 11 Apr 2018 00:57:25 -0700 Subject: [PATCH 02/22] rds: MP-RDS may use an invalid c_path rds_sendmsg() calls rds_send_mprds_hash() to find a c_path to use to send a message. Suppose the RDS connection is not yet up. In rds_send_mprds_hash(), it does if (conn->c_npaths == 0) wait_event_interruptible(conn->c_hs_waitq, (conn->c_npaths != 0)); If it is interrupted before the connection is set up, rds_send_mprds_hash() will return a non-zero hash value. Hence rds_sendmsg() will use a non-zero c_path to send the message. But if the RDS connection ends up to be non-MP capable, the message will be lost as only the zero c_path can be used. Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/rds/send.c b/net/rds/send.c index acad04243b41..94c7f74909be 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -1017,10 +1017,15 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) if (conn->c_npaths == 0 && hash != 0) { rds_send_ping(conn, 0); - if (conn->c_npaths == 0) { - wait_event_interruptible(conn->c_hs_waitq, - (conn->c_npaths != 0)); - } + /* The underlying connection is not up yet. Need to wait + * until it is up to be sure that the non-zero c_path can be + * used. But if we are interrupted, we have to use the zero + * c_path in case the connection ends up being non-MP capable. + */ + if (conn->c_npaths == 0) + if (wait_event_interruptible(conn->c_hs_waitq, + conn->c_npaths != 0)) + hash = 0; if (conn->c_npaths == 1) hash = 0; } From 4bfc33807a9a02764bdd1e42e794b3b401240f27 Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Wed, 11 Apr 2018 10:59:17 +0100 Subject: [PATCH 03/22] lan78xx: Correctly indicate invalid OTP lan78xx_read_otp tries to return -EINVAL in the event of invalid OTP content, but the value gets overwritten before it is returned and the read goes ahead anyway. Make the read conditional as it should be and preserve the error code. Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Phil Elwell Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index aff105f5f58c..145bb7cbf5b2 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -928,7 +928,8 @@ static int lan78xx_read_otp(struct lan78xx_net *dev, u32 offset, offset += 0x100; else ret = -EINVAL; - ret = lan78xx_read_raw_otp(dev, offset, length, data); + if (!ret) + ret = lan78xx_read_raw_otp(dev, offset, length, data); } return ret; From fed56079e76461765f1e21c214bd1b614b7981e8 Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Wed, 11 Apr 2018 12:02:47 +0100 Subject: [PATCH 04/22] lan78xx: Avoid spurious kevent 4 "error" lan78xx_defer_event generates an error message whenever the work item is already scheduled. lan78xx_open defers three events - EVENT_STAT_UPDATE, EVENT_DEV_OPEN and EVENT_LINK_RESET. Being aware of the likelihood (or certainty) of an error message, the DEV_OPEN event is added to the set of pending events directly, relying on the subsequent deferral of the EVENT_LINK_RESET call to schedule the work. Take the same precaution with EVENT_STAT_UPDATE to avoid a totally unnecessary error message. Signed-off-by: Phil Elwell Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 145bb7cbf5b2..bdb696612e11 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2503,7 +2503,7 @@ static void lan78xx_init_stats(struct lan78xx_net *dev) dev->stats.rollover_max.eee_tx_lpi_transitions = 0xFFFFFFFF; dev->stats.rollover_max.eee_tx_lpi_time = 0xFFFFFFFF; - lan78xx_defer_kevent(dev, EVENT_STAT_UPDATE); + set_bit(EVENT_STAT_UPDATE, &dev->flags); } static int lan78xx_open(struct net_device *net) From 3f01ddb962dc506916c243f9524e8bef97119b77 Mon Sep 17 00:00:00 2001 From: Tejaswi Tanikella Date: Wed, 11 Apr 2018 16:34:47 +0530 Subject: [PATCH 05/22] slip: Check if rstate is initialized before uncompressing On receiving a packet the state index points to the rstate which must be used to fill up IP and TCP headers. But if the state index points to a rstate which is unitialized, i.e. filled with zeros, it gets stuck in an infinite loop inside ip_fast_csum trying to compute the ip checsum of a header with zero length. 89.666953: <2> [] slhc_uncompress+0x464/0x468 89.666965: <2> [] ppp_receive_nonmp_frame+0x3b4/0x65c 89.666978: <2> [] ppp_receive_frame+0x64/0x7e0 89.666991: <2> [] ppp_input+0x104/0x198 89.667005: <2> [] pppopns_recv_core+0x238/0x370 89.667027: <2> [] __sk_receive_skb+0xdc/0x250 89.667040: <2> [] pppopns_recv+0x44/0x60 89.667053: <2> [] __sock_queue_rcv_skb+0x16c/0x24c 89.667065: <2> [] sock_queue_rcv_skb+0x2c/0x38 89.667085: <2> [] raw_rcv+0x124/0x154 89.667098: <2> [] raw_local_deliver+0x1e0/0x22c 89.667117: <2> [] ip_local_deliver_finish+0x70/0x24c 89.667131: <2> [] ip_local_deliver+0x100/0x10c ./scripts/faddr2line vmlinux slhc_uncompress+0x464/0x468 output: ip_fast_csum at arch/arm64/include/asm/checksum.h:40 (inlined by) slhc_uncompress at drivers/net/slip/slhc.c:615 Adding a variable to indicate if the current rstate is initialized. If such a packet arrives, move to toss state. Signed-off-by: Tejaswi Tanikella Signed-off-by: David S. Miller --- drivers/net/slip/slhc.c | 5 +++++ include/net/slhc_vj.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index 5782733959f0..f4e93f5fc204 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -509,6 +509,10 @@ slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) if(x < 0 || x > comp->rslot_limit) goto bad; + /* Check if the cstate is initialized */ + if (!comp->rstate[x].initialized) + goto bad; + comp->flags &=~ SLF_TOSS; comp->recv_current = x; } else { @@ -673,6 +677,7 @@ slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) if (cs->cs_tcp.doff > 5) memcpy(cs->cs_tcpopt, icp + ihl*4 + sizeof(struct tcphdr), (cs->cs_tcp.doff - 5) * 4); cs->cs_hsize = ihl*2 + cs->cs_tcp.doff*2; + cs->initialized = true; /* Put headers back on packet * Neither header checksum is recalculated */ diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h index 8716d5942b65..8fcf8908a694 100644 --- a/include/net/slhc_vj.h +++ b/include/net/slhc_vj.h @@ -127,6 +127,7 @@ typedef __u32 int32; */ struct cstate { byte_t cs_this; /* connection id number (xmit) */ + bool initialized; /* true if initialized */ struct cstate *next; /* next in ring (xmit) */ struct iphdr cs_ip; /* ip/tcp hdr from most recent packet */ struct tcphdr cs_tcp; From 53765341ee821c0a0f1dec41adc89c9096ad694c Mon Sep 17 00:00:00 2001 From: Bassem Boubaker Date: Wed, 11 Apr 2018 13:15:53 +0200 Subject: [PATCH 06/22] cdc_ether: flag the Cinterion AHS8 modem by gemalto as WWAN The Cinterion AHS8 is a 3G device with one embedded WWAN interface using cdc_ether as a driver. The modem is controlled via AT commands through the exposed TTYs. AT+CGDCONT write command can be used to activate or deactivate a WWAN connection for a PDP context defined with the same command. UE supports one WWAN adapter. Signed-off-by: Bassem Boubaker Acked-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index fff4b13eece2..5c42cf81a08b 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -901,6 +901,12 @@ static const struct usb_device_id products[] = { USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, +}, { + /* Cinterion AHS3 modem by GEMALTO */ + USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0055, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), From cce96d1883dae4b79f44890e5118243d806da286 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Wed, 11 Apr 2018 15:23:24 +0300 Subject: [PATCH 07/22] net: aquantia: Regression on reset with 1.x firmware On ASUS XG-C100C with 1.5.44 firmware a special mode called "dirty wake" is active. With this mode when motherboard gets powered (but no poweron happens yet), NIC automatically enables powersave link and watches for WOL packet. This normally allows to powerup the PC after AC power failures. Not all motherboards or bios settings gives power to PCI slots, so this mode is not enabled on all the hardware. 4.16 linux driver introduced full hardware reset sequence This is required since before that we had no NIC hardware reset implemented and there were side effects of "not clean start". But this full reset is incompatible with "dirty wake" WOL feature it keeps the PHY link in a special mode forever. As a consequence, driver sees no link and no traffic. To fix this we forcibly change FW state to idle state before doing the full reset. This makes FW to restore link state. Fixes: c8c82eb net: aquantia: Introduce global AQC hardware reset sequence Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../aquantia/atlantic/hw_atl/hw_atl_utils.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 84d7f4dd4ce1..e652d86b87d4 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -48,6 +48,8 @@ #define FORCE_FLASHLESS 0 static int hw_atl_utils_ver_match(u32 ver_expected, u32 ver_actual); +static int hw_atl_utils_mpi_set_state(struct aq_hw_s *self, + enum hal_atl_utils_fw_state_e state); int hw_atl_utils_initfw(struct aq_hw_s *self, const struct aq_fw_ops **fw_ops) { @@ -247,6 +249,20 @@ int hw_atl_utils_soft_reset(struct aq_hw_s *self) self->rbl_enabled = (boot_exit_code != 0); + /* FW 1.x may bootup in an invalid POWER state (WOL feature). + * We should work around this by forcing its state back to DEINIT + */ + if (!hw_atl_utils_ver_match(HW_ATL_FW_VER_1X, + aq_hw_read_reg(self, + HW_ATL_MPI_FW_VERSION))) { + int err = 0; + + hw_atl_utils_mpi_set_state(self, MPI_DEINIT); + AQ_HW_WAIT_FOR((aq_hw_read_reg(self, HW_ATL_MPI_STATE_ADR) & + HW_ATL_MPI_STATE_MSK) == MPI_DEINIT, + 10, 1000U); + } + if (self->rbl_enabled) return hw_atl_utils_soft_reset_rbl(self); else From 9a11aff25fd43d5bd2660ababdc9f564b0ba183a Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Wed, 11 Apr 2018 15:23:25 +0300 Subject: [PATCH 08/22] net: aquantia: oops when shutdown on already stopped device In case netdev is closed at the moment of pci shutdown, aq_nic_stop gets called second time. napi_disable in that case hangs indefinitely. In other case, if device was never opened at all, we get oops because of null pointer access. We should invoke aq_nic_stop conditionally, only if device is running at the moment of shutdown. Reported-by: David Arcari Fixes: 90869ddfefeb ("net: aquantia: Implement pci shutdown callback") Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index c96a92118b8b..32f6d2e24d66 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -951,9 +951,11 @@ void aq_nic_shutdown(struct aq_nic_s *self) netif_device_detach(self->ndev); - err = aq_nic_stop(self); - if (err < 0) - goto err_exit; + if (netif_running(self->ndev)) { + err = aq_nic_stop(self); + if (err < 0) + goto err_exit; + } aq_nic_deinit(self); err_exit: From 7ced6c98c7ab7a1f6743931e28671b833af79b1e Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Wed, 11 Apr 2018 15:30:38 +0200 Subject: [PATCH 09/22] vhost: Fix vhost_copy_to_user() vhost_copy_to_user is used to copy vring used elements to userspace. We should use VHOST_ADDR_USED instead of VHOST_ADDR_DESC. Fixes: f88949138058 ("vhost: introduce O(1) vq metadata cache") Signed-off-by: Eric Auger Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index bec722e41f58..f44aead98d60 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -744,7 +744,7 @@ static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to, struct iov_iter t; void __user *uaddr = vhost_vq_meta_fetch(vq, (u64)(uintptr_t)to, size, - VHOST_ADDR_DESC); + VHOST_ADDR_USED); if (uaddr) return __copy_to_user(uaddr, from, size); From d14d2b78090c7de0557362b26a4ca591aa6a9faa Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 11 Apr 2018 10:35:40 +0800 Subject: [PATCH 10/22] vhost: fix vhost_vq_access_ok() log check Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log when IOTLB is enabled") introduced a regression. The logic was originally: if (vq->iotlb) return 1; return A && B; After the patch the short-circuit logic for A was inverted: if (A || vq->iotlb) return A; return B; This patch fixes the regression by rewriting the checks in the obvious way, no longer returning A when vq->iotlb is non-NULL (which is hard to understand). Reported-by: syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com Cc: Jason Wang Signed-off-by: Stefan Hajnoczi Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f44aead98d60..9a18535fafba 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1244,10 +1244,12 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, /* Caller should have vq mutex and device mutex */ int vhost_vq_access_ok(struct vhost_virtqueue *vq) { - int ret = vq_log_access_ok(vq, vq->log_base); + if (!vq_log_access_ok(vq, vq->log_base)) + return 0; - if (ret || vq->iotlb) - return ret; + /* Access validation occurs at prefetch time with IOTLB */ + if (vq->iotlb) + return 1; return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); } From ddd3d4081ffa806ffef28eaeefde757ba2b6812a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 11 Apr 2018 10:35:41 +0800 Subject: [PATCH 11/22] vhost: return bool from *_access_ok() functions Currently vhost *_access_ok() functions return int. This is error-prone because there are two popular conventions: 1. 0 means failure, 1 means success 2. -errno means failure, 0 means success Although vhost mostly uses #1, it does not do so consistently. umem_access_ok() uses #2. This patch changes the return type from int to bool so that false means failure and true means success. This eliminates a potential source of errors. Suggested-by: Linus Torvalds Signed-off-by: Stefan Hajnoczi Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 66 +++++++++++++++++++++---------------------- drivers/vhost/vhost.h | 4 +-- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9a18535fafba..f3bd8e941224 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -641,14 +641,14 @@ void vhost_dev_cleanup(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_cleanup); -static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) +static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz) { u64 a = addr / VHOST_PAGE_SIZE / 8; /* Make sure 64 bit math will not overflow. */ if (a > ULONG_MAX - (unsigned long)log_base || a + (unsigned long)log_base > ULONG_MAX) - return 0; + return false; return access_ok(VERIFY_WRITE, log_base + a, (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); @@ -661,30 +661,30 @@ static bool vhost_overflow(u64 uaddr, u64 size) } /* Caller should have vq mutex and device mutex. */ -static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, - int log_all) +static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, + int log_all) { struct vhost_umem_node *node; if (!umem) - return 0; + return false; list_for_each_entry(node, &umem->umem_list, link) { unsigned long a = node->userspace_addr; if (vhost_overflow(node->userspace_addr, node->size)) - return 0; + return false; if (!access_ok(VERIFY_WRITE, (void __user *)a, node->size)) - return 0; + return false; else if (log_all && !log_access_ok(log_base, node->start, node->size)) - return 0; + return false; } - return 1; + return true; } static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, @@ -701,13 +701,13 @@ static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, /* Can we switch to this memory table? */ /* Caller should have device mutex but not vq mutex */ -static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, - int log_all) +static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, + int log_all) { int i; for (i = 0; i < d->nvqs; ++i) { - int ok; + bool ok; bool log; mutex_lock(&d->vqs[i]->mutex); @@ -717,12 +717,12 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, ok = vq_memory_access_ok(d->vqs[i]->log_base, umem, log); else - ok = 1; + ok = true; mutex_unlock(&d->vqs[i]->mutex); if (!ok) - return 0; + return false; } - return 1; + return true; } static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, @@ -959,21 +959,21 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d, spin_unlock(&d->iotlb_lock); } -static int umem_access_ok(u64 uaddr, u64 size, int access) +static bool umem_access_ok(u64 uaddr, u64 size, int access) { unsigned long a = uaddr; /* Make sure 64 bit math will not overflow. */ if (vhost_overflow(uaddr, size)) - return -EFAULT; + return false; if ((access & VHOST_ACCESS_RO) && !access_ok(VERIFY_READ, (void __user *)a, size)) - return -EFAULT; + return false; if ((access & VHOST_ACCESS_WO) && !access_ok(VERIFY_WRITE, (void __user *)a, size)) - return -EFAULT; - return 0; + return false; + return true; } static int vhost_process_iotlb_msg(struct vhost_dev *dev, @@ -988,7 +988,7 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev, ret = -EFAULT; break; } - if (umem_access_ok(msg->uaddr, msg->size, msg->perm)) { + if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) { ret = -EFAULT; break; } @@ -1135,10 +1135,10 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) return 0; } -static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used) +static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; @@ -1161,8 +1161,8 @@ static void vhost_vq_meta_update(struct vhost_virtqueue *vq, vq->meta_iotlb[type] = node; } -static int iotlb_access_ok(struct vhost_virtqueue *vq, - int access, u64 addr, u64 len, int type) +static bool iotlb_access_ok(struct vhost_virtqueue *vq, + int access, u64 addr, u64 len, int type) { const struct vhost_umem_node *node; struct vhost_umem *umem = vq->iotlb; @@ -1220,7 +1220,7 @@ EXPORT_SYMBOL_GPL(vq_iotlb_prefetch); /* Can we log writes? */ /* Caller should have device mutex but not vq mutex */ -int vhost_log_access_ok(struct vhost_dev *dev) +bool vhost_log_access_ok(struct vhost_dev *dev) { return memory_access_ok(dev, dev->umem, 1); } @@ -1228,8 +1228,8 @@ EXPORT_SYMBOL_GPL(vhost_log_access_ok); /* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ -static int vq_log_access_ok(struct vhost_virtqueue *vq, - void __user *log_base) +static bool vq_log_access_ok(struct vhost_virtqueue *vq, + void __user *log_base) { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; @@ -1242,14 +1242,14 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, /* Can we start vq? */ /* Caller should have vq mutex and device mutex */ -int vhost_vq_access_ok(struct vhost_virtqueue *vq) +bool vhost_vq_access_ok(struct vhost_virtqueue *vq) { if (!vq_log_access_ok(vq, vq->log_base)) - return 0; + return false; /* Access validation occurs at prefetch time with IOTLB */ if (vq->iotlb) - return 1; + return true; return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index d8ee85ae8fdc..6c844b90a168 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -178,8 +178,8 @@ void vhost_dev_cleanup(struct vhost_dev *); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); -int vhost_vq_access_ok(struct vhost_virtqueue *vq); -int vhost_log_access_ok(struct vhost_dev *); +bool vhost_vq_access_ok(struct vhost_virtqueue *vq); +bool vhost_log_access_ok(struct vhost_dev *); int vhost_get_vq_desc(struct vhost_virtqueue *, struct iovec iov[], unsigned int iov_count, From 7991cb9cfbce1b60ac1cff819350b05de4d902e1 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 11 Apr 2018 11:50:13 -0400 Subject: [PATCH 12/22] bnxt_en: Fix ethtool -x crash when device is down. Fix ethtool .get_rxfh() crash by checking for valid indirection table address before copying the data. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 8d8ccd67e0e2..1f622ca2a64f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -870,17 +870,22 @@ static int bnxt_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc) { struct bnxt *bp = netdev_priv(dev); - struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + struct bnxt_vnic_info *vnic; int i = 0; if (hfunc) *hfunc = ETH_RSS_HASH_TOP; - if (indir) + if (!bp->vnic_info) + return 0; + + vnic = &bp->vnic_info[0]; + if (indir && vnic->rss_table) { for (i = 0; i < HW_HASH_INDEX_SIZE; i++) indir[i] = le16_to_cpu(vnic->rss_table[i]); + } - if (key) + if (key && vnic->rss_hash_key) memcpy(key, vnic->rss_hash_key, HW_HASH_KEY_SIZE); return 0; From e85a9be93cf144623a823a0a60e4eda6ee337aef Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Wed, 11 Apr 2018 11:50:14 -0400 Subject: [PATCH 13/22] bnxt_en: do not allow wildcard matches for L2 flows Before this patch the following commands would succeed as far as the user was concerned: $ tc qdisc add dev p1p1 ingress $ tc filter add dev p1p1 parent ffff: protocol all \ flower skip_sw action drop $ tc filter add dev p1p1 parent ffff: protocol ipv4 \ flower skip_sw src_mac 00:02:00:00:00:01/44 action drop The current flow offload infrastructure used does not support wildcard matching for ethernet headers, so do not allow the second or third commands to succeed. If a user wants to drop traffic on that interface the protocol and MAC addresses need to be specified explicitly: $ tc qdisc add dev p1p1 ingress $ tc filter add dev p1p1 parent ffff: protocol arp \ flower skip_sw action drop $ tc filter add dev p1p1 parent ffff: protocol ipv4 \ flower skip_sw action drop ... $ tc filter add dev p1p1 parent ffff: protocol ipv4 \ flower skip_sw src_mac 00:02:00:00:00:01 action drop $ tc filter add dev p1p1 parent ffff: protocol ipv4 \ flower skip_sw src_mac 00:02:00:00:00:02 action drop ... There are also checks for VLAN parameters in this patch as other callers may wildcard those parameters even if tc does not. Using different flow infrastructure could allow this to work in the future for L2 flows, but for now it does not. Fixes: 2ae7408fedfe ("bnxt_en: bnxt: add TC flower filter offload support") Signed-off-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 59 ++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 65c2cee35766..ac193408a418 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -377,6 +377,30 @@ static bool is_wildcard(void *mask, int len) return true; } +static bool is_exactmatch(void *mask, int len) +{ + const u8 *p = mask; + int i; + + for (i = 0; i < len; i++) + if (p[i] != 0xff) + return false; + + return true; +} + +static bool bits_set(void *key, int len) +{ + const u8 *p = key; + int i; + + for (i = 0; i < len; i++) + if (p[i] != 0) + return true; + + return false; +} + static int bnxt_hwrm_cfa_flow_alloc(struct bnxt *bp, struct bnxt_tc_flow *flow, __le16 ref_flow_handle, __le32 tunnel_handle, __le16 *flow_handle) @@ -764,6 +788,41 @@ static bool bnxt_tc_can_offload(struct bnxt *bp, struct bnxt_tc_flow *flow) return false; } + /* Currently source/dest MAC cannot be partial wildcard */ + if (bits_set(&flow->l2_key.smac, sizeof(flow->l2_key.smac)) && + !is_exactmatch(flow->l2_mask.smac, sizeof(flow->l2_mask.smac))) { + netdev_info(bp->dev, "Wildcard match unsupported for Source MAC\n"); + return false; + } + if (bits_set(&flow->l2_key.dmac, sizeof(flow->l2_key.dmac)) && + !is_exactmatch(&flow->l2_mask.dmac, sizeof(flow->l2_mask.dmac))) { + netdev_info(bp->dev, "Wildcard match unsupported for Dest MAC\n"); + return false; + } + + /* Currently VLAN fields cannot be partial wildcard */ + if (bits_set(&flow->l2_key.inner_vlan_tci, + sizeof(flow->l2_key.inner_vlan_tci)) && + !is_exactmatch(&flow->l2_mask.inner_vlan_tci, + sizeof(flow->l2_mask.inner_vlan_tci))) { + netdev_info(bp->dev, "Wildcard match unsupported for VLAN TCI\n"); + return false; + } + if (bits_set(&flow->l2_key.inner_vlan_tpid, + sizeof(flow->l2_key.inner_vlan_tpid)) && + !is_exactmatch(&flow->l2_mask.inner_vlan_tpid, + sizeof(flow->l2_mask.inner_vlan_tpid))) { + netdev_info(bp->dev, "Wildcard match unsupported for VLAN TPID\n"); + return false; + } + + /* Currently Ethertype must be set */ + if (!is_exactmatch(&flow->l2_mask.ether_type, + sizeof(flow->l2_mask.ether_type))) { + netdev_info(bp->dev, "Wildcard match unsupported for Ethertype\n"); + return false; + } + return true; } From 479ca3bf91da971fcefc003cf5773e8d7db24794 Mon Sep 17 00:00:00 2001 From: Sriharsha Basavapatna Date: Wed, 11 Apr 2018 11:50:15 -0400 Subject: [PATCH 14/22] bnxt_en: Ignore src port field in decap filter nodes The driver currently uses src port field (along with other fields) in the decap tunnel key, while looking up and adding tunnel nodes. This leads to redundant cfa_decap_filter_alloc() requests to the FW and flow-miss in the flow engine. Fix this by ignoring the src port field in decap tunnel nodes. Fixes: f484f6782e01 ("bnxt_en: add hwrm FW cmds for cfa_encap_record and decap_filter") Signed-off-by: Sriharsha Basavapatna Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index ac193408a418..795f45024c20 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -1051,8 +1051,10 @@ static int bnxt_tc_get_decap_handle(struct bnxt *bp, struct bnxt_tc_flow *flow, /* Check if there's another flow using the same tunnel decap. * If not, add this tunnel to the table and resolve the other - * tunnel header fileds + * tunnel header fileds. Ignore src_port in the tunnel_key, + * since it is not required for decap filters. */ + decap_key->tp_src = 0; decap_node = bnxt_tc_get_tunnel_node(bp, &tc_info->decap_table, &tc_info->decap_ht_params, decap_key); From 9d96465b111edd6c4f94345783e6e01db7f435d6 Mon Sep 17 00:00:00 2001 From: Sriharsha Basavapatna Date: Wed, 11 Apr 2018 11:50:16 -0400 Subject: [PATCH 15/22] bnxt_en: Support max-mtu with VF-reps While a VF is configured with a bigger mtu (> 1500), any packets that are punted to the VF-rep (slow-path) get dropped by OVS kernel-datapath with the following message: "dropped over-mtu packet". Fix this by returning the max-mtu value for a VF-rep derived from its corresponding VF. VF-rep's mtu can be changed using 'ip' command as shown in this example: $ ip link set bnxt0_pf0vf0 mtu 9000 Signed-off-by: Sriharsha Basavapatna Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c index 26290403f38f..38f635cf8408 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c @@ -64,6 +64,31 @@ static int hwrm_cfa_vfr_free(struct bnxt *bp, u16 vf_idx) return rc; } +static int bnxt_hwrm_vfr_qcfg(struct bnxt *bp, struct bnxt_vf_rep *vf_rep, + u16 *max_mtu) +{ + struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_func_qcfg_input req = {0}; + u16 mtu; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1); + req.fid = cpu_to_le16(bp->pf.vf[vf_rep->vf_idx].fw_fid); + + mutex_lock(&bp->hwrm_cmd_lock); + + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + mtu = le16_to_cpu(resp->max_mtu_configured); + if (!mtu) + *max_mtu = BNXT_MAX_MTU; + else + *max_mtu = mtu; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + static int bnxt_vf_rep_open(struct net_device *dev) { struct bnxt_vf_rep *vf_rep = netdev_priv(dev); @@ -365,6 +390,7 @@ static void bnxt_vf_rep_netdev_init(struct bnxt *bp, struct bnxt_vf_rep *vf_rep, struct net_device *dev) { struct net_device *pf_dev = bp->dev; + u16 max_mtu; dev->netdev_ops = &bnxt_vf_rep_netdev_ops; dev->ethtool_ops = &bnxt_vf_rep_ethtool_ops; @@ -380,6 +406,10 @@ static void bnxt_vf_rep_netdev_init(struct bnxt *bp, struct bnxt_vf_rep *vf_rep, bnxt_vf_rep_eth_addr_gen(bp->pf.mac_addr, vf_rep->vf_idx, dev->perm_addr); ether_addr_copy(dev->dev_addr, dev->perm_addr); + /* Set VF-Rep's max-mtu to the corresponding VF's max-mtu */ + if (!bnxt_hwrm_vfr_qcfg(bp, vf_rep, &max_mtu)) + dev->max_mtu = max_mtu; + dev->min_mtu = ETH_ZLEN; } static int bnxt_pcie_dsn_get(struct bnxt *bp, u8 dsn[]) From 11c3ec7bb940b6fa3f87f05f01b7f45eef08dfbb Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 11 Apr 2018 11:50:17 -0400 Subject: [PATCH 16/22] bnxt_en: Need to include RDMA rings in bnxt_check_rings(). With recent changes to reserve both L2 and RDMA rings, we need to include the RDMA rings in bnxt_check_rings(). Otherwise we will under-estimate the rings we need during ethtool -L and may lead to failure. Fixes: fbcfc8e46741 ("bnxt_en: Reserve completion rings and MSIX for bnxt_re RDMA driver.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1991f0c7bc0e..9cb8b4bd7312 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7686,6 +7686,8 @@ int bnxt_check_rings(struct bnxt *bp, int tx, int rx, bool sh, int tcs, if (bp->flags & BNXT_FLAG_AGG_RINGS) rx_rings <<= 1; cp = sh ? max_t(int, tx_rings_needed, rx) : tx_rings_needed + rx; + if (bp->flags & BNXT_FLAG_NEW_RM) + cp += bnxt_get_ulp_msix_num(bp); return bnxt_hwrm_check_rings(bp, tx_rings_needed, rx_rings, rx, cp, vnics); } From cb98526bf9b985866d648dbb9c983ba9eb59daba Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 11 Apr 2018 11:50:18 -0400 Subject: [PATCH 17/22] bnxt_en: Fix NULL pointer dereference at bnxt_free_irq(). When open fails during ethtool -L ring change, for example, the driver may crash at bnxt_free_irq() because bp->bnapi is NULL. If we fail to allocate all the new rings, bnxt_open_nic() will free all the memory including bp->bnapi. Subsequent call to bnxt_close_nic() will try to dereference bp->bnapi in bnxt_free_irq(). Fix it by checking for !bp->bnapi in bnxt_free_irq(). Fixes: e5811b8c09df ("bnxt_en: Add IRQ remapping logic.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9cb8b4bd7312..f83769d8047b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6090,7 +6090,7 @@ static void bnxt_free_irq(struct bnxt *bp) free_irq_cpu_rmap(bp->dev->rx_cpu_rmap); bp->dev->rx_cpu_rmap = NULL; #endif - if (!bp->irq_tbl) + if (!bp->irq_tbl || !bp->bnapi) return; for (i = 0; i < bp->cp_nr_rings; i++) { From 47b998653fea4ef69e3e89574956386f262bccca Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Tue, 10 Apr 2018 13:18:25 +0100 Subject: [PATCH 18/22] lan78xx: Don't reset the interface on open Commit 92571a1aae40 ("lan78xx: Connect phy early") moves the PHY initialisation into lan78xx_probe, but lan78xx_open subsequently calls lan78xx_reset. As well as forcing a second round of link negotiation, this reset frequently prevents the phy interrupt from being generated (even though the link is up), rendering the interface unusable. Fix this issue by removing the lan78xx_reset call from lan78xx_open. Fixes: 92571a1aae40 ("lan78xx: Connect phy early") Signed-off-by: Phil Elwell Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index bdb696612e11..0867f7275852 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2515,10 +2515,6 @@ static int lan78xx_open(struct net_device *net) if (ret < 0) goto out; - ret = lan78xx_reset(dev); - if (ret < 0) - goto done; - phy_start(net->phydev); netif_dbg(dev, ifup, dev->net, "phy initialised successfully"); From 9fffc5c6dd7dafb2e021dbbe9a30be2566a6949a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 10 Apr 2018 16:28:55 +0200 Subject: [PATCH 19/22] tun: set the flags before registering the netdevice Otherwise, register_netdevice advertises the creation of the device with the default flags, instead of what the user requested. Reported-by: Thomas Haller Fixes: 1ec010e70593 ("tun: export flags, uid, gid, queue information over netlink") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/net/tun.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a1ba262f40ad..c9e68fd76a37 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2564,6 +2564,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) */ return 0; } + + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); } else { char *name; @@ -2642,6 +2645,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); + INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI); if (err < 0) @@ -2656,9 +2662,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun_debug(KERN_INFO, tun, "tun_set_iff\n"); - tun->flags = (tun->flags & ~TUN_FEATURES) | - (ifr->ifr_flags & TUN_FEATURES); - /* Make sure persistent devices do not get stuck in * xoff state. */ From 83c1f36f9880814b24cdf6c2f91f66f61db65326 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 10 Apr 2018 16:28:56 +0200 Subject: [PATCH 20/22] tun: send netlink notification when the device is modified I added dumping of link information about tun devices over netlink in commit 1ec010e70593 ("tun: export flags, uid, gid, queue information over netlink"), but didn't add the missing netlink notifications when the device's exported properties change. This patch adds notifications when owner/group or flags are modified, when queues are attached/detached, and when a tun fd is closed. Reported-by: Thomas Haller Fixes: 1ec010e70593 ("tun: export flags, uid, gid, queue information over netlink") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/net/tun.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c9e68fd76a37..28583aa0c17d 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -743,8 +743,15 @@ static void __tun_detach(struct tun_file *tfile, bool clean) static void tun_detach(struct tun_file *tfile, bool clean) { + struct tun_struct *tun; + struct net_device *dev; + rtnl_lock(); + tun = rtnl_dereference(tfile->tun); + dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); + if (dev) + netdev_state_change(dev); rtnl_unlock(); } @@ -2562,13 +2569,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) /* One or more queue has already been attached, no need * to initialize the device again. */ + netdev_state_change(dev); return 0; } tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); - } - else { + + netdev_state_change(dev); + } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? @@ -2808,6 +2817,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) } else ret = -EINVAL; + if (ret >= 0) + netdev_state_change(tun->dev); + unlock: rtnl_unlock(); return ret; @@ -2848,6 +2860,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned int ifindex; int le; int ret; + bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { @@ -2944,10 +2957,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); + do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); + do_notify = true; } tun_debug(KERN_INFO, tun, "persist %s\n", @@ -2962,6 +2977,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } tun->owner = owner; + do_notify = true; tun_debug(KERN_INFO, tun, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; @@ -2974,6 +2990,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } tun->group = group; + do_notify = true; tun_debug(KERN_INFO, tun, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; @@ -3133,6 +3150,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } + if (do_notify) + netdev_state_change(tun->dev); + unlock: rtnl_unlock(); if (tun) From 6b9f34239b00e6956a267abed2bc559ede556ad6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 10 Apr 2018 21:01:12 +0200 Subject: [PATCH 21/22] l2tp: fix races in tunnel creation l2tp_tunnel_create() inserts the new tunnel into the namespace's tunnel list and sets the socket's ->sk_user_data field, before returning it to the caller. Therefore, there are two ways the tunnel can be accessed and freed, before the caller even had the opportunity to take a reference. In practice, syzbot could crash the module by closing the socket right after a new tunnel was returned to pppol2tp_create(). This patch moves tunnel registration out of l2tp_tunnel_create(), so that the caller can safely hold a reference before publishing the tunnel. This second step is done with the new l2tp_tunnel_register() function, which is now responsible for associating the tunnel to its socket and for inserting it into the namespace's list. While moving the code to l2tp_tunnel_register(), a few modifications have been done. First, the socket validation tests are done in a helper function, for clarity. Also, modifying the socket is now done after having inserted the tunnel to the namespace's tunnels list. This will allow insertion to fail, without having to revert theses modifications in the error path (a followup patch will check for duplicate tunnels before insertion). Either the socket is a kernel socket which we control, or it is a user-space socket for which we have a reference on the file descriptor. In any case, the socket isn't going to be closed from under us. Reported-by: syzbot+fbeeb5c3b538e8545644@syzkaller.appspotmail.com Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 192 ++++++++++++++++++---------------------- net/l2tp/l2tp_core.h | 3 + net/l2tp/l2tp_netlink.c | 16 +++- net/l2tp/l2tp_ppp.c | 9 ++ 4 files changed, 110 insertions(+), 110 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 14b67dfacc4b..afb42d142807 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1436,74 +1436,11 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 { struct l2tp_tunnel *tunnel = NULL; int err; - struct socket *sock = NULL; - struct sock *sk = NULL; - struct l2tp_net *pn; enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP; - /* Get the tunnel socket from the fd, which was opened by - * the userspace L2TP daemon. If not specified, create a - * kernel socket. - */ - if (fd < 0) { - err = l2tp_tunnel_sock_create(net, tunnel_id, peer_tunnel_id, - cfg, &sock); - if (err < 0) - goto err; - } else { - sock = sockfd_lookup(fd, &err); - if (!sock) { - pr_err("tunl %u: sockfd_lookup(fd=%d) returned %d\n", - tunnel_id, fd, err); - err = -EBADF; - goto err; - } - - /* Reject namespace mismatches */ - if (!net_eq(sock_net(sock->sk), net)) { - pr_err("tunl %u: netns mismatch\n", tunnel_id); - err = -EINVAL; - goto err; - } - } - - sk = sock->sk; - if (cfg != NULL) encap = cfg->encap; - /* Quick sanity checks */ - err = -EPROTONOSUPPORT; - if (sk->sk_type != SOCK_DGRAM) { - pr_debug("tunl %hu: fd %d wrong socket type\n", - tunnel_id, fd); - goto err; - } - switch (encap) { - case L2TP_ENCAPTYPE_UDP: - if (sk->sk_protocol != IPPROTO_UDP) { - pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n", - tunnel_id, fd, sk->sk_protocol, IPPROTO_UDP); - goto err; - } - break; - case L2TP_ENCAPTYPE_IP: - if (sk->sk_protocol != IPPROTO_L2TP) { - pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n", - tunnel_id, fd, sk->sk_protocol, IPPROTO_L2TP); - goto err; - } - break; - } - - /* Check if this socket has already been prepped */ - tunnel = l2tp_tunnel(sk); - if (tunnel != NULL) { - /* This socket has already been prepped */ - err = -EBUSY; - goto err; - } - tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL); if (tunnel == NULL) { err = -ENOMEM; @@ -1520,72 +1457,113 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 rwlock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; - /* The net we belong to */ - tunnel->l2tp_net = net; - pn = l2tp_pernet(net); - if (cfg != NULL) tunnel->debug = cfg->debug; - /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ tunnel->encap = encap; - if (encap == L2TP_ENCAPTYPE_UDP) { - struct udp_tunnel_sock_cfg udp_cfg = { }; - udp_cfg.sk_user_data = tunnel; - udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP; - udp_cfg.encap_rcv = l2tp_udp_encap_recv; - udp_cfg.encap_destroy = l2tp_udp_encap_destroy; - - setup_udp_tunnel_sock(net, sock, &udp_cfg); - } else { - sk->sk_user_data = tunnel; - } - - /* Bump the reference count. The tunnel context is deleted - * only when this drops to zero. A reference is also held on - * the tunnel socket to ensure that it is not released while - * the tunnel is extant. Must be done before sk_destruct is - * set. - */ refcount_set(&tunnel->ref_count, 1); - sock_hold(sk); - tunnel->sock = sk; tunnel->fd = fd; - /* Hook on the tunnel socket destructor so that we can cleanup - * if the tunnel socket goes away. - */ - tunnel->old_sk_destruct = sk->sk_destruct; - sk->sk_destruct = &l2tp_tunnel_destruct; - lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock"); - - sk->sk_allocation = GFP_ATOMIC; - /* Init delete workqueue struct */ INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work); - /* Add tunnel to our list */ INIT_LIST_HEAD(&tunnel->list); - spin_lock_bh(&pn->l2tp_tunnel_list_lock); - list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); - spin_unlock_bh(&pn->l2tp_tunnel_list_lock); err = 0; err: if (tunnelp) *tunnelp = tunnel; - /* If tunnel's socket was created by the kernel, it doesn't - * have a file. - */ - if (sock && sock->file) - sockfd_put(sock); - return err; } EXPORT_SYMBOL_GPL(l2tp_tunnel_create); +static int l2tp_validate_socket(const struct sock *sk, const struct net *net, + enum l2tp_encap_type encap) +{ + if (!net_eq(sock_net(sk), net)) + return -EINVAL; + + if (sk->sk_type != SOCK_DGRAM) + return -EPROTONOSUPPORT; + + if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) || + (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP)) + return -EPROTONOSUPPORT; + + if (sk->sk_user_data) + return -EBUSY; + + return 0; +} + +int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, + struct l2tp_tunnel_cfg *cfg) +{ + struct l2tp_net *pn; + struct socket *sock; + struct sock *sk; + int ret; + + if (tunnel->fd < 0) { + ret = l2tp_tunnel_sock_create(net, tunnel->tunnel_id, + tunnel->peer_tunnel_id, cfg, + &sock); + if (ret < 0) + goto err; + } else { + sock = sockfd_lookup(tunnel->fd, &ret); + if (!sock) + goto err; + + ret = l2tp_validate_socket(sock->sk, net, tunnel->encap); + if (ret < 0) + goto err_sock; + } + + sk = sock->sk; + + sock_hold(sk); + tunnel->sock = sk; + tunnel->l2tp_net = net; + + pn = l2tp_pernet(net); + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { + struct udp_tunnel_sock_cfg udp_cfg = { + .sk_user_data = tunnel, + .encap_type = UDP_ENCAP_L2TPINUDP, + .encap_rcv = l2tp_udp_encap_recv, + .encap_destroy = l2tp_udp_encap_destroy, + }; + + setup_udp_tunnel_sock(net, sock, &udp_cfg); + } else { + sk->sk_user_data = tunnel; + } + + tunnel->old_sk_destruct = sk->sk_destruct; + sk->sk_destruct = &l2tp_tunnel_destruct; + lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, + "l2tp_sock"); + sk->sk_allocation = GFP_ATOMIC; + + if (tunnel->fd >= 0) + sockfd_put(sock); + + return 0; + +err_sock: + sockfd_put(sock); +err: + return ret; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_register); + /* This function is used by the netlink TUNNEL_DELETE command. */ void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 2718d0b284d0..12f0fa82f162 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -226,6 +226,9 @@ struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth); int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); +int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, + struct l2tp_tunnel_cfg *cfg); + void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index e7ea9c4b89ff..45db9b73eb1a 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -251,9 +251,19 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info break; } - if (ret >= 0) - ret = l2tp_tunnel_notify(&l2tp_nl_family, info, - tunnel, L2TP_CMD_TUNNEL_CREATE); + if (ret < 0) + goto out; + + l2tp_tunnel_inc_refcount(tunnel); + ret = l2tp_tunnel_register(tunnel, net, &cfg); + if (ret < 0) { + kfree(tunnel); + goto out; + } + ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, + L2TP_CMD_TUNNEL_CREATE); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index d6deca11da19..896bbca9bdaa 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -698,6 +698,15 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); if (error < 0) goto end; + + l2tp_tunnel_inc_refcount(tunnel); + error = l2tp_tunnel_register(tunnel, sock_net(sk), + &tcfg); + if (error < 0) { + kfree(tunnel); + goto end; + } + drop_tunnel = true; } } else { /* Error if we can't find the tunnel */ From f6cd651b056ffd3b4e8496afd44d4ed44bf69136 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 10 Apr 2018 21:01:13 +0200 Subject: [PATCH 22/22] l2tp: fix race in duplicate tunnel detection We can't use l2tp_tunnel_find() to prevent l2tp_nl_cmd_tunnel_create() from creating a duplicate tunnel. A tunnel can be concurrently registered after l2tp_tunnel_find() returns. Therefore, searching for duplicates must be done at registration time. Finally, remove l2tp_tunnel_find() entirely as it isn't use anywhere anymore. Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 35 ++++++++++++++--------------------- net/l2tp/l2tp_core.h | 1 - net/l2tp/l2tp_netlink.c | 6 ------ 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index afb42d142807..0fbd3ee26165 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -335,26 +335,6 @@ err_tlock: } EXPORT_SYMBOL_GPL(l2tp_session_register); -/* Lookup a tunnel by id - */ -struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id) -{ - struct l2tp_tunnel *tunnel; - struct l2tp_net *pn = l2tp_pernet(net); - - rcu_read_lock_bh(); - list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (tunnel->tunnel_id == tunnel_id) { - rcu_read_unlock_bh(); - return tunnel; - } - } - rcu_read_unlock_bh(); - - return NULL; -} -EXPORT_SYMBOL_GPL(l2tp_tunnel_find); - struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth) { struct l2tp_net *pn = l2tp_pernet(net); @@ -1501,6 +1481,7 @@ static int l2tp_validate_socket(const struct sock *sk, const struct net *net, int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg) { + struct l2tp_tunnel *tunnel_walk; struct l2tp_net *pn; struct socket *sock; struct sock *sk; @@ -1529,7 +1510,16 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, tunnel->l2tp_net = net; pn = l2tp_pernet(net); + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_for_each_entry(tunnel_walk, &pn->l2tp_tunnel_list, list) { + if (tunnel_walk->tunnel_id == tunnel->tunnel_id) { + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + + ret = -EEXIST; + goto err_sock; + } + } list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); @@ -1558,7 +1548,10 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, return 0; err_sock: - sockfd_put(sock); + if (tunnel->fd < 0) + sock_release(sock); + else + sockfd_put(sock); err: return ret; } diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 12f0fa82f162..ba33cbec71eb 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -220,7 +220,6 @@ struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); -struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth); int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 45db9b73eb1a..b05dbd9ffcb2 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -236,12 +236,6 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info if (info->attrs[L2TP_ATTR_DEBUG]) cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel != NULL) { - ret = -EEXIST; - goto out; - } - ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: