From ca876c7483b697b498868b1f575997191b077885 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Thu, 12 Jul 2018 17:25:06 +0200 Subject: [PATCH 01/23] gpiolib-acpi: make sure we trigger edge events at least once on boot On some systems using edge triggered ACPI Event Interrupts, the initial state at boot is not set up by the firmware, instead relying on the edge irq event handler running at least once to set up the initial state. 2 known examples of this are: 1) The Surface 3 has its _LID state controlled by an ACPI operation region triggered by a GPIO event: OperationRegion (GPOR, GeneralPurposeIo, Zero, One) Field (GPOR, ByteAcc, NoLock, Preserve) { Connection ( GpioIo (Shared, PullNone, 0x0000, 0x0000, IoRestrictionNone, "\\_SB.GPO0", 0x00, ResourceConsumer, , ) { // Pin list 0x004C } ), HELD, 1 } Method (_E4C, 0, Serialized) // _Exx: Edge-Triggered GPE { If ((HELD == One)) { ^^LID.LIDB = One } Else { ^^LID.LIDB = Zero Notify (LID, 0x80) // Status Change } Notify (^^PCI0.SPI1.NTRG, One) // Device Check } Currently, the state of LIDB is wrong until the user actually closes or opens the cover. We need to trigger the GPIO event once to update the internal ACPI state. Coincidentally, this also enables the Surface 3 integrated HID sensor hub which also requires an ACPI gpio operation region to start initialization. 2) Various Bay Trail based tablets come with an external USB mux and TI T1210B USB phy to enable USB gadget mode. The mux is controlled by a GPIO which is controlled by an edge triggered ACPI Event Interrupt which monitors the micro-USB ID pin. When the tablet is connected to a PC (or no cable is plugged in), the ID pin is high and the tablet should be in gadget mode. But the GPIO controlling the mux is initialized by the firmware so that the USB data lines are muxed to the host controller.
This means that if the user wants to use gadget mode, the user needs to first plug in a host-cable to force the ID pin low and then unplug it and connect the tablet to a PC, to get the ACPI event handler to run and switch the mux to device mode. This commit fixes both by running the event-handler once on boot. Note that the running of the event-handler is done from a late_initcall; this is done because the handler AML code may rely on OperationRegions registered by other builtin drivers. This avoids errors like these: [ 0.133026] ACPI Error: No handler for Region [XSCG] ((____ptrval____)) [GenericSerialBus] (20180531/evregion-132) [ 0.133036] ACPI Error: Region GenericSerialBus (ID=9) has no handler (20180531/exfldio-265) [ 0.133046] ACPI Error: Method parse/execution failed \_SB.GPO2._E12, AE_NOT_EXIST (20180531/psparse-516) Signed-off-by: Benjamin Tissoires [hdegoede: Document BYT USB mux reliance on initial trigger] [hdegoede: Run event handler from a late_initcall, rather than immediately] Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Acked-by: Mika Westerberg Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-acpi.c | 56 ++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index e2232cbcec8b..addd9fecc198 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -25,6 +25,7 @@ struct acpi_gpio_event { struct list_head node; + struct list_head initial_sync_list; acpi_handle handle; unsigned int pin; unsigned int irq; @@ -50,6 +51,9 @@ struct acpi_gpio_chip { struct list_head events; }; +static LIST_HEAD(acpi_gpio_initial_sync_list); +static DEFINE_MUTEX(acpi_gpio_initial_sync_list_lock); + static int acpi_gpiochip_find(struct gpio_chip *gc, void *data) { if (!gc->parent) @@ -85,6 +89,21 @@ static struct gpio_desc *acpi_get_gpiod(char *path, int pin) return gpiochip_get_desc(chip, pin); } +static void
acpi_gpio_add_to_initial_sync_list(struct acpi_gpio_event *event) +{ + mutex_lock(&acpi_gpio_initial_sync_list_lock); + list_add(&event->initial_sync_list, &acpi_gpio_initial_sync_list); + mutex_unlock(&acpi_gpio_initial_sync_list_lock); +} + +static void acpi_gpio_del_from_initial_sync_list(struct acpi_gpio_event *event) +{ + mutex_lock(&acpi_gpio_initial_sync_list_lock); + if (!list_empty(&event->initial_sync_list)) + list_del_init(&event->initial_sync_list); + mutex_unlock(&acpi_gpio_initial_sync_list_lock); +} + static irqreturn_t acpi_gpio_irq_handler(int irq, void *data) { struct acpi_gpio_event *event = data; @@ -136,7 +155,7 @@ static acpi_status acpi_gpiochip_request_interrupt(struct acpi_resource *ares, irq_handler_t handler = NULL; struct gpio_desc *desc; unsigned long irqflags; - int ret, pin, irq; + int ret, pin, irq, value; if (!acpi_gpio_get_irq_resource(ares, &agpio)) return AE_OK; @@ -167,6 +186,8 @@ static acpi_status acpi_gpiochip_request_interrupt(struct acpi_resource *ares, gpiod_direction_input(desc); + value = gpiod_get_value(desc); + ret = gpiochip_lock_as_irq(chip, pin); if (ret) { dev_err(chip->parent, "Failed to lock GPIO as interrupt\n"); @@ -208,6 +229,7 @@ static acpi_status acpi_gpiochip_request_interrupt(struct acpi_resource *ares, event->irq = irq; event->pin = pin; event->desc = desc; + INIT_LIST_HEAD(&event->initial_sync_list); ret = request_threaded_irq(event->irq, NULL, handler, irqflags, "ACPI:Event", event); @@ -222,6 +244,18 @@ static acpi_status acpi_gpiochip_request_interrupt(struct acpi_resource *ares, enable_irq_wake(irq); list_add_tail(&event->node, &acpi_gpio->events); + + /* + * Make sure we trigger the initial state of the IRQ when using RISING + * or FALLING. Note we run the handlers on late_init, the AML code + * may refer to OperationRegions from other (builtin) drivers which + * may be probed after us. 
+ */ + if (handler == acpi_gpio_irq_handler && + (((irqflags & IRQF_TRIGGER_RISING) && value == 1) || + ((irqflags & IRQF_TRIGGER_FALLING) && value == 0))) + acpi_gpio_add_to_initial_sync_list(event); + return AE_OK; fail_free_event: @@ -294,6 +328,8 @@ void acpi_gpiochip_free_interrupts(struct gpio_chip *chip) list_for_each_entry_safe_reverse(event, ep, &acpi_gpio->events, node) { struct gpio_desc *desc; + acpi_gpio_del_from_initial_sync_list(event); + if (irqd_is_wakeup_set(irq_get_irq_data(event->irq))) disable_irq_wake(event->irq); @@ -1158,3 +1194,21 @@ bool acpi_can_fallback_to_crs(struct acpi_device *adev, const char *con_id) return con_id == NULL; } + +/* Sync the initial state of handlers after all builtin drivers have probed */ +static int acpi_gpio_initial_sync(void) +{ + struct acpi_gpio_event *event, *ep; + + mutex_lock(&acpi_gpio_initial_sync_list_lock); + list_for_each_entry_safe(event, ep, &acpi_gpio_initial_sync_list, + initial_sync_list) { + acpi_evaluate_object(event->handle, NULL, NULL, NULL); + list_del_init(&event->initial_sync_list); + } + mutex_unlock(&acpi_gpio_initial_sync_list_lock); + + return 0; +} +/* We must use _sync so that this runs after the first deferred_probe run */ +late_initcall_sync(acpi_gpio_initial_sync); From e70a3aad44cc8b24986687ffc98c4a4f6ecf25ea Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Aug 2018 23:20:38 -0700 Subject: [PATCH 02/23] ipv6: fix double refcount of fib6_metrics All the callers of ip6_rt_copy_init()/rt6_set_from() hold refcnt of the "from" fib6_info, so there is no need to hold fib6_metrics refcnt again, because fib6_metrics refcnt is only released when fib6_info is gone, that is, they have the same life time, so the whole fib6_metrics refcnt can be removed actually. This fixes a kmemleak warning reported by Sabrina. 
Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Reported-by: Sabrina Dubroca Cc: Sabrina Dubroca Cc: David Ahern Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ec18b3ce8b6d..7208c16302f6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -978,10 +978,6 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); - if (from->fib6_metrics != &dst_default_metrics) { - rt->dst._metrics |= DST_METRICS_REFCOUNTED; - refcount_inc(&from->fib6_metrics->refcnt); - } } /* Caller must already hold reference to @ort */ From 82a40777de12728dedf4075453b694f0d1baee80 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 5 Aug 2018 22:46:07 +0800 Subject: [PATCH 03/23] ip6_tunnel: use the right value for ipv4 min mtu check in ip6_tnl_xmit According to RFC791, 68 bytes is the minimum size of IPv4 datagram every device must be able to forward without further fragmentation while 576 bytes is the minimum size of IPv4 datagram every device has to be able to receive, so in ip6_tnl_xmit(), 68(IPV4_MIN_MTU) should be the right value for the ipv4 min mtu check in ip6_tnl_xmit. While at it, change to use max() instead of if statement. Fixes: c9fefa08190f ("ip6_tunnel: get the min mtu properly in ip6_tnl_xmit") Reported-by: Sabrina Dubroca Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- net/ipv6/ip6_tunnel.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 00e138a44cbb..1cc9650af9fb 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1133,12 +1133,8 @@ route_lookup: max_headroom += 8; mtu -= 8; } - if (skb->protocol == htons(ETH_P_IPV6)) { - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - } else if (mtu < 576) { - mtu = 576; - } + mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? + IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { From 4576cd469d980317c4edd9173f8b694aa71ea3a3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 6 Aug 2018 10:38:34 -0400 Subject: [PATCH 04/23] packet: refine ring v3 block size test to hold one frame TPACKET_V3 stores variable length frames in fixed length blocks. Blocks must be able to store a block header, optional private space and at least one minimum sized frame. Frames, even for a zero snaplen packet, store metadata headers and optional reserved space. In the block size bounds check, ensure that the frame of the chosen configuration fits. This includes sockaddr_ll and optional tp_reserve. Syzbot was able to construct a ring with insufficient room for the sockaddr_ll in the header of a zero-length frame, triggering an out-of-bounds write in dev_parse_header. Convert the comparison to less than, as zero is a valid snap len. This matches the test for minimum tp_frame_size immediately below. Fixes: f6fb8f100b80 ("af-packet: TPACKET_V3 flexible buffer implementation.") Fixes: eb73190f4fbe ("net/packet: refine check for priv area size") Reported-by: syzbot Signed-off-by: Willem de Bruijn Signed-off-by: David S.
Miller --- net/packet/af_packet.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9b27d0cd766d..e6445d8f3f57 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4226,6 +4226,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } if (req->tp_block_nr) { + unsigned int min_frame_size; + /* Sanity tests and some calculations */ err = -EBUSY; if (unlikely(rb->pg_vec)) @@ -4248,12 +4250,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; + min_frame_size = po->tp_hdrlen + po->tp_reserve; if (po->tp_version >= TPACKET_V3 && - req->tp_block_size <= - BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + sizeof(struct tpacket3_hdr)) + req->tp_block_size < + BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size) goto out; - if (unlikely(req->tp_frame_size < po->tp_hdrlen + - po->tp_reserve)) + if (unlikely(req->tp_frame_size < min_frame_size)) goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; From f10dc56c64bb662822475304508c1ce99f194e70 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 29 Jul 2018 16:52:30 +0200 Subject: [PATCH 05/23] crypto: arm64 - revert NEON yield for fast AEAD implementations As it turns out, checking the TIF_NEED_RESCHED flag after each iteration results in a significant performance regression (~10%) when running fast algorithms (i.e., ones that use special instructions and operate in the < 4 cycles per byte range) on in-order cores with comparatively slow memory accesses such as the Cortex-A53. 
Given the speed of these ciphers, and the fact that the page based nature of the AEAD scatterwalk API guarantees that the core NEON transform is never invoked with more than a single page's worth of input, we can estimate the worst case duration of any resulting scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages, processing a page's worth of input at 4 cycles per byte results in a delay of ~250 us, which is a reasonable upper bound. So let's remove the yield checks from the fused AES-CCM and AES-GCM routines entirely. This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3. Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...") Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...") Signed-off-by: Ard Biesheuvel Acked-by: Herbert Xu Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-ce-ccm-core.S | 148 ++++++++++------------------ arch/arm64/crypto/ghash-ce-core.S | 78 +++++---------- 2 files changed, 80 insertions(+), 146 deletions(-) diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index 88f5aef7934c..e3a375c4cb83 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -19,33 +19,24 @@ * u32 *macp, u8 const rk[], u32 rounds); */ ENTRY(ce_aes_ccm_auth_data) - frame_push 7 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - - ldr w25, [x22] /* leftover from prev round? */ + ldr w8, [x3] /* leftover from prev round? */ ld1 {v0.16b}, [x0] /* load mac */ - cbz w25, 1f - sub w25, w25, #16 + cbz w8, 1f + sub w8, w8, #16 eor v1.16b, v1.16b, v1.16b -0: ldrb w7, [x20], #1 /* get 1 byte of input */ - subs w21, w21, #1 - add w25, w25, #1 +0: ldrb w7, [x1], #1 /* get 1 byte of input */ + subs w2, w2, #1 + add w8, w8, #1 ins v1.b[0], w7 ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ beq 8f /* out of input? 
*/ - cbnz w25, 0b + cbnz w8, 0b eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.4s}, [x23] /* load first round key */ - prfm pldl1strm, [x20] - cmp w24, #12 /* which key size? */ - add x6, x23, #16 - sub w7, w24, #2 /* modified # of rounds */ +1: ld1 {v3.4s}, [x4] /* load first round key */ + prfm pldl1strm, [x1] + cmp w5, #12 /* which key size? */ + add x6, x4, #16 + sub w7, w5, #2 /* modified # of rounds */ bmi 2f bne 5f mov v5.16b, v3.16b @@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data) ld1 {v5.4s}, [x6], #16 /* load next round key */ bpl 3b aese v0.16b, v4.16b - subs w21, w21, #16 /* last data? */ + subs w2, w2, #16 /* last data? */ eor v0.16b, v0.16b, v5.16b /* final round */ bmi 6f - ld1 {v1.16b}, [x20], #16 /* load next input block */ + ld1 {v1.16b}, [x1], #16 /* load next input block */ eor v0.16b, v0.16b, v1.16b /* xor with mac */ - beq 6f - - if_will_cond_yield_neon - st1 {v0.16b}, [x19] /* store mac */ - do_cond_yield_neon - ld1 {v0.16b}, [x19] /* reload mac */ - endif_yield_neon - - b 1b -6: st1 {v0.16b}, [x19] /* store mac */ + bne 1b +6: st1 {v0.16b}, [x0] /* store mac */ beq 10f - adds w21, w21, #16 + adds w2, w2, #16 beq 10f - mov w25, w21 -7: ldrb w7, [x20], #1 + mov w8, w2 +7: ldrb w7, [x1], #1 umov w6, v0.b[0] eor w6, w6, w7 - strb w6, [x19], #1 - subs w21, w21, #1 + strb w6, [x0], #1 + subs w2, w2, #1 beq 10f ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ b 7b -8: mov w7, w25 - add w25, w25, #16 +8: mov w7, w8 + add w8, w8, #16 9: ext v1.16b, v1.16b, v1.16b, #1 adds w7, w7, #1 bne 9b eor v0.16b, v0.16b, v1.16b - st1 {v0.16b}, [x19] -10: str w25, [x22] - - frame_pop + st1 {v0.16b}, [x0] +10: str w8, [x3] ret ENDPROC(ce_aes_ccm_auth_data) @@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final) ENDPROC(ce_aes_ccm_final) .macro aes_ccm_do_crypt,enc - frame_push 8 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - mov x25, x6 - - ldr x26, [x25, #8] /* load lower ctr */ - ld1 {v0.16b}, [x24] /* load mac */ -CPU_LE( rev 
x26, x26 ) /* keep swabbed ctr in reg */ + ldr x8, [x6, #8] /* load lower ctr */ + ld1 {v0.16b}, [x5] /* load mac */ +CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ 0: /* outer loop */ - ld1 {v1.8b}, [x25] /* load upper ctr */ - prfm pldl1strm, [x20] - add x26, x26, #1 - rev x9, x26 - cmp w23, #12 /* which key size? */ - sub w7, w23, #2 /* get modified # of rounds */ + ld1 {v1.8b}, [x6] /* load upper ctr */ + prfm pldl1strm, [x1] + add x8, x8, #1 + rev x9, x8 + cmp w4, #12 /* which key size? */ + sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.4s}, [x22] /* load first round key */ - add x10, x22, #16 + ld1 {v3.4s}, [x3] /* load first round key */ + add x10, x3, #16 bmi 1f bne 4f mov v5.16b, v3.16b @@ -194,9 +165,9 @@ CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */ bpl 2b aese v0.16b, v4.16b aese v1.16b, v4.16b - subs w21, w21, #16 - bmi 7f /* partial block? */ - ld1 {v2.16b}, [x20], #16 /* load next input block */ + subs w2, w2, #16 + bmi 6f /* partial block? 
*/ + ld1 {v2.16b}, [x1], #16 /* load next input block */ .if \enc == 1 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ @@ -205,29 +176,18 @@ CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */ eor v1.16b, v2.16b, v5.16b /* final round enc */ .endif eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ - st1 {v1.16b}, [x19], #16 /* write output block */ - beq 5f + st1 {v1.16b}, [x0], #16 /* write output block */ + bne 0b +CPU_LE( rev x8, x8 ) + st1 {v0.16b}, [x5] /* store mac */ + str x8, [x6, #8] /* store lsb end of ctr (BE) */ +5: ret - if_will_cond_yield_neon - st1 {v0.16b}, [x24] /* store mac */ - do_cond_yield_neon - ld1 {v0.16b}, [x24] /* reload mac */ - endif_yield_neon - - b 0b -5: -CPU_LE( rev x26, x26 ) - st1 {v0.16b}, [x24] /* store mac */ - str x26, [x25, #8] /* store lsb end of ctr (BE) */ - -6: frame_pop - ret - -7: eor v0.16b, v0.16b, v5.16b /* final round mac */ +6: eor v0.16b, v0.16b, v5.16b /* final round mac */ eor v1.16b, v1.16b, v5.16b /* final round enc */ - st1 {v0.16b}, [x24] /* store mac */ - add w21, w21, #16 /* process partial tail block */ -8: ldrb w9, [x20], #1 /* get 1 byte of input */ + st1 {v0.16b}, [x5] /* store mac */ + add w2, w2, #16 /* process partial tail block */ +7: ldrb w9, [x1], #1 /* get 1 byte of input */ umov w6, v1.b[0] /* get top crypted ctr byte */ umov w7, v0.b[0] /* get top mac byte */ .if \enc == 1 @@ -237,13 +197,13 @@ CPU_LE( rev x26, x26 ) eor w9, w9, w6 eor w7, w7, w9 .endif - strb w9, [x19], #1 /* store out byte */ - strb w7, [x24], #1 /* store mac byte */ - subs w21, w21, #1 - beq 6b + strb w9, [x0], #1 /* store out byte */ + strb w7, [x5], #1 /* store mac byte */ + subs w2, w2, #1 + beq 5b ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ - b 8b + b 7b .endm /* diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index dcffb9e77589..c723647b37db 100644 
--- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8) .endm .macro pmull_gcm_do_crypt, enc - frame_push 10 + ld1 {SHASH.2d}, [x4] + ld1 {XL.2d}, [x1] + ldr x8, [x5, #8] // load lower counter - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - mov x25, x6 - mov x26, x7 - .if \enc == 1 - ldr x27, [sp, #96] // first stacked arg - .endif - - ldr x28, [x24, #8] // load lower counter -CPU_LE( rev x28, x28 ) - -0: mov x0, x25 - load_round_keys w26, x0 - ld1 {SHASH.2d}, [x23] - ld1 {XL.2d}, [x20] + load_round_keys w7, x6 movi MASK.16b, #0xe1 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 +CPU_LE( rev x8, x8 ) shl MASK.2d, MASK.2d, #57 eor SHASH2.16b, SHASH2.16b, SHASH.16b .if \enc == 1 - ld1 {KS.16b}, [x27] + ldr x10, [sp] + ld1 {KS.16b}, [x10] .endif -1: ld1 {CTR.8b}, [x24] // load upper counter - ld1 {INP.16b}, [x22], #16 - rev x9, x28 - add x28, x28, #1 - sub w19, w19, #1 +0: ld1 {CTR.8b}, [x5] // load upper counter + ld1 {INP.16b}, [x3], #16 + rev x9, x8 + add x8, x8, #1 + sub w0, w0, #1 ins CTR.d[1], x9 // set lower counter .if \enc == 1 eor INP.16b, INP.16b, KS.16b // encrypt input - st1 {INP.16b}, [x21], #16 + st1 {INP.16b}, [x2], #16 .endif rev64 T1.16b, INP.16b - cmp w26, #12 - b.ge 4f // AES-192/256? + cmp w7, #12 + b.ge 2f // AES-192/256? 
-2: enc_round CTR, v21 +1: enc_round CTR, v21 ext T2.16b, XL.16b, XL.16b, #8 ext IN1.16b, T1.16b, T1.16b, #8 @@ -425,39 +411,27 @@ CPU_LE( rev x28, x28 ) .if \enc == 0 eor INP.16b, INP.16b, KS.16b - st1 {INP.16b}, [x21], #16 + st1 {INP.16b}, [x2], #16 .endif - cbz w19, 3f + cbnz w0, 0b + +CPU_LE( rev x8, x8 ) + st1 {XL.2d}, [x1] + str x8, [x5, #8] // store lower counter - if_will_cond_yield_neon - st1 {XL.2d}, [x20] .if \enc == 1 - st1 {KS.16b}, [x27] - .endif - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b - -3: st1 {XL.2d}, [x20] - .if \enc == 1 - st1 {KS.16b}, [x27] + st1 {KS.16b}, [x10] .endif -CPU_LE( rev x28, x28 ) - str x28, [x24, #8] // store lower counter - - frame_pop ret -4: b.eq 5f // AES-192? +2: b.eq 3f // AES-192? enc_round CTR, v17 enc_round CTR, v18 -5: enc_round CTR, v19 +3: enc_round CTR, v19 enc_round CTR, v20 - b 2b + b 1b .endm /* From 877ccce7cbe8409256616f5e6bdedb08ce2e82db Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 3 Aug 2018 13:37:50 +0200 Subject: [PATCH 06/23] crypto: x86/aegis,morus - Fix and simplify CPUID checks It turns out I had misunderstood how the x86_match_cpu() function works. It evaluates a logical OR of the matching conditions, not logical AND. This caused the CPU feature checks for AEGIS to pass even if only SSE2 (but not AES-NI) was supported (or vice versa), leading to potential crashes if something tried to use the registered algs. This patch switches the checks to a simpler method that is used e.g. in the Camellia x86 code. The patch also removes the MODULE_DEVICE_TABLE declarations which actually seem to cause the modules to be auto-loaded at boot, which is not desired. The crypto API on-demand module loading is sufficient. 
Fixes: 1d373d4e8e15 ("crypto: x86 - Add optimized AEGIS implementations") Fixes: 6ecc9d9ff91f ("crypto: x86 - Add optimized MORUS implementations") Signed-off-by: Ondrej Mosnacek Tested-by: Milan Broz Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-glue.c | 12 ++++-------- arch/x86/crypto/aegis128l-aesni-glue.c | 12 ++++-------- arch/x86/crypto/aegis256-aesni-glue.c | 12 ++++-------- arch/x86/crypto/morus1280-avx2-glue.c | 10 +++------- arch/x86/crypto/morus1280-sse2-glue.c | 10 +++------- arch/x86/crypto/morus640-sse2-glue.c | 10 +++------- 6 files changed, 21 insertions(+), 45 deletions(-) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 5de7c0d46edf..acd11b3bf639 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis128_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis128_aesni_alg, diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 876e4866e633..2071c3d1ae07 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128l_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis128l_aesni_module_init(void) { - if 
(!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis128l_aesni_alg, diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index 2b5dd3af8f4d..b5f2a8fd5a71 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis256_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis256_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis256_aesni_alg, diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c index f111f36d26dc..6634907d6ccd 100644 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); -static const struct x86_cpu_id avx2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AVX2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id); - static int __init crypto_morus1280_avx2_module_init(void) { - if (!x86_match_cpu(avx2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus1280_avx2_algs, diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c 
index 839270aa713c..95cf857d2cbb 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); -static const struct x86_cpu_id sse2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); - static int __init crypto_morus1280_sse2_module_init(void) { - if (!x86_match_cpu(sse2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus1280_sse2_algs, diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 26b47e2db8d2..615fb7bc9a32 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); -static const struct x86_cpu_id sse2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); - static int __init crypto_morus640_sse2_module_init(void) { - if (!x86_match_cpu(sse2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus640_sse2_algs, From adfb442dbbd31bd2138d02b5822914178170df33 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Aug 2018 18:22:38 +0100 Subject: [PATCH 07/23] cxgb4: mk_act_open_req() buggers ->{local, peer}_ip on big-endian hosts Unlike fs.val.lport and fs.val.fport, cxgb4_process_flow_match() sets fs.val.{l,f}ip to net-endian values without conversion - they come straight from flow_dissector_key_ipv4_addrs ->dst and ->src resp. So the assignment in mk_act_open_req() ought to be a straight copy. 
As far as I know, T4 PCIe cards do exist, so it's not as if that thing could only be found on little-endian systems... Signed-off-by: Al Viro Acked-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 00fc5f1afb1d..7dddb9e748b8 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -1038,10 +1038,8 @@ static void mk_act_open_req(struct filter_entry *f, struct sk_buff *skb, OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_filterid)); req->local_port = cpu_to_be16(f->fs.val.lport); req->peer_port = cpu_to_be16(f->fs.val.fport); - req->local_ip = f->fs.val.lip[0] | f->fs.val.lip[1] << 8 | - f->fs.val.lip[2] << 16 | f->fs.val.lip[3] << 24; - req->peer_ip = f->fs.val.fip[0] | f->fs.val.fip[1] << 8 | - f->fs.val.fip[2] << 16 | f->fs.val.fip[3] << 24; + memcpy(&req->local_ip, f->fs.val.lip, 4); + memcpy(&req->peer_ip, f->fs.val.fip, 4); req->opt0 = cpu_to_be64(NAGLE_V(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | DELACK_V(f->fs.hitcnts) | From a94cead71c4651da5649134e3cdc29b1f32327a2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 6 Aug 2018 17:50:45 +0100 Subject: [PATCH 08/23] net: thunderx: check for failed allocation lmac->dmacs The allocation of lmac->dmacs is not being checked for allocation failure. Add the check. Fixes: 3a34ecfd9d3f ("net: thunderx: add MAC address filter tracking for LMAC") Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index 5d08d2aeb172..e337da6ba2a4 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1083,6 +1083,8 @@ static int bgx_lmac_enable(struct bgx *bgx, u8 lmacid) lmac->dmacs_count = (RX_DMAC_COUNT / bgx->lmac_count); lmac->dmacs = kcalloc(lmac->dmacs_count, sizeof(*lmac->dmacs), GFP_KERNEL); + if (!lmac->dmacs) + return -ENOMEM; /* Enable lmac */ bgx_reg_modify(bgx, lmacid, BGX_CMRX_CFG, CMR_EN); From 455f05ecd2b219e9a216050796d30c830d9bc393 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 6 Aug 2018 11:06:02 -0700 Subject: [PATCH 09/23] vsock: split dwork to avoid reinitializations syzbot reported that we reinitialize an active delayed work in vsock_stream_connect(): ODEBUG: init active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x90 kernel/workqueue.c:1414 WARNING: CPU: 1 PID: 11518 at lib/debugobjects.c:329 debug_print_object+0x16a/0x210 lib/debugobjects.c:326 The pattern is apparently wrong, we should only initialize the delayed work once and could repeatedly schedule it. So we have to move the initializations out to the allocation side. And to avoid confusion, we can split the shared dwork into two, instead of re-using the same one. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reported-by: Cc: Andy king Cc: Stefan Hajnoczi Cc: Jorgen Hansen Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- include/net/af_vsock.h | 4 ++-- net/vmw_vsock/af_vsock.c | 15 ++++++++------- net/vmw_vsock/vmci_transport.c | 3 +-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 9324ac2d9ff2..43913ae79f64 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -64,7 +64,8 @@ struct vsock_sock { struct list_head pending_links; struct list_head accept_queue; bool rejected; - struct delayed_work dwork; + struct delayed_work connect_work; + struct delayed_work pending_work; struct delayed_work close_work; bool close_work_scheduled; u32 peer_shutdown; @@ -77,7 +78,6 @@ struct vsock_sock { s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); -void vsock_pending_work(struct work_struct *work); struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index c1076c19b858..ab27a2872935 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -451,14 +451,14 @@ static int vsock_send_shutdown(struct sock *sk, int mode) return transport->shutdown(vsock_sk(sk), mode); } -void vsock_pending_work(struct work_struct *work) +static void vsock_pending_work(struct work_struct *work) { struct sock *sk; struct sock *listener; struct vsock_sock *vsk; bool cleanup; - vsk = container_of(work, struct vsock_sock, dwork.work); + vsk = container_of(work, struct vsock_sock, pending_work.work); sk = sk_vsock(vsk); listener = vsk->listener; cleanup = true; @@ -498,7 +498,6 @@ out: sock_put(sk); sock_put(listener); } -EXPORT_SYMBOL_GPL(vsock_pending_work); /**** SOCKET OPERATIONS ****/ @@ -597,6 +596,8 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) return retval; } +static void vsock_connect_timeout(struct work_struct *work); + struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, @@ -638,6 +639,8 @@ struct 
sock *__vsock_create(struct net *net, vsk->sent_request = false; vsk->ignore_connecting_rst = false; vsk->peer_shutdown = 0; + INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); + INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); psk = parent ? vsock_sk(parent) : NULL; if (parent) { @@ -1117,7 +1120,7 @@ static void vsock_connect_timeout(struct work_struct *work) struct vsock_sock *vsk; int cancel = 0; - vsk = container_of(work, struct vsock_sock, dwork.work); + vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); lock_sock(sk); @@ -1221,9 +1224,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, * timeout fires. */ sock_hold(sk); - INIT_DELAYED_WORK(&vsk->dwork, - vsock_connect_timeout); - schedule_delayed_work(&vsk->dwork, timeout); + schedule_delayed_work(&vsk->connect_work, timeout); /* Skip ahead to preserve error code set above. */ goto out_wait; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index a7a73ffe675b..cb332adb84cd 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1094,8 +1094,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vpending->listener = sk; sock_hold(sk); sock_hold(pending); - INIT_DELAYED_WORK(&vpending->dwork, vsock_pending_work); - schedule_delayed_work(&vpending->dwork, HZ); + schedule_delayed_work(&vpending->pending_work, HZ); out: return err; From 37436d9c0e8f62c3eebe204ff5776ff31fd64658 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 7 Aug 2018 15:52:32 +0800 Subject: [PATCH 10/23] tipc: fix an interrupt unsafe locking scenario Commit 9faa89d4ed9d ("tipc: make function tipc_net_finalize() thread safe") tries to make it thread safe to set node address, so it uses node_list_lock lock to serialize the whole process of setting node address in tipc_net_finalize(). 
But it causes the following interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- rht_deferred_worker() rhashtable_rehash_table() lock(&(&ht->lock)->rlock) tipc_nl_compat_doit() tipc_net_finalize() local_irq_disable(); lock(&(&tn->node_list_lock)->rlock); tipc_sk_reinit() rhashtable_walk_enter() lock(&(&ht->lock)->rlock); tipc_disc_rcv() tipc_node_check_dest() tipc_node_create() lock(&(&tn->node_list_lock)->rlock); *** DEADLOCK *** When rhashtable_rehash_table() holds ht->lock on CPU0, it doesn't disable BH. So if an interrupt happens after the lock, it can create an inverse lock ordering between ht->lock and tn->node_list_lock. As a consequence, deadlock might happen. The reason causing the inverse lock ordering scenario above is because the initial purpose of node_list_lock is not designed to do the serialization of node address setting. As cmpxchg() can guarantee CAS (compare-and-swap) process is atomic, we use it to replace node_list_lock to ensure setting node address can be atomically finished. It turns out the potential deadlock can be avoided as well. Fixes: 9faa89d4ed9d ("tipc: make function tipc_net_finalize() thread safe") Signed-off-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/net.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/tipc/net.c b/net/tipc/net.c index a7f6964c3a4b..62199cf5a56c 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -123,15 +123,13 @@ void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); - spin_lock_bh(&tn->node_list_lock); - if (!tipc_own_addr(net)) { + if (!cmpxchg(&tn->node_addr, 0, addr)) { tipc_set_node_addr(net, addr); tipc_named_reinit(net); tipc_sk_reinit(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, TIPC_CLUSTER_SCOPE, 0, addr); } - spin_unlock_bh(&tn->node_list_lock); } void tipc_net_stop(struct net *net) From 61ef4b07fcdc30535889990cf4229766502561cf Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 7 Aug 2018 20:03:57 +0300 Subject: [PATCH 11/23] dccp: fix undefined behavior with 'cwnd' shift in ccid2_cwnd_restart() The shift of 'cwnd' with '(now - hc->tx_lsndtime) / hc->tx_rto' value can lead to undefined behavior [1]. In order to fix this use a gradual shift of the window with a 'while' loop, similar to what tcp_cwnd_restart() is doing. When comparing delta and RTO there is a minor difference between TCP and DCCP: the latter also invokes ccid2_cwnd_restart() and reduces 'cwnd' if delta equals RTO. That case is preserved in this change. [1]: [40850.963623] UBSAN: Undefined behaviour in net/dccp/ccids/ccid2.c:237:7 [40851.043858] shift exponent 67 is too large for 32-bit type 'unsigned int' [40851.127163] CPU: 3 PID: 15940 Comm: netstress Tainted: G W E 4.18.0-rc7.x86_64 #1 ... [40851.377176] Call Trace: [40851.408503] dump_stack+0xf1/0x17b [40851.451331] ? show_regs_print_info+0x5/0x5 [40851.503555] ubsan_epilogue+0x9/0x7c [40851.548363] __ubsan_handle_shift_out_of_bounds+0x25b/0x2b4 [40851.617109] ? __ubsan_handle_load_invalid_value+0x18f/0x18f [40851.686796] ? xfrm4_output_finish+0x80/0x80 [40851.739827] ? lock_downgrade+0x6d0/0x6d0 [40851.789744] ? xfrm4_prepare_output+0x160/0x160 [40851.845912] ? 
ip_queue_xmit+0x810/0x1db0 [40851.895845] ? ccid2_hc_tx_packet_sent+0xd36/0x10a0 [dccp] [40851.963530] ccid2_hc_tx_packet_sent+0xd36/0x10a0 [dccp] [40852.029063] dccp_xmit_packet+0x1d3/0x720 [dccp] [40852.086254] dccp_write_xmit+0x116/0x1d0 [dccp] [40852.142412] dccp_sendmsg+0x428/0xb20 [dccp] [40852.195454] ? inet_dccp_listen+0x200/0x200 [dccp] [40852.254833] ? sched_clock+0x5/0x10 [40852.298508] ? sched_clock+0x5/0x10 [40852.342194] ? inet_create+0xdf0/0xdf0 [40852.388988] sock_sendmsg+0xd9/0x160 ... Fixes: 113ced1f52e5 ("dccp ccid-2: Perform congestion-window validation") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 2b75df469220..842a9c7c73a3 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -229,14 +229,16 @@ static void ccid2_cwnd_restart(struct sock *sk, const u32 now) struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); u32 cwnd = hc->tx_cwnd, restart_cwnd, iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); + s32 delta = now - hc->tx_lsndtime; hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); /* don't reduce cwnd below the initial window (IW) */ restart_cwnd = min(cwnd, iwnd); - cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto; - hc->tx_cwnd = max(cwnd, restart_cwnd); + while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd) + cwnd >>= 1; + hc->tx_cwnd = max(cwnd, restart_cwnd); hc->tx_cwnd_stamp = now; hc->tx_cwnd_used = 0; From 0dcb82254d65f72333aa50ad626d1e9665ad093b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 7 Aug 2018 12:41:38 -0700 Subject: [PATCH 12/23] llc: use refcount_inc_not_zero() for llc_sap_find() llc_sap_put() decreases the refcnt before deleting sap from the global list. Therefore, there is a chance llc_sap_find() could find a sap with zero refcnt in this global list. 
Close this race condition by checking whether refcnt is zero in llc_sap_find(); if it is zero, the sap is being removed, so we can just treat it as gone. Reported-by: Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/llc.h | 5 +++++ net/llc/llc_core.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/llc.h b/include/net/llc.h index dc35f25eb679..890a87318014 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -116,6 +116,11 @@ static inline void llc_sap_hold(struct llc_sap *sap) refcount_inc(&sap->refcnt); } +static inline bool llc_sap_hold_safe(struct llc_sap *sap) +{ + return refcount_inc_not_zero(&sap->refcnt); +} + void llc_sap_close(struct llc_sap *sap); static inline void llc_sap_put(struct llc_sap *sap) diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 89041260784c..260b3dc1b4a2 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -73,8 +73,8 @@ struct llc_sap *llc_sap_find(unsigned char sap_value) rcu_read_lock_bh(); sap = __llc_sap_find(sap_value); - if (sap) - llc_sap_hold(sap); + if (!sap || !llc_sap_hold_safe(sap)) + sap = NULL; rcu_read_unlock_bh(); return sap; } From b13f9c6364373a1b9f71e9846dc4fb199296f926 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 8 Aug 2018 11:43:04 +0800 Subject: [PATCH 13/23] vhost: reset metadata cache when initializing new IOTLB We need to reset the metadata cache during new IOTLB initialization; otherwise stale pointers to the previous IOTLB may still be accessed, which will lead to a use-after-free. Reported-by: syzbot+c51e6736a1bf614b3272@syzkaller.appspotmail.com Fixes: f88949138058 ("vhost: introduce O(1) vq metadata cache") Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/vhost/vhost.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index a502f1af4a21..ed3114556fda 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1560,9 +1560,12 @@ int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) d->iotlb = niotlb; for (i = 0; i < d->nvqs; ++i) { - mutex_lock(&d->vqs[i]->mutex); - d->vqs[i]->iotlb = niotlb; - mutex_unlock(&d->vqs[i]->mutex); + struct vhost_virtqueue *vq = d->vqs[i]; + + mutex_lock(&vq->mutex); + vq->iotlb = niotlb; + __vhost_vq_meta_reset(vq); + mutex_unlock(&vq->mutex); } vhost_umem_clean(oiotlb); From 66509a276c8c1d19ee3f661a41b418d101c57d29 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 28 Jul 2018 11:47:17 +0200 Subject: [PATCH 14/23] parisc: Enable CONFIG_MLONGCALLS by default Enable the -mlong-calls compiler option by default, because otherwise in most cases linking the vmlinux binary fails due to truncations of R_PARISC_PCREL22F relocations. This fixes building the 64-bit defconfig. Cc: stable@vger.kernel.org # 4.0+ Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 17526bebcbd2..46f656b8fc23 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -195,7 +195,7 @@ config PREFETCH config MLONGCALLS bool "Enable the -mlong-calls compiler option for big kernels" - def_bool y if (!MODULES) + default y depends on PA8X00 help If you configure the kernel to include many drivers built-in instead From fedb8da96355f5f64353625bf96dc69423ad1826 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sun, 5 Aug 2018 13:30:31 -0400 Subject: [PATCH 15/23] parisc: Define mb() and add memory barriers to assembler unlock sequences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For years I thought all parisc machines executed loads and stores in order. 
However, Jeff Law recently indicated on gcc-patches that this is not correct. There are various degrees of out-of-order execution all the way back to the PA7xxx processor series (hit-under-miss). The PA8xxx series has full out-of-order execution for both integer operations, and loads and stores. This is described in the following article: http://web.archive.org/web/20040214092531/http://www.cpus.hp.com/technical_references/advperf.shtml For this reason, we need to define mb() and to insert a memory barrier before the store unlocking spinlocks. This ensures that all memory accesses are complete prior to unlocking. The ldcw instruction performs the same function on entry. Signed-off-by: John David Anglin Cc: stable@vger.kernel.org # 4.0+ Signed-off-by: Helge Deller --- arch/parisc/include/asm/barrier.h | 32 +++++++++++++++++++++++++++++++ arch/parisc/kernel/entry.S | 2 ++ arch/parisc/kernel/pacache.S | 1 + arch/parisc/kernel/syscall.S | 4 ++++ 4 files changed, 39 insertions(+) create mode 100644 arch/parisc/include/asm/barrier.h diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h new file mode 100644 index 000000000000..dbaaca84f27f --- /dev/null +++ b/arch/parisc/include/asm/barrier.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +#ifndef __ASSEMBLY__ + +/* The synchronize caches instruction executes as a nop on systems in + which all memory references are performed in order. 
*/ +#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory") + +#if defined(CONFIG_SMP) +#define mb() do { synchronize_caches(); } while (0) +#define rmb() mb() +#define wmb() mb() +#define dma_rmb() mb() +#define dma_wmb() mb() +#else +#define mb() barrier() +#define rmb() barrier() +#define wmb() barrier() +#define dma_rmb() barrier() +#define dma_wmb() barrier() +#endif + +#define __smp_mb() mb() +#define __smp_rmb() mb() +#define __smp_wmb() mb() + +#include + +#endif /* !__ASSEMBLY__ */ +#endif /* __ASM_BARRIER_H */ diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index e95207c0565e..1b4732e20137 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -481,6 +481,8 @@ /* Release pa_tlb_lock lock without reloading lock address. */ .macro tlb_unlock0 spc,tmp #ifdef CONFIG_SMP + or,COND(=) %r0,\spc,%r0 + sync or,COND(=) %r0,\spc,%r0 stw \spc,0(\tmp) #endif diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index 22e6374ece44..97451e67d35b 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -353,6 +353,7 @@ ENDPROC_CFI(flush_data_cache_local) .macro tlb_unlock la,flags,tmp #ifdef CONFIG_SMP ldi 1,\tmp + sync stw \tmp,0(\la) mtsm \flags #endif diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index e775f80ae28c..4886a6db42e9 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -633,6 +633,7 @@ cas_action: sub,<> %r28, %r25, %r0 2: stw,ma %r24, 0(%r26) /* Free lock */ + sync stw,ma %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG /* Clear thread register indicator */ @@ -647,6 +648,7 @@ cas_action: 3: /* Error occurred on load or store */ /* Free lock */ + sync stw %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG stw %r0, 4(%sr2,%r20) @@ -848,6 +850,7 @@ cas2_action: cas2_end: /* Free lock */ + sync stw,ma %r20, 0(%sr2,%r20) /* Enable interrupts */ ssm PSW_SM_I, %r0 @@ -858,6 +861,7 @@ cas2_end: 22: /* Error occurred on load or 
store */ /* Free lock */ + sync stw %r20, 0(%sr2,%r20) ssm PSW_SM_I, %r0 ldo 1(%r0),%r28 From 816f670623692b5da2787f278cbfdb331ed29b8a Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 8 Aug 2018 15:48:07 -0700 Subject: [PATCH 16/23] net/mlx5e: Properly check if hairpin is possible between two functions The current check relies on function BDF addresses and can get us wrong e.g when two VFs are assigned into a VM and the PCI v-address is set by the hypervisor. Fixes: 5c65c564c962 ('net/mlx5e: Support offloading TC NIC hairpin flows') Signed-off-by: Or Gerlitz Reported-by: Alaa Hleihel Tested-by: Alaa Hleihel Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3a2c4e548226..dfbcda0d0e08 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1970,15 +1970,15 @@ static bool actions_match_supported(struct mlx5e_priv *priv, static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv) { struct mlx5_core_dev *fmdev, *pmdev; - u16 func_id, peer_id; + u64 fsystem_guid, psystem_guid; fmdev = priv->mdev; pmdev = peer_priv->mdev; - func_id = (u16)((fmdev->pdev->bus->number << 8) | PCI_SLOT(fmdev->pdev->devfn)); - peer_id = (u16)((pmdev->pdev->bus->number << 8) | PCI_SLOT(pmdev->pdev->devfn)); + mlx5_query_nic_vport_system_image_guid(fmdev, &fsystem_guid); + mlx5_query_nic_vport_system_image_guid(pmdev, &psystem_guid); - return (func_id == peer_id); + return (fsystem_guid == psystem_guid); } static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, From f280c6a1e548cd3223a56bb480454ffb96050e87 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 8 Aug 2018 15:48:08 -0700 Subject: [PATCH 17/23] net/mlx5e: Cleanup of dcbnl related fields 
Remove unused netdev_registered_init/remove in en.h. Return -EOPNOTSUPP if the MLX5_DSCP_SUPPORTED check fails. Remove extra white space. Fixes: 2a5e7a1344f4 ("net/mlx5e: Add dcbnl dscp to priority support") Signed-off-by: Huy Nguyen Cc: Yuval Shaia Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 -- .../ethernet/mellanox/mlx5/core/en_dcbnl.c | 30 +++++++------------ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index eb9eb7aa953a..405236cf0b04 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -858,8 +858,6 @@ struct mlx5e_profile { mlx5e_fp_handle_rx_cqe handle_rx_cqe; mlx5e_fp_handle_rx_cqe handle_rx_cqe_mpwqe; } rx_handlers; - void (*netdev_registered_init)(struct mlx5e_priv *priv); - void (*netdev_registered_remove)(struct mlx5e_priv *priv); int max_tc; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index e33afa8d2417..722998d68564 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -443,16 +443,12 @@ static int mlx5e_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) bool is_new; int err; - if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP) - return -EINVAL; + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager) || + !MLX5_DSCP_SUPPORTED(priv->mdev)) + return -EOPNOTSUPP; - if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager)) - return -EINVAL; - - if (!MLX5_DSCP_SUPPORTED(priv->mdev)) - return -EINVAL; - - if (app->protocol >= MLX5E_MAX_DSCP) + if ((app->selector != IEEE_8021QAZ_APP_SEL_DSCP) || + (app->protocol >= MLX5E_MAX_DSCP)) return -EINVAL; /* Save the old entry info */ @@ -500,16 +496,12 @@ static int mlx5e_dcbnl_ieee_delapp(struct net_device *dev, 
struct dcb_app *app) struct mlx5e_priv *priv = netdev_priv(dev); int err; - if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP) - return -EINVAL; + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager) || + !MLX5_DSCP_SUPPORTED(priv->mdev)) + return -EOPNOTSUPP; - if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager)) - return -EINVAL; - - if (!MLX5_DSCP_SUPPORTED(priv->mdev)) - return -EINVAL; - - if (app->protocol >= MLX5E_MAX_DSCP) + if ((app->selector != IEEE_8021QAZ_APP_SEL_DSCP) || + (app->protocol >= MLX5E_MAX_DSCP)) return -EINVAL; /* Skip if no dscp app entry */ @@ -1146,7 +1138,7 @@ static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state) { int err; - err = mlx5_set_trust_state(priv->mdev, trust_state); + err = mlx5_set_trust_state(priv->mdev, trust_state); if (err) return err; priv->dcbx_dp.trust_state = trust_state; From 330bdcfadceea5e9a1526d731711e163f9a90975 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Aug 2018 11:30:02 +0100 Subject: [PATCH 18/23] rxrpc: Fix the keepalive generator [ver #2] AF_RXRPC has a keepalive message generator that generates a message for a peer ~20s after the last transmission to that peer to keep firewall ports open. The implementation is incorrect in the following ways: (1) It mixes up ktime_t and time64_t types. (2) It uses ktime_get_real(), the output of which may jump forward or backward due to adjustments to the time of day. (3) If the current time jumps forward too much or jumps backwards, the generator function will crank the base of the time ring round one slot at a time (ie. a 1s period) until it catches up, spewing out VERSION packets as it goes. Fix the problem by: (1) Only using time64_t. There's no need for sub-second resolution. (2) Use ktime_get_seconds() rather than ktime_get_real() so that time isn't perceived to go backwards. (3) Simplifying rxrpc_peer_keepalive_worker() by splitting it into two parts: (a) The "worker" function that manages the buckets and the timer. 
(b) The "dispatch" function that takes the pending peers and potentially transmits a keepalive packet before putting them back in the ring into the slot appropriate to the revised last-Tx time. (4) Taking everything that's pending out of the ring and splicing it into a temporary collector list for processing. In the case that there's been a significant jump forward, the ring gets entirely emptied and then the time base can be warped forward before the peers are processed. The warping can't happen if the ring isn't empty because the slot a peer is in is keepalive-time dependent, relative to the base time. (5) Limit the number of iterations of the bucket array when scanning it. (6) Set the timer to skip any empty slots as there's no point waking up if there's nothing to do yet. This can be triggered by an incoming call from a server after a reboot with AF_RXRPC and AFS built into the kernel causing a peer record to be set up before userspace is started. The system clock is then adjusted by userspace, thereby potentially causing the keepalive generator to have a meltdown - which leads to a message like: watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [kworker/0:1:23] ... Workqueue: krxrpcd rxrpc_peer_keepalive_worker EIP: lock_acquire+0x69/0x80 ... Call Trace: ? rxrpc_peer_keepalive_worker+0x5e/0x350 ? _raw_spin_lock_bh+0x29/0x60 ? rxrpc_peer_keepalive_worker+0x5e/0x350 ? rxrpc_peer_keepalive_worker+0x5e/0x350 ? __lock_acquire+0x3d3/0x870 ? process_one_work+0x110/0x340 ? process_one_work+0x166/0x340 ? process_one_work+0x110/0x340 ? worker_thread+0x39/0x3c0 ? kthread+0xdb/0x110 ? cancel_delayed_work+0x90/0x90 ? kthread_stop+0x70/0x70 ? ret_from_fork+0x19/0x24 Fixes: ace45bec6d77 ("rxrpc: Fix firewall route keepalive") Reported-by: kernel test robot Signed-off-by: David Howells Signed-off-by: David S. 
Miller --- net/rxrpc/ar-internal.h | 8 +- net/rxrpc/conn_event.c | 4 +- net/rxrpc/net_ns.c | 6 +- net/rxrpc/output.c | 12 +-- net/rxrpc/peer_event.c | 166 ++++++++++++++++++++++------------------ net/rxrpc/peer_object.c | 8 +- net/rxrpc/rxkad.c | 4 +- 7 files changed, 114 insertions(+), 94 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5fb7d3254d9e..707630ab4713 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -104,9 +104,9 @@ struct rxrpc_net { #define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */ u8 peer_keepalive_cursor; - ktime_t peer_keepalive_base; - struct hlist_head peer_keepalive[RXRPC_KEEPALIVE_TIME + 1]; - struct hlist_head peer_keepalive_new; + time64_t peer_keepalive_base; + struct list_head peer_keepalive[32]; + struct list_head peer_keepalive_new; struct timer_list peer_keepalive_timer; struct work_struct peer_keepalive_work; }; @@ -295,7 +295,7 @@ struct rxrpc_peer { struct hlist_head error_targets; /* targets for net error distribution */ struct work_struct error_distributor; struct rb_root service_conns; /* Service connections */ - struct hlist_node keepalive_link; /* Link in net->peer_keepalive[] */ + struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 8229a52c2acd..3fde001fcc39 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -136,7 +136,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(conn->debug_id, serial, ret, rxrpc_tx_fail_call_final_resend); @@ -245,7 +245,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection 
*conn, return -EAGAIN; } - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); _leave(" = 0"); return 0; diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 5d6a773db973..417d80867c4f 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -85,12 +85,12 @@ static __net_init int rxrpc_init_net(struct net *net) hash_init(rxnet->peer_hash); spin_lock_init(&rxnet->peer_hash_lock); for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++) - INIT_HLIST_HEAD(&rxnet->peer_keepalive[i]); - INIT_HLIST_HEAD(&rxnet->peer_keepalive_new); + INIT_LIST_HEAD(&rxnet->peer_keepalive[i]); + INIT_LIST_HEAD(&rxnet->peer_keepalive_new); timer_setup(&rxnet->peer_keepalive_timer, rxrpc_peer_keepalive_timeout, 0); INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker); - rxnet->peer_keepalive_base = ktime_add(ktime_get_real(), NSEC_PER_SEC); + rxnet->peer_keepalive_base = ktime_get_seconds(); ret = -ENOMEM; rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f03de1c59ba3..4774c8f5634d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -209,7 +209,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, now = ktime_get_real(); if (ping) call->ping_time = now; - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_fail_call_ack); @@ -296,7 +296,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, sizeof(pkt)); - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_fail_call_abort); @@ -391,7 +391,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, * message and update the peer record */ ret = 
kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); up_read(&conn->params.local->defrag_sem); if (ret < 0) @@ -457,7 +457,7 @@ send_fragmentable: if (ret == 0) { ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); opt = IP_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, SOL_IP, @@ -475,7 +475,7 @@ send_fragmentable: if (ret == 0) { ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); opt = IPV6_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, @@ -599,6 +599,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) trace_rxrpc_tx_fail(peer->debug_id, 0, ret, rxrpc_tx_fail_version_keepalive); - peer->last_tx_at = ktime_get_real(); + peer->last_tx_at = ktime_get_seconds(); _leave(""); } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 0ed8b651cec2..4f9da2f51c69 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -349,6 +349,56 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, usage, avg); } +/* + * Perform keep-alive pings. 
+ */ +static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, + struct list_head *collector, + time64_t base, + u8 cursor) +{ + struct rxrpc_peer *peer; + const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1; + time64_t keepalive_at; + int slot; + + spin_lock_bh(&rxnet->peer_hash_lock); + + while (!list_empty(collector)) { + peer = list_entry(collector->next, + struct rxrpc_peer, keepalive_link); + + list_del_init(&peer->keepalive_link); + if (!rxrpc_get_peer_maybe(peer)) + continue; + + spin_unlock_bh(&rxnet->peer_hash_lock); + + keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; + slot = keepalive_at - base; + _debug("%02x peer %u t=%d {%pISp}", + cursor, peer->debug_id, slot, &peer->srx.transport); + + if (keepalive_at <= base || + keepalive_at > base + RXRPC_KEEPALIVE_TIME) { + rxrpc_send_keepalive(peer); + slot = RXRPC_KEEPALIVE_TIME; + } + + /* A transmission to this peer occurred since last we examined + * it so put it into the appropriate future bucket. + */ + slot += cursor; + slot &= mask; + spin_lock_bh(&rxnet->peer_hash_lock); + list_add_tail(&peer->keepalive_link, + &rxnet->peer_keepalive[slot & mask]); + rxrpc_put_peer(peer); + } + + spin_unlock_bh(&rxnet->peer_hash_lock); +} + /* * Perform keep-alive pings with VERSION packets to keep any NAT alive. 
*/ @@ -356,91 +406,61 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) { struct rxrpc_net *rxnet = container_of(work, struct rxrpc_net, peer_keepalive_work); - struct rxrpc_peer *peer; - unsigned long delay; - ktime_t base, now = ktime_get_real(); - s64 diff; - u8 cursor, slot; + const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1; + time64_t base, now, delay; + u8 cursor, stop; + LIST_HEAD(collector); + now = ktime_get_seconds(); base = rxnet->peer_keepalive_base; cursor = rxnet->peer_keepalive_cursor; + _enter("%lld,%u", base - now, cursor); - _enter("%u,%lld", cursor, ktime_sub(now, base)); + if (!rxnet->live) + return; -next_bucket: - diff = ktime_to_ns(ktime_sub(now, base)); - if (diff < 0) - goto resched; - - _debug("at %u", cursor); - spin_lock_bh(&rxnet->peer_hash_lock); -next_peer: - if (!rxnet->live) { - spin_unlock_bh(&rxnet->peer_hash_lock); - goto out; - } - - /* Everything in the bucket at the cursor is processed this second; the - * bucket at cursor + 1 goes now + 1s and so on... + /* Remove to a temporary list all the peers that are currently lodged + * in expired buckets plus all new peers. + * + * Everything in the bucket at the cursor is processed this + * second; the bucket at cursor + 1 goes at now + 1s and so + * on... 
*/ - if (hlist_empty(&rxnet->peer_keepalive[cursor])) { - if (hlist_empty(&rxnet->peer_keepalive_new)) { - spin_unlock_bh(&rxnet->peer_hash_lock); - goto emptied_bucket; - } + spin_lock_bh(&rxnet->peer_hash_lock); + list_splice_init(&rxnet->peer_keepalive_new, &collector); - hlist_move_list(&rxnet->peer_keepalive_new, - &rxnet->peer_keepalive[cursor]); + stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); + while (base <= now && (s8)(cursor - stop) < 0) { + list_splice_tail_init(&rxnet->peer_keepalive[cursor & mask], + &collector); + base++; + cursor++; } - peer = hlist_entry(rxnet->peer_keepalive[cursor].first, - struct rxrpc_peer, keepalive_link); - hlist_del_init(&peer->keepalive_link); - if (!rxrpc_get_peer_maybe(peer)) - goto next_peer; - + base = now; spin_unlock_bh(&rxnet->peer_hash_lock); - _debug("peer %u {%pISp}", peer->debug_id, &peer->srx.transport); - -recalc: - diff = ktime_divns(ktime_sub(peer->last_tx_at, base), NSEC_PER_SEC); - if (diff < -30 || diff > 30) - goto send; /* LSW of 64-bit time probably wrapped on 32-bit */ - diff += RXRPC_KEEPALIVE_TIME - 1; - if (diff < 0) - goto send; - - slot = (diff > RXRPC_KEEPALIVE_TIME - 1) ? RXRPC_KEEPALIVE_TIME - 1 : diff; - if (slot == 0) - goto send; - - /* A transmission to this peer occurred since last we examined it so - * put it into the appropriate future bucket. 
- */ - slot = (slot + cursor) % ARRAY_SIZE(rxnet->peer_keepalive); - spin_lock_bh(&rxnet->peer_hash_lock); - hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive[slot]); - rxrpc_put_peer(peer); - goto next_peer; - -send: - rxrpc_send_keepalive(peer); - now = ktime_get_real(); - goto recalc; - -emptied_bucket: - cursor++; - if (cursor >= ARRAY_SIZE(rxnet->peer_keepalive)) - cursor = 0; - base = ktime_add_ns(base, NSEC_PER_SEC); - goto next_bucket; - -resched: rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; - delay = nsecs_to_jiffies(-diff) + 1; - timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); -out: + rxrpc_peer_keepalive_dispatch(rxnet, &collector, base, cursor); + ASSERT(list_empty(&collector)); + + /* Schedule the timer for the next occupied timeslot. */ + cursor = rxnet->peer_keepalive_cursor; + stop = cursor + RXRPC_KEEPALIVE_TIME - 1; + for (; (s8)(cursor - stop) < 0; cursor++) { + if (!list_empty(&rxnet->peer_keepalive[cursor & mask])) + break; + base++; + } + + now = ktime_get_seconds(); + delay = base - now; + if (delay < 1) + delay = 1; + delay *= HZ; + if (rxnet->live) + timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); + _leave(""); } diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 1b7e8107b3ae..24ec7cdcf332 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -322,7 +322,7 @@ struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, if (!peer) { peer = prealloc; hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); - hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive_new); + list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); } spin_unlock(&rxnet->peer_hash_lock); @@ -367,8 +367,8 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, if (!peer) { hash_add_rcu(rxnet->peer_hash, &candidate->hash_link, hash_key); - hlist_add_head(&candidate->keepalive_link, - &rxnet->peer_keepalive_new); + 
list_add_tail(&candidate->keepalive_link, + &rxnet->peer_keepalive_new); } spin_unlock_bh(&rxnet->peer_hash_lock); @@ -441,7 +441,7 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); - hlist_del_init(&peer->keepalive_link); + list_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); kfree_rcu(peer, rcu); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 278ac0807a60..47cb019c521a 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -669,7 +669,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) return -EAGAIN; } - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); _leave(" = 0"); return 0; } @@ -725,7 +725,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn, return -EAGAIN; } - conn->params.peer->last_tx_at = ktime_get_real(); + conn->params.peer->last_tx_at = ktime_get_seconds(); _leave(" = 0"); return 0; } From 11ba961c916127651e12af6cad3891f8aeb25aa9 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 8 Aug 2018 14:06:32 +0300 Subject: [PATCH 19/23] net: aquantia: Fix IFF_ALLMULTI flag functionality It was noticed that the NIC always passes all multicast traffic to the host regardless of the IFF_ALLMULTI flag on the interface. The rule in the MC Filter Table in the NIC that is configured to accept any multicast packets is turned on if the IFF_MULTICAST flag is set on the interface. This leads to all multicast traffic being passed to the host. This fix changes the condition for turning on that rule to check the IFF_ALLMULTI flag, as it should. Fixes: b21f502f84be ("net:ethernet:aquantia: Fix for multicast filter handling.") Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. 
Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index 956860a69797..3bdab972420b 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -762,7 +762,7 @@ static int hw_atl_b0_hw_packet_filter_set(struct aq_hw_s *self, hw_atl_rpfl2promiscuous_mode_en_set(self, IS_FILTER_ENABLED(IFF_PROMISC)); hw_atl_rpfl2multicast_flr_en_set(self, - IS_FILTER_ENABLED(IFF_MULTICAST), 0); + IS_FILTER_ENABLED(IFF_ALLMULTI), 0); hw_atl_rpfl2_accept_all_mc_packets_set(self, IS_FILTER_ENABLED(IFF_ALLMULTI)); From caa21e19e08d7a1445116a93f7ab4e187ebbbadb Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 8 Aug 2018 14:13:19 +0200 Subject: [PATCH 20/23] net/smc: no shutdown in state SMC_LISTEN Invoking shutdown for a socket in state SMC_LISTEN does not make sense. Nevertheless programs like syzbot fuzzing the kernel may try to do this. For SMC this means a socket refcounting problem. This patch makes sure a shutdown call for an SMC socket in state SMC_LISTEN simply returns with -ENOTCONN. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 05e4ffe5aabd..1288c7bf40d5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1397,8 +1397,7 @@ static int smc_shutdown(struct socket *sock, int how) lock_sock(sk); rc = -ENOTCONN; - if ((sk->sk_state != SMC_LISTEN) && - (sk->sk_state != SMC_ACTIVE) && + if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && (sk->sk_state != SMC_PEERCLOSEWAIT2) && (sk->sk_state != SMC_APPCLOSEWAIT1) && From bd58c7e0860f54710907903ed6daff699d1fc5b9 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 8 Aug 2018 14:13:20 +0200 Subject: [PATCH 21/23] net/smc: allow sysctl rmem and wmem defaults for servers Without setsockopt SO_SNDBUF and SO_RCVBUF settings, the sysctl defaults net.ipv4.tcp_wmem and net.ipv4.tcp_rmem should be the base for the sizes of the SMC sndbuf and rcvbuf. Any TCP buffer size optimizations for servers should be ignored. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1288c7bf40d5..0ee7721afbe5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1122,6 +1122,8 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); + new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; + new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!schedule_work(&new_smc->smc_listen_work)) sock_put(&new_smc->sk); From 7311d665ca68907b9c43d6d1021f816f9a7bbd57 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 8 Aug 2018 14:13:21 +0200 Subject: [PATCH 22/23] net/smc: move sock lock in smc_ioctl() When an SMC socket is connecting it is decided whether fallback to TCP is needed. 
To avoid races between connect and ioctl, move the sock lock before the use_fallback check. Reported-by: syzbot+5b2cece1a8ecb2ca77d8@syzkaller.appspotmail.com Reported-by: syzbot+19557374321ca3710990@syzkaller.appspotmail.com Fixes: 1992d99882af ("net/smc: take sock lock in smc_ioctl()") Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0ee7721afbe5..e7de5f282722 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1522,12 +1522,16 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, smc = smc_sk(sock->sk); conn = &smc->conn; - if (smc->use_fallback) { - if (!smc->clcsock) - return -EBADF; - return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); - } lock_sock(&smc->sk); + if (smc->use_fallback) { + if (!smc->clcsock) { + release_sock(&smc->sk); + return -EBADF; + } + answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); + release_sock(&smc->sk); + return answ; + } switch (cmd) { case SIOCINQ: /* same as FIONREAD */ if (smc->sk.sk_state == SMC_LISTEN) { From 1be52e97ed3e524f82e25d6e53f48df3c6e85282 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 8 Aug 2018 20:56:40 +0200 Subject: [PATCH 23/23] dsa: slave: eee: Allow ports to use phylink For a port to be able to use EEE, both the MAC and the PHY must support EEE. A PHY can be provided by either a phydev or phylink. Verify that at least one of these exists, not just phydev. Fixes: aab9c4067d23 ("net: dsa: Plug in PHYLINK support") Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 732369c80644..9864bcd3d317 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -639,7 +639,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev) + if (!dev->phydev && !dp->pl) return -ENODEV; if (!ds->ops->set_mac_eee) @@ -659,7 +659,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev) + if (!dev->phydev && !dp->pl) return -ENODEV; if (!ds->ops->get_mac_eee)