From b805d78d300bcf2c83d6df7da0c818b0fee41427 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 Feb 2019 15:18:59 +0800 Subject: [PATCH 001/181] xfrm: policy: Fix out-of-bound array accesses in __xfrm_policy_unlink UBSAN report this: UBSAN: Undefined behaviour in net/xfrm/xfrm_policy.c:1289:24 index 6 is out of range for type 'unsigned int [6]' CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.4.162-514.55.6.9.x86_64+ #13 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 0000000000000000 1466cf39b41b23c9 ffff8801f6b07a58 ffffffff81cb35f4 0000000041b58ab3 ffffffff83230f9c ffffffff81cb34e0 ffff8801f6b07a80 ffff8801f6b07a20 1466cf39b41b23c9 ffffffff851706e0 ffff8801f6b07ae8 Call Trace: [] __dump_stack lib/dump_stack.c:15 [inline] [] dump_stack+0x114/0x1a0 lib/dump_stack.c:51 [] ubsan_epilogue+0x12/0x8f lib/ubsan.c:164 [] __ubsan_handle_out_of_bounds+0x16e/0x1b2 lib/ubsan.c:382 [] __xfrm_policy_unlink+0x3dd/0x5b0 net/xfrm/xfrm_policy.c:1289 [] xfrm_policy_delete+0x52/0xb0 net/xfrm/xfrm_policy.c:1309 [] xfrm_policy_timer+0x30b/0x590 net/xfrm/xfrm_policy.c:243 [] call_timer_fn+0x237/0x990 kernel/time/timer.c:1144 [] __run_timers kernel/time/timer.c:1218 [inline] [] run_timer_softirq+0x6ce/0xb80 kernel/time/timer.c:1401 [] __do_softirq+0x299/0xe10 kernel/softirq.c:273 [] invoke_softirq kernel/softirq.c:350 [inline] [] irq_exit+0x216/0x2c0 kernel/softirq.c:391 [] exiting_irq arch/x86/include/asm/apic.h:652 [inline] [] smp_apic_timer_interrupt+0x8b/0xc0 arch/x86/kernel/apic/apic.c:926 [] apic_timer_interrupt+0xa5/0xb0 arch/x86/entry/entry_64.S:735 [] ? native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:52 [] arch_safe_halt arch/x86/include/asm/paravirt.h:111 [inline] [] default_idle+0x27/0x430 arch/x86/kernel/process.c:446 [] arch_cpu_idle+0x15/0x20 arch/x86/kernel/process.c:437 [] default_idle_call+0x53/0x90 kernel/sched/idle.c:92 [] cpuidle_idle_call kernel/sched/idle.c:156 [inline] [] cpu_idle_loop kernel/sched/idle.c:251 [inline] [] cpu_startup_entry+0x60d/0x9a0 kernel/sched/idle.c:299 [] start_secondary+0x3c9/0x560 arch/x86/kernel/smpboot.c:245 The issue is triggered as this: xfrm_add_policy -->verify_newpolicy_info //check the index provided by user with XFRM_POLICY_MAX //In my case, the index is 0x6E6BB6, so it pass the check. -->xfrm_policy_construct //copy the user's policy and set xfrm_policy_timer -->xfrm_policy_insert --> __xfrm_policy_link //use the orgin dir, in my case is 2 --> xfrm_gen_index //generate policy index, there is 0x6E6BB6 then xfrm_policy_timer be fired xfrm_policy_timer --> xfrm_policy_id2dir //get dir from (policy index & 7), in my case is 6 --> xfrm_policy_delete --> __xfrm_policy_unlink //access policy_count[dir], trigger out of range access Add xfrm_policy_id2dir check in verify_newpolicy_info, make sure the computed dir is valid, to fix the issue. Reported-by: Hulk Robot Fixes: e682adf021be ("xfrm: Try to honor policy index if it's supplied by user") Signed-off-by: YueHaibing Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a131f9ff979e..8d4d52fd457b 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1424,7 +1424,7 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) ret = verify_policy_dir(p->dir); if (ret) return ret; - if (p->index && ((p->index & XFRM_POLICY_MAX) != p->dir)) + if (p->index && (xfrm_policy_id2dir(p->index) != p->dir)) return -EINVAL; return 0; From 6ed69184ed9c43873b8a1ee721e3bf3c08c2c6be Mon Sep 17 00:00:00 2001 From: Myungho Jung Date: Thu, 7 Mar 2019 10:23:08 +0900 Subject: [PATCH 002/181] xfrm: Reset secpath in xfrm failure In esp4_gro_receive() and esp6_gro_receive(), secpath can be allocated without adding xfrm state to xvec. Then, sp->xvec[sp->len - 1] would fail and result in dereferencing invalid pointer in esp4_gso_segment() and esp6_gso_segment(). Reset secpath if xfrm function returns error. Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Reported-by: syzbot+b69368fd933c6c592f4c@syzkaller.appspotmail.com Signed-off-by: Myungho Jung Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 8 +++++--- net/ipv6/esp6_offload.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 8756e0e790d2..d3170a8001b2 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -52,13 +52,13 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, goto out; if (sp->len == XFRM_MAX_DEPTH) - goto out; + goto out_reset; x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ip_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET); if (!x) - goto out; + goto out_reset; sp->xvec[sp->len++] = x; sp->olen++; @@ -66,7 +66,7 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, xo = xfrm_offload(skb); if (!xo) { xfrm_state_put(x); - goto out; + goto out_reset; } } @@ -82,6 +82,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, xfrm_input(skb, IPPROTO_ESP, spi, -2); return ERR_PTR(-EINPROGRESS); +out_reset: + secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d46b4eb645c2..cb99f6fb79b7 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -74,13 +74,13 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, goto out; if (sp->len == XFRM_MAX_DEPTH) - goto out; + goto out_reset; x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET6); if (!x) - goto out; + goto out_reset; sp->xvec[sp->len++] = x; sp->olen++; @@ -88,7 +88,7 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, xo = xfrm_offload(skb); if (!xo) { xfrm_state_put(x); - goto out; + goto out_reset; } } @@ -109,6 +109,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, xfrm_input(skb, IPPROTO_ESP, spi, -2); return ERR_PTR(-EINPROGRESS); +out_reset: + secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; From f10e0010fae8174dc20bdc872bcaa85baa925cb7 Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Wed, 6 Mar 2019 20:54:08 -0500 Subject: [PATCH 003/181] net: xfrm: Add '_rcu' tag for rcu protected pointer in netns_xfrm For rcu protected pointers, we'd better add '__rcu' for them. Once added '__rcu' tag for rcu protected pointer, the sparse tool reports warnings. net/xfrm/xfrm_user.c:1198:39: sparse: expected struct sock *sk net/xfrm/xfrm_user.c:1198:39: sparse: got struct sock [noderef] *nlsk [...] So introduce a new wrapper function of nlmsg_unicast to handle type conversions. This patch also fixes a direct access of a rcu protected socket. Fixes: be33690d8fcf("[XFRM]: Fix aevent related crash") Signed-off-by: Su Yanjun Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 30 +++++++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 59f45b1e9dac..d2a36fb9f92a 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -57,7 +57,7 @@ struct netns_xfrm { struct list_head inexact_bins; - struct sock *nlsk; + struct sock __rcu *nlsk; struct sock *nlsk_stash; u32 sysctl_aevent_etime; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8d4d52fd457b..944589832343 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1071,6 +1071,22 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); } +/* A similar wrapper like xfrm_nlmsg_multicast checking that nlsk is still + * available. + */ +static inline int xfrm_nlmsg_unicast(struct net *net, struct sk_buff *skb, + u32 pid) +{ + struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); + + if (!nlsk) { + kfree_skb(skb); + return -EPIPE; + } + + return nlmsg_unicast(nlsk, skb, pid); +} + static inline unsigned int xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) @@ -1195,7 +1211,7 @@ static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_spdinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); + return xfrm_nlmsg_unicast(net, r_skb, sportid); } static inline unsigned int xfrm_sadinfo_msgsize(void) @@ -1254,7 +1270,7 @@ static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_sadinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); + return xfrm_nlmsg_unicast(net, r_skb, sportid); } static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1274,7 +1290,7 @@ static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, NETLINK_CB(skb).portid); } xfrm_state_put(x); out_noput: @@ -1337,7 +1353,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, NETLINK_CB(skb).portid); out: xfrm_state_put(x); @@ -1903,8 +1919,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, - NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, resp_skb, + NETLINK_CB(skb).portid); } } else { xfrm_audit_policy_delete(xp, err ? 0 : 1, true); @@ -2062,7 +2078,7 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_aevent(r_skb, x, &c); BUG_ON(err < 0); - err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid); + err = xfrm_nlmsg_unicast(net, r_skb, NETLINK_CB(skb).portid); spin_unlock_bh(&x->lock); xfrm_state_put(x); return err; From 6ee02a54ef990a71bf542b6f0a4e3321de9d9c66 Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Thu, 14 Mar 2019 14:59:42 +0800 Subject: [PATCH 004/181] xfrm6_tunnel: Fix potential panic when unloading xfrm6_tunnel module When unloading xfrm6_tunnel module, xfrm6_tunnel_fini directly frees the xfrm6_tunnel_spi_kmem. Maybe someone has gotten the xfrm6_tunnel_spi, so need to wait it. Fixes: 91cc3bb0b04ff("xfrm6_tunnel: RCU conversion") Signed-off-by: Su Yanjun Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_tunnel.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index bc65db782bfb..12cb3aa990af 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -402,6 +402,10 @@ static void __exit xfrm6_tunnel_fini(void) xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); unregister_pernet_subsys(&xfrm6_tunnel_net_ops); + /* Someone maybe has gotten the xfrm6_tunnel_spi. + * So need to wait it. + */ + rcu_barrier(); kmem_cache_destroy(xfrm6_tunnel_spi_kmem); } From bfc01ddff2b0c33de21af436324a669e95ac7e78 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 20 Mar 2019 17:54:44 +0100 Subject: [PATCH 005/181] Revert "net: xfrm: Add '_rcu' tag for rcu protected pointer in netns_xfrm" This reverts commit f10e0010fae8174dc20bdc872bcaa85baa925cb7. This commit was just wrong. It caused a lot of syzbot warnings, so just revert it. Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 30 +++++++----------------------- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index d2a36fb9f92a..59f45b1e9dac 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -57,7 +57,7 @@ struct netns_xfrm { struct list_head inexact_bins; - struct sock __rcu *nlsk; + struct sock *nlsk; struct sock *nlsk_stash; u32 sysctl_aevent_etime; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 944589832343..8d4d52fd457b 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1071,22 +1071,6 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); } -/* A similar wrapper like xfrm_nlmsg_multicast checking that nlsk is still - * available. - */ -static inline int xfrm_nlmsg_unicast(struct net *net, struct sk_buff *skb, - u32 pid) -{ - struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); - - if (!nlsk) { - kfree_skb(skb); - return -EPIPE; - } - - return nlmsg_unicast(nlsk, skb, pid); -} - static inline unsigned int xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) @@ -1211,7 +1195,7 @@ static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_spdinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return xfrm_nlmsg_unicast(net, r_skb, sportid); + return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); } static inline unsigned int xfrm_sadinfo_msgsize(void) @@ -1270,7 +1254,7 @@ static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_sadinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return xfrm_nlmsg_unicast(net, r_skb, sportid); + return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); } static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1290,7 +1274,7 @@ static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = xfrm_nlmsg_unicast(net, resp_skb, NETLINK_CB(skb).portid); + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); } xfrm_state_put(x); out_noput: @@ -1353,7 +1337,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = xfrm_nlmsg_unicast(net, resp_skb, NETLINK_CB(skb).portid); + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); out: xfrm_state_put(x); @@ -1919,8 +1903,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = xfrm_nlmsg_unicast(net, resp_skb, - NETLINK_CB(skb).portid); + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, + NETLINK_CB(skb).portid); } } else { xfrm_audit_policy_delete(xp, err ? 0 : 1, true); @@ -2078,7 +2062,7 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_aevent(r_skb, x, &c); BUG_ON(err < 0); - err = xfrm_nlmsg_unicast(net, r_skb, NETLINK_CB(skb).portid); + err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid); spin_unlock_bh(&x->lock); xfrm_state_put(x); return err; From 5483844c3fc18474de29f5d6733003526e0a9f78 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Tue, 19 Mar 2019 15:39:20 +0000 Subject: [PATCH 006/181] vti4: ipip tunnel deregistration fixes. If tunnel registration failed during module initialization, the module would fail to deregister the IPPROTO_COMP protocol and would attempt to deregister the tunnel. The tunnel was not deregistered during module-exit. Fixes: dd9ee3444014e ("vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel") Signed-off-by: Jeremy Sowden Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 68a21bf75dd0..b6235ca09fa5 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -659,9 +659,9 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); -xfrm_tunnel_failed: xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +xfrm_tunnel_failed: + xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: @@ -676,6 +676,7 @@ pernet_dev_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); From 01ce31c57b3f07c91c9d45bbaf126124cce83a5d Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Tue, 19 Mar 2019 15:39:21 +0000 Subject: [PATCH 007/181] vti4: removed duplicate log message. Removed info log-message if ipip tunnel registration fails during module-initialization: it adds nothing to the error message that is written on all failures. Fixes: dd9ee3444014e ("vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel") Signed-off-by: Jeremy Sowden Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index b6235ca09fa5..35d8346742e2 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -646,10 +646,8 @@ static int __init vti_init(void) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); - if (err < 0) { - pr_info("%s: cant't register tunnel\n",__func__); + if (err < 0) goto xfrm_tunnel_failed; - } msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); From dbb2483b2a46fbaf833cfb5deb5ed9cace9c7399 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 22 Mar 2019 16:26:19 -0700 Subject: [PATCH 008/181] xfrm: clean up xfrm protocol checks In commit 6a53b7593233 ("xfrm: check id proto in validate_tmpl()") I introduced a check for xfrm protocol, but according to Herbert IPSEC_PROTO_ANY should only be used as a wildcard for lookup, so it should be removed from validate_tmpl(). And, IPSEC_PROTO_ANY is expected to only match 3 IPSec-specific protocols, this is why xfrm_state_flush() could still miss IPPROTO_ROUTING, which leads that those entries are left in net->xfrm.state_all before exit net. Fix this by replacing IPSEC_PROTO_ANY with zero. This patch also extracts the check from validate_tmpl() to xfrm_id_proto_valid() and uses it in parse_ipsecrequest(). With this, no other protocols should be added into xfrm. Fixes: 6a53b7593233 ("xfrm: check id proto in validate_tmpl()") Reported-by: syzbot+0bf0519d6e0de15914fe@syzkaller.appspotmail.com Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Cong Wang Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 17 +++++++++++++++++ net/ipv6/xfrm6_tunnel.c | 2 +- net/key/af_key.c | 4 +++- net/xfrm/xfrm_state.c | 2 +- net/xfrm/xfrm_user.c | 14 +------------- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 85386becbaea..902437dfbce7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1404,6 +1404,23 @@ static inline int xfrm_state_kern(const struct xfrm_state *x) return atomic_read(&x->tunnel_users); } +static inline bool xfrm_id_proto_valid(u8 proto) +{ + switch (proto) { + case IPPROTO_AH: + case IPPROTO_ESP: + case IPPROTO_COMP: +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: +#endif + return true; + default: + return false; + } +} + +/* IPSEC_PROTO_ANY only matches 3 IPsec protocols, 0 could match all. */ static inline int xfrm_id_proto_match(u8 proto, u8 userproto) { return (!userproto || proto == userproto || diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 12cb3aa990af..d9e5f6808811 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -345,7 +345,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) unsigned int i; xfrm_flush_gc(); - xfrm_state_flush(net, IPSEC_PROTO_ANY, false, true); + xfrm_state_flush(net, 0, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); diff --git a/net/key/af_key.c b/net/key/af_key.c index 5651c29cb5bd..4af1e1d60b9f 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1951,8 +1951,10 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) if (rq->sadb_x_ipsecrequest_mode == 0) return -EINVAL; + if (!xfrm_id_proto_valid(rq->sadb_x_ipsecrequest_proto)) + return -EINVAL; - t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */ + t->id.proto = rq->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1bb971f46fc6..178baaa037e5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2384,7 +2384,7 @@ void xfrm_state_fini(struct net *net) flush_work(&net->xfrm.state_hash_work); flush_work(&xfrm_state_gc_work); - xfrm_state_flush(net, IPSEC_PROTO_ANY, false, true); + xfrm_state_flush(net, 0, false, true); WARN_ON(!list_empty(&net->xfrm.state_all)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8d4d52fd457b..6916931b1de1 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1513,20 +1513,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) return -EINVAL; } - switch (ut[i].id.proto) { - case IPPROTO_AH: - case IPPROTO_ESP: - case IPPROTO_COMP: -#if IS_ENABLED(CONFIG_IPV6) - case IPPROTO_ROUTING: - case IPPROTO_DSTOPTS: -#endif - case IPSEC_PROTO_ANY: - break; - default: + if (!xfrm_id_proto_valid(ut[i].id.proto)) return -EINVAL; - } - } return 0; From 8dfb4eba4100e7cdd161a8baef2d8d61b7a7e62e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Mar 2019 14:30:00 +0100 Subject: [PATCH 009/181] esp4: add length check for UDP encapsulation esp_output_udp_encap can produce a length that doesn't fit in the 16 bits of a UDP header's length field. In that case, we'll send a fragmented packet whose length is larger than IP_MAX_MTU (resulting in "Oversized IP packet" warnings on receive) and with a bogus UDP length. To prevent this, add a length check to esp_output_udp_encap and return -EMSGSIZE on failure. This seems to be older than git history. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 10e809b296ec..fb065a8937ea 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -226,7 +226,7 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { int encap_type; struct udphdr *uh; @@ -234,6 +234,7 @@ static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, stru __be16 sport, dport; struct xfrm_encap_tmpl *encap = x->encap; struct ip_esp_hdr *esph = esp->esph; + unsigned int len; spin_lock_bh(&x->lock); sport = encap->encap_sport; @@ -241,11 +242,14 @@ static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, stru encap_type = encap->encap_type; spin_unlock_bh(&x->lock); + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len + sizeof(struct iphdr) >= IP_MAX_MTU) + return -EMSGSIZE; + uh = (struct udphdr *)esph; uh->source = sport; uh->dest = dport; - uh->len = htons(skb->len + esp->tailen - - skb_transport_offset(skb)); + uh->len = htons(len); uh->check = 0; switch (encap_type) { @@ -262,6 +266,8 @@ static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, stru *skb_mac_header(skb) = IPPROTO_UDP; esp->esph = esph; + + return 0; } int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) @@ -275,8 +281,12 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * int tailen = esp->tailen; /* this is non-NULL only with UDP Encapsulation */ - if (x->encap) - esp_output_udp_encap(x, skb, esp); + if (x->encap) { + int err = esp_output_udp_encap(x, skb, esp); + + if (err < 0) + return err; + } if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { From 025c65e119bf58b610549ca359c9ecc5dee6a8d2 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 26 Mar 2019 13:20:43 +0100 Subject: [PATCH 010/181] xfrm: Honor original L3 slave device in xfrmi policy lookup If an xfrmi is associated to a vrf layer 3 master device, xfrm_policy_check() fails after traffic decapsulation. The input interface is replaced by the layer 3 master device, and hence xfrmi_decode_session() can't match the xfrmi anymore to satisfy policy checking. Extend ingress xfrmi lookup to honor the original layer 3 slave device, allowing xfrm interfaces to operate within a vrf domain. Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Signed-off-by: Martin Willi Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/xfrm/xfrm_interface.c | 17 ++++++++++++++--- net/xfrm/xfrm_policy.c | 2 +- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 902437dfbce7..c9b0b2b5d672 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -295,7 +295,8 @@ struct xfrm_replay { }; struct xfrm_if_cb { - struct xfrm_if *(*decode_session)(struct sk_buff *skb); + struct xfrm_if *(*decode_session)(struct sk_buff *skb, + unsigned short family); }; void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index dbb3c1945b5c..85fec98676d3 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -70,17 +70,28 @@ static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) return NULL; } -static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb) +static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, + unsigned short family) { struct xfrmi_net *xfrmn; - int ifindex; struct xfrm_if *xi; + int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return NULL; + switch (family) { + case AF_INET6: + ifindex = inet6_sdif(skb); + break; + case AF_INET: + ifindex = inet_sdif(skb); + break; + } + if (!ifindex) + ifindex = skb->dev->ifindex; + xfrmn = net_generic(xs_net(xfrm_input_state(skb)), xfrmi_net_id); - ifindex = skb->dev->ifindex; for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { if (ifindex == xi->dev->ifindex && diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8d1a898d0ba5..a6b58df7a70f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3313,7 +3313,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, ifcb = xfrm_if_get_cb(); if (ifcb) { - xi = ifcb->decode_session(skb); + xi = ifcb->decode_session(skb, family); if (xi) { if_id = xi->p.if_id; net = xi->net; From 8742dc86d0c7a9628117a989c11f04a9b6b898f3 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 26 Feb 2019 07:04:50 +0100 Subject: [PATCH 011/181] xfrm4: Fix uninitialized memory read in _decode_session4 We currently don't reload pointers pointing into skb header after doing pskb_may_pull() in _decode_session4(). So in case pskb_may_pull() changed the pointers, we read from random memory. Fix this by putting all the needed infos on the stack, so that we don't need to access the header pointers after doing pskb_may_pull(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d73a6d6652f6..2b144b92ae46 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -111,7 +111,8 @@ static void _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) { const struct iphdr *iph = ip_hdr(skb); - u8 *xprth = skb_network_header(skb) + iph->ihl * 4; + int ihl = iph->ihl; + u8 *xprth = skb_network_header(skb) + ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; @@ -122,6 +123,11 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) fl4->flowi4_mark = skb->mark; fl4->flowi4_oif = reverse ? skb->skb_iif : oif; + fl4->flowi4_proto = iph->protocol; + fl4->daddr = reverse ? iph->saddr : iph->daddr; + fl4->saddr = reverse ? iph->daddr : iph->saddr; + fl4->flowi4_tos = iph->tos; + if (!ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_UDP: @@ -133,7 +139,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; ports = (__be16 *)xprth; fl4->fl4_sport = ports[!!reverse]; @@ -146,7 +152,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, xprth + 2 - skb->data)) { u8 *icmp; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; icmp = xprth; fl4->fl4_icmp_type = icmp[0]; @@ -159,7 +165,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be32 *ehdr; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; ehdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ehdr[0]; @@ -171,7 +177,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, xprth + 8 - skb->data)) { __be32 *ah_hdr; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; ah_hdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ah_hdr[1]; @@ -183,7 +189,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ipcomp_hdr; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; ipcomp_hdr = (__be16 *)xprth; fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); @@ -196,7 +202,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) __be16 *greflags; __be32 *gre_hdr; - xprth = skb_network_header(skb) + iph->ihl * 4; + xprth = skb_network_header(skb) + ihl * 4; greflags = (__be16 *)xprth; gre_hdr = (__be32 *)xprth; @@ -213,10 +219,6 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) break; } } - fl4->flowi4_proto = iph->protocol; - fl4->daddr = reverse ? iph->saddr : iph->daddr; - fl4->saddr = reverse ? iph->daddr : iph->saddr; - fl4->flowi4_tos = iph->tos; } static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, From 99bd5fcc505d65ea9c60619202f0b2d926eabbe9 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 21 Mar 2019 17:19:37 -0700 Subject: [PATCH 012/181] ARC: PAE40: don't panic and instead turn off hw ioc HSDK currently panics when built for HIGHMEM/ARC_HAS_PAE40 because ioc is enabled with default which doesn't work for the 2 non contiguous memory nodes. So get PAE working by disabling ioc instead. Tested with !PAE40 by forcing @ioc_enable=0 and running the glibc testsuite over ssh Signed-off-by: Vineet Gupta --- arch/arc/mm/cache.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 4135abec3fb0..63e6e6504699 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -113,10 +113,24 @@ static void read_decode_cache_bcr_arcv2(int cpu) } READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); - if (cbcr.c) + if (cbcr.c) { ioc_exists = 1; - else + + /* + * As for today we don't support both IOC and ZONE_HIGHMEM enabled + * simultaneously. This happens because as of today IOC aperture covers + * only ZONE_NORMAL (low mem) and any dma transactions outside this + * region won't be HW coherent. + * If we want to use both IOC and ZONE_HIGHMEM we can use + * bounce_buffer to handle dma transactions to HIGHMEM. + * Also it is possible to modify dma_direct cache ops or increase IOC + * aperture size if we are planning to use HIGHMEM without PAE. + */ + if (IS_ENABLED(CONFIG_HIGHMEM) || is_pae40_enabled()) + ioc_enable = 0; + } else { ioc_enable = 0; + } /* HS 2.0 didn't have AUX_VOL */ if (cpuinfo_arc700[cpu].core.family > 0x51) { @@ -1158,19 +1172,6 @@ noinline void __init arc_ioc_setup(void) if (!ioc_enable) return; - /* - * As for today we don't support both IOC and ZONE_HIGHMEM enabled - * simultaneously. This happens because as of today IOC aperture covers - * only ZONE_NORMAL (low mem) and any dma transactions outside this - * region won't be HW coherent. - * If we want to use both IOC and ZONE_HIGHMEM we can use - * bounce_buffer to handle dma transactions to HIGHMEM. - * Also it is possible to modify dma_direct cache ops or increase IOC - * aperture size if we are planning to use HIGHMEM without PAE. - */ - if (IS_ENABLED(CONFIG_HIGHMEM)) - panic("IOC and HIGHMEM can't be used simultaneously"); - /* Flush + invalidate + disable L1 dcache */ __dc_disable(); From 21cee1bd1594b2af6798ddffa97555b4bc3586e1 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 2 Apr 2019 12:10:44 -0700 Subject: [PATCH 013/181] ARC: [hsdk] Make it easier to add PAE40 region to DTB 1. Bump top level address-cells/size-cells nodes to 2 (to ensure all down stream addresses are 64-bits, unless explicitly specified otherwise (in "soc" bus with all peripherals) 2. "memory" also specified with address/size 2 3. Add a commented reference for PAE40 region beyond 4GB physical address space Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/hsdk.dts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 69bc1c9e8e50..7425bb0f2d1b 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -18,8 +18,8 @@ model = "snps,hsdk"; compatible = "snps,hsdk"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; chosen { bootargs = "earlycon=uart8250,mmio32,0xf0005000,115200n8 console=ttyS0,115200n8 debug print-fatal-signals=1"; @@ -105,7 +105,7 @@ #size-cells = <1>; interrupt-parent = <&idu_intc>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; cgu_rst: reset-controller@8a0 { compatible = "snps,hsdk-reset"; @@ -269,9 +269,10 @@ }; memory@80000000 { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; device_type = "memory"; - reg = <0x80000000 0x40000000>; /* 1 GiB */ + reg = <0x0 0x80000000 0x0 0x40000000>; /* 1 GB lowmem */ + /* 0x1 0x00000000 0x0 0x40000000>; 1 GB highmem */ }; }; From 55c0c4c793b538fb438bcc72481b9dc2f79fe5a9 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Mon, 8 Apr 2019 16:04:38 +0300 Subject: [PATCH 014/181] ARC: memset: fix build with L1_CACHE_SHIFT != 6 In case of 'L1_CACHE_SHIFT != 6' we define dummy assembly macroses PREALLOC_INSTR and PREFETCHW_INSTR without arguments. However we pass arguments to them in code which cause build errors. Fix that. Signed-off-by: Eugeniy Paltsev Cc: [5.0] Signed-off-by: Vineet Gupta --- arch/arc/lib/memset-archs.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S index f230bb7092fd..b3373f5c88e0 100644 --- a/arch/arc/lib/memset-archs.S +++ b/arch/arc/lib/memset-archs.S @@ -30,10 +30,10 @@ #else -.macro PREALLOC_INSTR +.macro PREALLOC_INSTR reg, off .endm -.macro PREFETCHW_INSTR +.macro PREFETCHW_INSTR reg, off .endm #endif From dbe7208c6c4aec083571f2ec742870a0d0edbea3 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Sun, 7 Apr 2019 11:12:48 -0700 Subject: [PATCH 015/181] power: supply: cpcap-battery: Fix division by zero If called fast enough so samples do not increment, we can get division by zero in kernel: __div0 cpcap_battery_cc_raw_div cpcap_battery_get_property power_supply_get_property.part.1 power_supply_get_property power_supply_show_property power_supply_uevent Fixes: 874b2adbed12 ("power: supply: cpcap-battery: Add a battery driver") Signed-off-by: Tony Lindgren Acked-by: Pavel Machek Signed-off-by: Sebastian Reichel --- drivers/power/supply/cpcap-battery.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/power/supply/cpcap-battery.c b/drivers/power/supply/cpcap-battery.c index 08d5037fd052..6887870ba32c 100644 --- a/drivers/power/supply/cpcap-battery.c +++ b/drivers/power/supply/cpcap-battery.c @@ -221,6 +221,9 @@ static int cpcap_battery_cc_raw_div(struct cpcap_battery_ddata *ddata, int avg_current; u32 cc_lsb; + if (!divider) + return 0; + sample &= 0xffffff; /* 24-bits, unsigned */ offset &= 0x7ff; /* 10-bits, signed */ From e9f33a8fee53c2d4bcdeec9a89478b4bf17bfbbc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Apr 2019 14:11:20 +0200 Subject: [PATCH 016/181] mac80211: fix RX STBC override byte order The original patch neglected to take byte order conversions into account, fix that. Fixes: d9bb410888ce ("mac80211: allow overriding HT STBC capabilities") Signed-off-by: Johannes Berg Reviewed-by: Sergey Matyukevich Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index e03c46ac8e4d..c62101857b9b 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -112,8 +112,9 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, IEEE80211_HT_CAP_TX_STBC); /* Allow user to configure RX STBC bits */ - if (ht_capa_mask->cap_info & IEEE80211_HT_CAP_RX_STBC) - ht_cap->cap |= ht_capa->cap_info & IEEE80211_HT_CAP_RX_STBC; + if (ht_capa_mask->cap_info & cpu_to_le16(IEEE80211_HT_CAP_RX_STBC)) + ht_cap->cap |= le16_to_cpu(ht_capa->cap_info) & + IEEE80211_HT_CAP_RX_STBC; /* Allow user to decrease AMPDU factor */ if (ht_capa_mask->ampdu_params_info & From 102bbe34b31c9159e714432afd64458f6f3876d7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 10 Apr 2019 15:47:54 +0800 Subject: [PATCH 017/181] gpio: eic: sprd: Fix incorrect irq type setting for the sync EIC When setting sync EIC as IRQ_TYPE_EDGE_BOTH type, we missed to set the SPRD_EIC_SYNC_INTMODE register to 0, which means detecting edge signals. Thus this patch fixes the issue. Fixes: 25518e024e3a ("gpio: Add Spreadtrum EIC driver support") Cc: Signed-off-by: Baolin Wang Signed-off-by: Linus Walleij --- drivers/gpio/gpio-eic-sprd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-eic-sprd.c b/drivers/gpio/gpio-eic-sprd.c index f0223cee9774..77092268ee95 100644 --- a/drivers/gpio/gpio-eic-sprd.c +++ b/drivers/gpio/gpio-eic-sprd.c @@ -414,6 +414,7 @@ static int sprd_eic_irq_set_type(struct irq_data *data, unsigned int flow_type) irq_set_handler_locked(data, handle_edge_irq); break; case IRQ_TYPE_EDGE_BOTH: + sprd_eic_update(chip, offset, SPRD_EIC_SYNC_INTMODE, 0); sprd_eic_update(chip, offset, SPRD_EIC_SYNC_INTBOTH, 1); irq_set_handler_locked(data, handle_edge_irq); break; From d15d9fd02575ecfada92d42f655940c4f10af842 Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Wed, 20 Feb 2019 07:52:31 +0000 Subject: [PATCH 018/181] drm: bridge: dw-hdmi: Fix overflow workaround for Rockchip SoCs The Rockchip RK3288 SoC (v2.00a) and RK3328/RK3399 SoCs (v2.11a) have also been identified as needing this workaround with a single iteration. Fixes: be41fc55f1aa ("drm: bridge: dw-hdmi: Handle overflow workaround based on device version") Signed-off-by: Jonas Karlman Tested-by: Heiko Stueber Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/AM3PR03MB0966818FAAAE6192FF4ED11AAC7D0@AM3PR03MB0966.eurprd03.prod.outlook.com --- drivers/gpu/drm/bridge/synopsys/dw-hdmi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c index db761329a1e3..b74ccb6a24c2 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c @@ -1800,6 +1800,8 @@ static void dw_hdmi_clear_overflow(struct dw_hdmi *hdmi) * iteration for others. * The Amlogic Meson GX SoCs (v2.01a) have been identified as needing * the workaround with a single iteration. + * The Rockchip RK3288 SoC (v2.00a) and RK3328/RK3399 SoCs (v2.11a) have + * been identified as needing the workaround with a single iteration. */ switch (hdmi->version) { @@ -1808,7 +1810,9 @@ static void dw_hdmi_clear_overflow(struct dw_hdmi *hdmi) break; case 0x131a: case 0x132a: + case 0x200a: case 0x201a: + case 0x211a: case 0x212a: count = 1; break; From 837f74116585dcd235fae1696e1e1471b6bb9e01 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 9 Apr 2019 17:16:59 +0200 Subject: [PATCH 019/181] xfrm: update doc about xfrm[46]_gc_thresh Those entries are not used anymore. CC: Florian Westphal Fixes: 09c7570480f7 ("xfrm: remove flow cache") Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- Documentation/networking/ip-sysctl.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index acdfb5d2bcaa..f1843b132453 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1336,6 +1336,7 @@ tag - INTEGER Default value is 0. xfrm4_gc_thresh - INTEGER + (Obsolete since linux-4.14) The threshold at which we will start garbage collecting for IPv4 destination cache entries. At twice this value the system will refuse new allocations. @@ -1919,6 +1920,7 @@ echo_ignore_all - BOOLEAN Default: 0 xfrm6_gc_thresh - INTEGER + (Obsolete since linux-4.14) The threshold at which we will start garbage collecting for IPv6 destination cache entries. At twice this value the system will refuse new allocations. From d5bc73f34cc97c4b4b9202cc93182c2515076edf Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 10 Apr 2019 15:05:31 -0600 Subject: [PATCH 020/181] PCI: Fix issue with "pci=disable_acs_redir" parameter being ignored In most cases, kmalloc() will not be available early in boot when pci_setup() is called. Thus, the kstrdup() call that was added to fix the __initdata bug with the disable_acs_redir parameter usually returns NULL, so the parameter is discarded and has no effect. To fix this, store the string that's in initdata until an initcall function can allocate the memory appropriately. This way we don't need any additional static memory. Fixes: d2fd6e81912a ("PCI: Fix __initdata issue with "pci=disable_acs_redir" parameter") Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7c1b362f599a..766f5779db92 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6262,8 +6262,7 @@ static int __init pci_setup(char *str) } else if (!strncmp(str, "pcie_scan_all", 13)) { pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS); } else if (!strncmp(str, "disable_acs_redir=", 18)) { - disable_acs_redir_param = - kstrdup(str + 18, GFP_KERNEL); + disable_acs_redir_param = str + 18; } else { printk(KERN_ERR "PCI: Unknown option `%s'\n", str); @@ -6274,3 +6273,19 @@ static int __init pci_setup(char *str) return 0; } early_param("pci", pci_setup); + +/* + * 'disable_acs_redir_param' is initialized in pci_setup(), above, to point + * to data in the __initdata section which will be freed after the init + * sequence is complete. We can't allocate memory in pci_setup() because some + * architectures do not have any memory allocation service available during + * an early_param() call. So we allocate memory and copy the variable here + * before the init section is freed. + */ +static int __init pci_realloc_setup_params(void) +{ + disable_acs_redir_param = kstrdup(disable_acs_redir_param, GFP_KERNEL); + + return 0; +} +pure_initcall(pci_realloc_setup_params); From 5aae7832d1b4ec614996ea0f4fafc4d9855ec0b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 26 Mar 2019 16:49:02 +0200 Subject: [PATCH 021/181] drm/i915: Do not enable FEC without DSC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we enable FEC even when DSC is no used. While that is theoretically valid supposedly there isn't much of a benefit from this. But more importantly we do not account for the FEC link bandwidth overhead (2.4%) in the non-DSC link bandwidth computations. So the code may think we have enough bandwidth when we in fact do not. Cc: stable@vger.kernel.org Cc: Anusha Srivatsa Cc: Manasi Navare Fixes: 240999cf339f ("i915/dp/fec: Add fec_enable to the crtc state.") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20190326144903.6617-1-ville.syrjala@linux.intel.com Reviewed-by: Manasi Navare (cherry picked from commit 6fd3134ae3551d4802a04669c0f39f2f5c56f77d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_dp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 8891f29a8c7f..48da4a969a0a 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -1886,6 +1886,9 @@ static int intel_dp_dsc_compute_config(struct intel_dp *intel_dp, int pipe_bpp; int ret; + pipe_config->fec_enable = !intel_dp_is_edp(intel_dp) && + intel_dp_supports_fec(intel_dp, pipe_config); + if (!intel_dp_supports_dsc(intel_dp, pipe_config)) return -EINVAL; @@ -2116,9 +2119,6 @@ intel_dp_compute_config(struct intel_encoder *encoder, if (adjusted_mode->flags & DRM_MODE_FLAG_DBLCLK) return -EINVAL; - pipe_config->fec_enable = !intel_dp_is_edp(intel_dp) && - intel_dp_supports_fec(intel_dp, pipe_config); - ret = intel_dp_compute_link_config(encoder, pipe_config, conn_state); if (ret < 0) return ret; From f5c58ba18ab8ea2169670ed880e4d31ed772ad10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 11 Apr 2019 19:49:25 +0300 Subject: [PATCH 022/181] drm/i915: Restore correct bxt_ddi_phy_calc_lane_lat_optim_mask() calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are no longer calling bxt_ddi_phy_calc_lane_lat_optim_mask() when intel{hdmi,dp}_compute_config() succeeds, and instead only call it when those fail. This is fallout from the bool->int .compute_config() conversion which failed to invert the return value check before calling bxt_ddi_phy_calc_lane_lat_optim_mask(). Let's just replace it with an early bailout so that it's harder to miss. This restores the correct latency optim setting calculation (which could fix some real failures), and avoids the MISSING_CASE() from bxt_ddi_phy_calc_lane_lat_optim_mask() after intel{hdmi,dp}_compute_config() has failed. Cc: Lyude Paul Fixes: 204474a6b859 ("drm/i915: Pass down rc in intel_encoder->compute_config()") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109373 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20190411164925.28491-1-ville.syrjala@linux.intel.com Reviewed-by: Lyude Paul (cherry picked from commit 7a412b8f60cd57ab7dcb72ab701fde2bf81752eb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_ddi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index ab4e60dfd6a3..98cea1f4b3bf 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -3862,14 +3862,16 @@ static int intel_ddi_compute_config(struct intel_encoder *encoder, ret = intel_hdmi_compute_config(encoder, pipe_config, conn_state); else ret = intel_dp_compute_config(encoder, pipe_config, conn_state); + if (ret) + return ret; - if (IS_GEN9_LP(dev_priv) && ret) + if (IS_GEN9_LP(dev_priv)) pipe_config->lane_lat_optim_mask = bxt_ddi_phy_calc_lane_lat_optim_mask(pipe_config->lane_count); intel_ddi_compute_min_voltage_level(dev_priv, pipe_config); - return ret; + return 0; } From 7c39f7f671d2acc0a1f39ebbbee4303ad499bbfa Mon Sep 17 00:00:00 2001 From: Josh Collier Date: Mon, 15 Apr 2019 11:34:22 -0700 Subject: [PATCH 023/181] IB/rdmavt: Fix frwr memory registration Current implementation was not properly handling frwr memory registrations. This was uncovered by commit 27f26cec761das ("xprtrdma: Plant XID in on-the-wire RDMA offset (FRWR)") in which xprtrdma, which is used for NFS over RDMA, started failing as it was the first ULP to modify the ib_mr iova resulting in the NFS server getting REMOTE ACCESS ERROR when attempting to perform RDMA Writes to the client. The fix is to properly capture the true iova, offset, and length in the call to ib_map_mr_sg, and then update the iova when processing the IB_WR_REG_MEM on the send queue. Fixes: a41081aa5936 ("IB/rdmavt: Add support for ib_map_mr_sg") Cc: stable@vger.kernel.org Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl Signed-off-by: Josh Collier Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/mr.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 728795043496..0bb6e39dd03a 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -608,11 +608,6 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr) if (unlikely(mapped_segs == mr->mr.max_segs)) return -ENOMEM; - if (mr->mr.length == 0) { - mr->mr.user_base = addr; - mr->mr.iova = addr; - } - m = mapped_segs / RVT_SEGSZ; n = mapped_segs % RVT_SEGSZ; mr->mr.map[m]->segs[n].vaddr = (void *)addr; @@ -630,17 +625,24 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr) * @sg_nents: number of entries in sg * @sg_offset: offset in bytes into sg * + * Overwrite rvt_mr length with mr length calculated by ib_sg_to_pages. + * * Return: number of sg elements mapped to the memory region */ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset) { struct rvt_mr *mr = to_imr(ibmr); + int ret; mr->mr.length = 0; mr->mr.page_shift = PAGE_SHIFT; - return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, - rvt_set_page); + ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rvt_set_page); + mr->mr.user_base = ibmr->iova; + mr->mr.iova = ibmr->iova; + mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr; + mr->mr.length = (size_t)ibmr->length; + return ret; } /** @@ -671,6 +673,7 @@ int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, ibmr->rkey = key; mr->mr.lkey = key; mr->mr.access_flags = access; + mr->mr.iova = ibmr->iova; atomic_set(&mr->mr.lkey_invalid, 0); return 0; From c01c348ecdc66085e44912c97368809612231520 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 15 Apr 2019 11:51:38 -0400 Subject: [PATCH 024/181] USB: core: Fix unterminated string returned by usb_string() Some drivers (such as the vub300 MMC driver) expect usb_string() to return a properly NUL-terminated string, even when an error occurs. (In fact, vub300's probe routine doesn't bother to check the return code from usb_string().) When the driver goes on to use an unterminated string, it leads to kernel errors such as stack-out-of-bounds, as found by the syzkaller USB fuzzer. An out-of-range string index argument is not at all unlikely, given that some devices don't provide string descriptors and therefore list 0 as the value for their string indexes. This patch makes usb_string() return a properly terminated empty string along with the -EINVAL error code when an out-of-range index is encountered. And since a USB string index is a single-byte value, indexes >= 256 are just as invalid as values of 0 or below. Signed-off-by: Alan Stern Reported-by: syzbot+b75b85111c10b8d680f1@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 82239f27c4cc..e844bb7b5676 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -820,9 +820,11 @@ int usb_string(struct usb_device *dev, int index, char *buf, size_t size) if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; - if (size <= 0 || !buf || !index) + if (size <= 0 || !buf) return -EINVAL; buf[0] = 0; + if (index <= 0 || index >= 256) + return -EINVAL; tbuf = kmalloc(256, GFP_NOIO); if (!tbuf) return -ENOMEM; From 8adddf349fda0d3de2f6bb41ddf838cbf36a8ad2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 16 Apr 2019 23:59:02 +1000 Subject: [PATCH 025/181] powerpc/mm/radix: Make Radix require HUGETLB_PAGE Joel reported weird crashes using skiroot_defconfig, in his case we jumped into an NX page: kernel tried to execute exec-protected page (c000000002bff4f0) - exploit attempt? (uid: 0) BUG: Unable to handle kernel instruction fetch Faulting instruction address: 0xc000000002bff4f0 Looking at the disassembly, we had simply branched to that address: c000000000c001bc 49fff335 bl c000000002bff4f0 But that didn't match the original kernel image: c000000000c001bc 4bfff335 bl c000000000bff4f0 When STRICT_KERNEL_RWX is enabled, and we're using the radix MMU, we call radix__change_memory_range() late in boot to change page protections. We do that both to mark rodata read only and also to mark init text no-execute. That involves walking the kernel page tables, and clearing _PAGE_WRITE or _PAGE_EXEC respectively. With radix we may use hugepages for the linear mapping, so the code in radix__change_memory_range() uses eg. pmd_huge() to test if it has found a huge mapping, and if so it stops the page table walk and changes the PMD permissions. However if the kernel is built without HUGETLBFS support, pmd_huge() is just a #define that always returns 0. That causes the code in radix__change_memory_range() to incorrectly interpret the PMD value as a pointer to a PTE page rather than as a PTE at the PMD level. We can see this using `dv` in xmon which also uses pmd_huge(): 0:mon> dv c000000000000000 pgd @ 0xc000000001740000 pgdp @ 0xc000000001740000 = 0x80000000ffffb009 pudp @ 0xc0000000ffffb000 = 0x80000000ffffa009 pmdp @ 0xc0000000ffffa000 = 0xc00000000000018f <- this is a PTE ptep @ 0xc000000000000100 = 0xa64bb17da64ab07d <- kernel text The end result is we treat the value at 0xc000000000000100 as a PTE and clear _PAGE_WRITE or _PAGE_EXEC, potentially corrupting the code at that address. In Joel's specific case we cleared the sign bit in the offset of the branch, causing a backward branch to turn into a forward branch which caused us to branch into a non-executable page. However the exact nature of the crash depends on kernel version, compiler version, and other factors. We need to fix radix__change_memory_range() to not use accessors that depend on HUGETLBFS, but we also have radix memory hotplug code that uses pmd_huge() etc that will also need fixing. So for now just disallow the broken combination of Radix with HUGETLBFS disabled. The only defconfig we have that is affected is skiroot_defconfig, so turn on HUGETLBFS there so that it still gets Radix. Fixes: 566ca99af026 ("powerpc/mm/radix: Add dummy radix_enabled()") Cc: stable@vger.kernel.org # v4.7+ Reported-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/configs/skiroot_defconfig | 1 + arch/powerpc/platforms/Kconfig.cputype | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 5ba131c30f6b..1bcd468ab422 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -266,6 +266,7 @@ CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_PROC_KCORE=y +CONFIG_HUGETLBFS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_NLS=y diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 842b2c7e156a..50cd09b4e05d 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -324,7 +324,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" - depends on PPC_BOOK3S_64 + depends on PPC_BOOK3S_64 && HUGETLB_PAGE select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA default y help From eb9d7a62c38628ab0ba6e59d22d7cb7930e415d1 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 3 Apr 2019 15:12:32 +1100 Subject: [PATCH 026/181] powerpc/mm_iommu: Fix potential deadlock Currently mm_iommu_do_alloc() is called in 2 cases: - VFIO_IOMMU_SPAPR_REGISTER_MEMORY ioctl() for normal memory: this locks &mem_list_mutex and then locks mm::mmap_sem several times when adjusting locked_vm or pinning pages; - vfio_pci_nvgpu_regops::mmap() for GPU memory: this is called with mm::mmap_sem held already and it locks &mem_list_mutex. So one can craft a userspace program to do special ioctl and mmap in 2 threads concurrently and cause a deadlock which lockdep warns about (below). We did not hit this yet because QEMU constructs the machine in a single thread. This moves the overlap check next to where the new entry is added and reduces the amount of time spent with &mem_list_mutex held. This moves locked_vm adjustment from under &mem_list_mutex. This relies on mm_iommu_adjust_locked_vm() doing nothing when entries==0. This is one of the lockdep warnings: ====================================================== WARNING: possible circular locking dependency detected 5.1.0-rc2-le_nv2_aikATfstn1-p1 #363 Not tainted ------------------------------------------------------ qemu-system-ppc/8038 is trying to acquire lock: 000000002ec6c453 (mem_list_mutex){+.+.}, at: mm_iommu_do_alloc+0x70/0x490 but task is already holding lock: 00000000fd7da97f (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0xf0/0x160 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&mm->mmap_sem){++++}: lock_acquire+0xf8/0x260 down_write+0x44/0xa0 mm_iommu_adjust_locked_vm.part.1+0x4c/0x190 mm_iommu_do_alloc+0x310/0x490 tce_iommu_ioctl.part.9+0xb84/0x1150 [vfio_iommu_spapr_tce] vfio_fops_unl_ioctl+0x94/0x430 [vfio] do_vfs_ioctl+0xe4/0x930 ksys_ioctl+0xc4/0x110 sys_ioctl+0x28/0x80 system_call+0x5c/0x70 -> #0 (mem_list_mutex){+.+.}: __lock_acquire+0x1484/0x1900 lock_acquire+0xf8/0x260 __mutex_lock+0x88/0xa70 mm_iommu_do_alloc+0x70/0x490 vfio_pci_nvgpu_mmap+0xc0/0x130 [vfio_pci] vfio_pci_mmap+0x198/0x2a0 [vfio_pci] vfio_device_fops_mmap+0x44/0x70 [vfio] mmap_region+0x5d4/0x770 do_mmap+0x42c/0x650 vm_mmap_pgoff+0x124/0x160 ksys_mmap_pgoff+0xdc/0x2f0 sys_mmap+0x40/0x80 system_call+0x5c/0x70 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(mem_list_mutex); lock(&mm->mmap_sem); lock(mem_list_mutex); *** DEADLOCK *** 1 lock held by qemu-system-ppc/8038: #0: 00000000fd7da97f (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0xf0/0x160 Fixes: c10c21efa4bc ("powerpc/vfio/iommu/kvm: Do not pin device memory", 2018-12-19) Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mmu_context_iommu.c | 77 +++++++++++++++-------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index e7a9c4f6bfca..9d9be850f8c2 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -95,28 +95,14 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, unsigned long entries, unsigned long dev_hpa, struct mm_iommu_table_group_mem_t **pmem) { - struct mm_iommu_table_group_mem_t *mem; - long i, ret, locked_entries = 0; + struct mm_iommu_table_group_mem_t *mem, *mem2; + long i, ret, locked_entries = 0, pinned = 0; unsigned int pageshift; - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, - next) { - /* Overlap? */ - if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && - (ua < (mem->ua + - (mem->entries << PAGE_SHIFT)))) { - ret = -EINVAL; - goto unlock_exit; - } - - } - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { ret = mm_iommu_adjust_locked_vm(mm, entries, true); if (ret) - goto unlock_exit; + return ret; locked_entries = entries; } @@ -150,15 +136,10 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, down_read(&mm->mmap_sem); ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); up_read(&mm->mmap_sem); + pinned = ret > 0 ? ret : 0; if (ret != entries) { - /* free the reference taken */ - for (i = 0; i < ret; i++) - put_page(mem->hpages[i]); - - vfree(mem->hpas); - kfree(mem); ret = -EFAULT; - goto unlock_exit; + goto free_exit; } pageshift = PAGE_SHIFT; @@ -183,21 +164,43 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } good_exit: - ret = 0; atomic64_set(&mem->mapped, 1); mem->used = 1; mem->ua = ua; mem->entries = entries; - *pmem = mem; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) { + /* Overlap? */ + if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem2->ua + + (mem2->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + mutex_unlock(&mem_list_mutex); + goto free_exit; + } + } list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); -unlock_exit: - if (locked_entries && ret) - mm_iommu_adjust_locked_vm(mm, locked_entries, false); - mutex_unlock(&mem_list_mutex); + *pmem = mem; + + return 0; + +free_exit: + /* free the reference taken */ + for (i = 0; i < pinned; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + +unlock_exit: + mm_iommu_adjust_locked_vm(mm, locked_entries, false); + return ret; } @@ -266,7 +269,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) { long ret = 0; - unsigned long entries, dev_hpa; + unsigned long unlock_entries = 0; mutex_lock(&mem_list_mutex); @@ -287,17 +290,17 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) goto unlock_exit; } - /* @mapped became 0 so now mappings are disabled, release the region */ - entries = mem->entries; - dev_hpa = mem->dev_hpa; - mm_iommu_release(mem); + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + unlock_entries = mem->entries; - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - mm_iommu_adjust_locked_vm(mm, entries, false); + /* @mapped became 0 so now mappings are disabled, release the region */ + mm_iommu_release(mem); unlock_exit: mutex_unlock(&mem_list_mutex); + mm_iommu_adjust_locked_vm(mm, unlock_entries, false); + return ret; } EXPORT_SYMBOL_GPL(mm_iommu_put); From 7a3a4d763837d3aa654cd1059030950410c04d77 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 3 Apr 2019 15:12:33 +1100 Subject: [PATCH 027/181] powerpc/mm_iommu: Allow pinning large regions When called with vmas_arg==NULL, get_user_pages_longterm() allocates an array of nr_pages*8 which can easily get greater that the max order, for example, registering memory for a 256GB guest does this and fails in __alloc_pages_nodemask(). This adds a loop over chunks of entries to fit the max order limit. Fixes: 678e174c4c16 ("powerpc/mm/iommu: allow migration of cma allocated pages during mm_iommu_do_alloc", 2019-03-05) Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mmu_context_iommu.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 9d9be850f8c2..8330f135294f 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -98,6 +98,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, struct mm_iommu_table_group_mem_t *mem, *mem2; long i, ret, locked_entries = 0, pinned = 0; unsigned int pageshift; + unsigned long entry, chunk; if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { ret = mm_iommu_adjust_locked_vm(mm, entries, true); @@ -134,11 +135,26 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + sizeof(struct vm_area_struct *); + chunk = min(chunk, entries); + for (entry = 0; entry < entries; entry += chunk) { + unsigned long n = min(entries - entry, chunk); + + ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE, mem->hpages + entry, NULL); + if (ret == n) { + pinned += n; + continue; + } + if (ret > 0) + pinned += ret; + break; + } up_read(&mm->mmap_sem); - pinned = ret > 0 ? ret : 0; - if (ret != entries) { - ret = -EFAULT; + if (pinned != entries) { + if (!ret) + ret = -EFAULT; goto free_exit; } From 7249c8ea227a582c14f63e9e8853eb7369122f10 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Wed, 10 Apr 2019 10:59:45 +0300 Subject: [PATCH 028/181] IB/mlx5: Fix scatter to CQE in DCT QP creation When scatter to CQE is enabled on a DCT QP it corrupts the mailbox command since it tried to treat it as as QP create mailbox command instead of a DCT create command. The corrupted mailbox command causes userspace to malfunction as the device doesn't create the QP as expected. A new mlx5 capability is exposed to user-space which ensures that it will not enable the feature on DCT without this fix in the kernel. Fixes: 5d6ff1babe78 ("IB/mlx5: Support scatter to CQE for DC transport type") Signed-off-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 ++ drivers/infiniband/hw/mlx5/qp.c | 11 +++++++---- include/uapi/rdma/mlx5-abi.h | 1 + 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 531ff20b32ad..241d9ead79eb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1119,6 +1119,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, qp_packet_based)) resp.flags |= MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE; + + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; } if (field_avail(typeof(resp), sw_parsing_caps, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7cd006da1dae..8870c350fda0 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1818,13 +1818,16 @@ static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr, rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq); - if (rcqe_sz == 128) { - MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + if (init_attr->qp_type == MLX5_IB_QPT_DCT) { + if (rcqe_sz == 128) + MLX5_SET(dctc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + return; } - if (init_attr->qp_type != MLX5_IB_QPT_DCT) - MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); + MLX5_SET(qpc, qpc, cs_res, + rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE : + MLX5_RES_SCAT_DATA32_CQE); } static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev, diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 87b3198f4b5d..f4d4010b7e3e 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -238,6 +238,7 @@ enum mlx5_ib_query_dev_resp_flags { MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP = 1 << 0, MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD = 1 << 1, MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE = 1 << 2, + MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT = 1 << 3, }; enum mlx5_ib_tunnel_offloads { From 1c6bca6d75bca2cc47b5eafb9f7f16e368ffbeca Mon Sep 17 00:00:00 2001 From: Shahar S Matityahu Date: Mon, 15 Apr 2019 14:43:04 +0300 Subject: [PATCH 029/181] iwlwifi: don't panic in error path on non-msix systems The driver uses msix causes-register to handle both msix and non msix interrupts when performing sync nmi. On devices that do not support msix this register is unmapped and accessing it causes a kernel panic. Solve this by differentiating the two cases and accessing the proper causes-register in each case. Reported-by: Michal Hocko Signed-off-by: Shahar S Matityahu Signed-off-by: Luca Coelho --- .../net/wireless/intel/iwlwifi/pcie/trans.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 79c1dc05f948..c4375b868901 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -3644,20 +3644,27 @@ out_no_pci: void iwl_trans_pcie_sync_nmi(struct iwl_trans *trans) { + struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); unsigned long timeout = jiffies + IWL_TRANS_NMI_TIMEOUT; + u32 inta_addr, sw_err_bit; + + if (trans_pcie->msix_enabled) { + inta_addr = CSR_MSIX_HW_INT_CAUSES_AD; + sw_err_bit = MSIX_HW_INT_CAUSES_REG_SW_ERR; + } else { + inta_addr = CSR_INT; + sw_err_bit = CSR_INT_BIT_SW_ERR; + } iwl_disable_interrupts(trans); iwl_force_nmi(trans); while (time_after(timeout, jiffies)) { - u32 inta_hw = iwl_read32(trans, - CSR_MSIX_HW_INT_CAUSES_AD); + u32 inta_hw = iwl_read32(trans, inta_addr); /* Error detected by uCode */ - if (inta_hw & MSIX_HW_INT_CAUSES_REG_SW_ERR) { + if (inta_hw & sw_err_bit) { /* Clear causes register */ - iwl_write32(trans, CSR_MSIX_HW_INT_CAUSES_AD, - inta_hw & - MSIX_HW_INT_CAUSES_REG_SW_ERR); + iwl_write32(trans, inta_addr, inta_hw & sw_err_bit); break; } From 72d3c7bbc9b581e5f2a455e6f399c75626653945 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 25 Mar 2019 14:11:52 +0100 Subject: [PATCH 030/181] iwlwifi: mvm: don't attempt debug collection in rfkill If we fail to initialize because rfkill is enabled, then trying to do debug collection currently just fails. Prevent that in the high-level code, although we should probably also fix the lower level code to do things more carefully. It's not 100% clear that it fixes this commit, as the original dump code at the time might've been more careful. In any case, we don't really need to dump anything in this expected scenario. Signed-off-by: Johannes Berg Fixes: 7125648074e8 ("iwlwifi: add fw dump upon RT ucode start failure") Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 4 +++- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 00a47f6f1d81..ab68b5d53ec9 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -1121,7 +1121,9 @@ int iwl_mvm_up(struct iwl_mvm *mvm) ret = iwl_mvm_load_rt_fw(mvm); if (ret) { IWL_ERR(mvm, "Failed to start RT ucode: %d\n", ret); - iwl_fw_dbg_error_collect(&mvm->fwrt, FW_DBG_TRIGGER_DRIVER); + if (ret != -ERFKILL) + iwl_fw_dbg_error_collect(&mvm->fwrt, + FW_DBG_TRIGGER_DRIVER); goto error; } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index ba27dce4c2bb..13681b03c10e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -834,7 +834,7 @@ iwl_op_mode_mvm_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, mutex_lock(&mvm->mutex); iwl_mvm_ref(mvm, IWL_MVM_REF_INIT_UCODE); err = iwl_run_init_mvm_ucode(mvm, true); - if (err) + if (err && err != -ERFKILL) iwl_fw_dbg_error_collect(&mvm->fwrt, FW_DBG_TRIGGER_DRIVER); if (!iwlmvm_mod_params.init_dbg || !err) iwl_mvm_stop_device(mvm); From b35f63972c5c67fc0f908286f7fc624137788876 Mon Sep 17 00:00:00 2001 From: Shahar S Matityahu Date: Wed, 20 Mar 2019 17:41:16 +0200 Subject: [PATCH 031/181] iwlwifi: dbg_ini: check debug TLV type explicitly In ini debug TLVs bit 24 is set. The driver relies on it in the memory allocation for the debug configuration. This implementation is problematic in case of a new debug TLV that is not supported yet is added and uses bit 24. In such a scenario the driver allocate space without using it which causes errors in the apply point enabling flow. Solve it by explicitly checking if a given TLV is part of the list of the supported ini debug TLVs. Signed-off-by: Shahar S Matityahu Fixes: f14cda6f3b31 ("iwlwifi: trans: parse and store debug ini TLVs") Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/fw/file.h | 15 +++++++++------ drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/file.h b/drivers/net/wireless/intel/iwlwifi/fw/file.h index 641c95d03b15..e06407dc088b 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/file.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/file.h @@ -93,7 +93,7 @@ struct iwl_ucode_header { } u; }; -#define IWL_UCODE_INI_TLV_GROUP BIT(24) +#define IWL_UCODE_INI_TLV_GROUP 0x1000000 /* * new TLV uCode file layout @@ -148,11 +148,14 @@ enum iwl_ucode_tlv_type { IWL_UCODE_TLV_UMAC_DEBUG_ADDRS = 54, IWL_UCODE_TLV_LMAC_DEBUG_ADDRS = 55, IWL_UCODE_TLV_FW_RECOVERY_INFO = 57, - IWL_UCODE_TLV_TYPE_BUFFER_ALLOCATION = IWL_UCODE_INI_TLV_GROUP | 0x1, - IWL_UCODE_TLV_TYPE_HCMD = IWL_UCODE_INI_TLV_GROUP | 0x2, - IWL_UCODE_TLV_TYPE_REGIONS = IWL_UCODE_INI_TLV_GROUP | 0x3, - IWL_UCODE_TLV_TYPE_TRIGGERS = IWL_UCODE_INI_TLV_GROUP | 0x4, - IWL_UCODE_TLV_TYPE_DEBUG_FLOW = IWL_UCODE_INI_TLV_GROUP | 0x5, + + IWL_UCODE_TLV_TYPE_BUFFER_ALLOCATION = IWL_UCODE_INI_TLV_GROUP + 0x1, + IWL_UCODE_TLV_DEBUG_BASE = IWL_UCODE_TLV_TYPE_BUFFER_ALLOCATION, + IWL_UCODE_TLV_TYPE_HCMD = IWL_UCODE_INI_TLV_GROUP + 0x2, + IWL_UCODE_TLV_TYPE_REGIONS = IWL_UCODE_INI_TLV_GROUP + 0x3, + IWL_UCODE_TLV_TYPE_TRIGGERS = IWL_UCODE_INI_TLV_GROUP + 0x4, + IWL_UCODE_TLV_TYPE_DEBUG_FLOW = IWL_UCODE_INI_TLV_GROUP + 0x5, + IWL_UCODE_TLV_DEBUG_MAX = IWL_UCODE_TLV_TYPE_DEBUG_FLOW, /* TLVs 0x1000-0x2000 are for internal driver usage */ IWL_UCODE_TLV_FW_DBG_DUMP_LST = 0x1000, diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c index 5798f434f68f..c7070760a10a 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c @@ -126,7 +126,8 @@ void iwl_alloc_dbg_tlv(struct iwl_trans *trans, size_t len, const u8 *data, len -= ALIGN(tlv_len, 4); data += sizeof(*tlv) + ALIGN(tlv_len, 4); - if (!(tlv_type & IWL_UCODE_INI_TLV_GROUP)) + if (tlv_type < IWL_UCODE_TLV_DEBUG_BASE || + tlv_type > IWL_UCODE_TLV_DEBUG_MAX) continue; hdr = (void *)&tlv->data[0]; From 154d4899e4111ae24e68d6ba955f46856cb046bc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 30 Mar 2019 10:31:52 +0100 Subject: [PATCH 032/181] iwlwifi: mvm: properly check debugfs dentry before using it debugfs can now report an error code if something went wrong instead of just NULL. So if the return value is to be used as a "real" dentry, it needs to be checked if it is an error before dereferencing it. This is now happening because of ff9fb72bc077 ("debugfs: return error values, not NULL"). If multiple iwlwifi devices are in the system, this can cause problems when the driver attempts to create the main debugfs directory again. Later on in the code we fail horribly by trying to dereference a pointer that is an error value. Reported-by: Laura Abbott Reported-by: Gabriel Ramirez Cc: Johannes Berg Cc: Emmanuel Grumbach Cc: Luca Coelho Cc: Intel Linux Wireless Cc: Kalle Valo Cc: stable # 5.0 Signed-off-by: Greg Kroah-Hartman Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c index 2453ceabf00d..738eddb2e7ac 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c @@ -774,6 +774,11 @@ void iwl_mvm_vif_dbgfs_register(struct iwl_mvm *mvm, struct ieee80211_vif *vif) return; mvmvif->dbgfs_dir = debugfs_create_dir("iwlmvm", dbgfs_dir); + if (IS_ERR_OR_NULL(mvmvif->dbgfs_dir)) { + IWL_ERR(mvm, "Failed to create debugfs directory under %pd\n", + dbgfs_dir); + return; + } if (!mvmvif->dbgfs_dir) { IWL_ERR(mvm, "Failed to create debugfs directory under %pd\n", From c537e07b000bc00c9a5ac9d119ed2c8456a99b6e Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Tue, 16 Apr 2019 21:02:52 +0300 Subject: [PATCH 033/181] iwlwifi: cfg: use family 22560 based_params for AX210 family Specifically, max_tfd_queue_size should be 0x10000 like in 22560 family and not 0x100 like in 22000 family. Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/cfg/22000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index eb6defb6d0cd..0a87d87fbb4f 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -201,7 +201,7 @@ static const struct iwl_ht_params iwl_22000_ht_params = { #define IWL_DEVICE_AX210 \ IWL_DEVICE_AX200_COMMON, \ .device_family = IWL_DEVICE_FAMILY_AX210, \ - .base_params = &iwl_22000_base_params, \ + .base_params = &iwl_22560_base_params, \ .csr = &iwl_csr_v1, \ .min_txq_size = 128 From 44427c0fbc09b448b22410978a4ef6ee37599d25 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Apr 2019 14:35:19 +0800 Subject: [PATCH 034/181] crypto: xts - Fix atomic sleep when walking skcipher When we perform a walk in the completion function, we need to ensure that it is atomic. Reported-by: syzbot+6f72c20560060c98b566@syzkaller.appspotmail.com Fixes: 78105c7e769b ("crypto: xts - Drop use of auxiliary buffer") Cc: Signed-off-by: Herbert Xu Acked-by: Ondrej Mosnacek Signed-off-by: Herbert Xu --- crypto/xts.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crypto/xts.c b/crypto/xts.c index 847f54f76789..2f948328cabb 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -137,8 +137,12 @@ static void crypt_done(struct crypto_async_request *areq, int err) { struct skcipher_request *req = areq->data; - if (!err) + if (!err) { + struct rctx *rctx = skcipher_request_ctx(req); + + rctx->subreq.base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; err = xor_tweak_post(req); + } skcipher_request_complete(req, err); } From b257b48cd5830c5b1d0c347eb281f9c28056f881 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Apr 2019 14:37:34 +0800 Subject: [PATCH 035/181] crypto: lrw - Fix atomic sleep when walking skcipher When we perform a walk in the completion function, we need to ensure that it is atomic. Fixes: ac3c8f36c31d ("crypto: lrw - Do not use auxiliary buffer") Cc: Signed-off-by: Herbert Xu Acked-by: Ondrej Mosnacek Signed-off-by: Herbert Xu --- crypto/lrw.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index 0430ccd08728..08a0e458bc3e 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -212,8 +212,12 @@ static void crypt_done(struct crypto_async_request *areq, int err) { struct skcipher_request *req = areq->data; - if (!err) + if (!err) { + struct rctx *rctx = skcipher_request_ctx(req); + + rctx->subreq.base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; err = xor_tweak_post(req); + } skcipher_request_complete(req, err); } From 71adf60f0a925c0e0c7dc2d0311fe40b825be737 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 18 Apr 2019 15:27:25 +0200 Subject: [PATCH 036/181] drm/sun4i: Add missing drm_atomic_helper_shutdown at driver unbind A call to drm_atomic_helper_shutdown is required to properly release the internal references taken by the core and avoid warnings about leaking objects. Add it since it was missing. Fixes: 9026e0d122ac ("drm: Add Allwinner A10 Display Engine support") Signed-off-by: Paul Kocialkowski Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20190418132727.5128-2-paul.kocialkowski@bootlin.com --- drivers/gpu/drm/sun4i/sun4i_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c index 3ebd9f5e2719..c5871b15714f 100644 --- a/drivers/gpu/drm/sun4i/sun4i_drv.c +++ b/drivers/gpu/drm/sun4i/sun4i_drv.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -144,6 +145,7 @@ static void sun4i_drv_unbind(struct device *dev) drm_dev_unregister(drm); drm_kms_helper_poll_fini(drm); + drm_atomic_helper_shutdown(drm); drm_mode_config_cleanup(drm); of_reserved_mem_device_release(dev); drm_dev_put(drm); From 02b92adbe33e6dbd15dc6e32540b22f47c4ff0a2 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 18 Apr 2019 15:27:26 +0200 Subject: [PATCH 037/181] drm/sun4i: Set device driver data at bind time for use in unbind Our sun4i_drv_unbind gets the drm device using dev_get_drvdata. However, that driver data is never set in sun4i_drv_bind. Set it there to avoid getting a NULL pointer at unbind time. Fixes: 9026e0d122ac ("drm: Add Allwinner A10 Display Engine support") Signed-off-by: Paul Kocialkowski Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20190418132727.5128-3-paul.kocialkowski@bootlin.com --- drivers/gpu/drm/sun4i/sun4i_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c index c5871b15714f..8abaabde08ab 100644 --- a/drivers/gpu/drm/sun4i/sun4i_drv.c +++ b/drivers/gpu/drm/sun4i/sun4i_drv.c @@ -86,6 +86,8 @@ static int sun4i_drv_bind(struct device *dev) ret = -ENOMEM; goto free_drm; } + + dev_set_drvdata(dev, drm); drm->dev_private = drv; INIT_LIST_HEAD(&drv->frontend_list); INIT_LIST_HEAD(&drv->engine_list); From f5a9ed867c83875546c9aadd4ed8e785e9adcc3c Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 18 Apr 2019 15:27:27 +0200 Subject: [PATCH 038/181] drm/sun4i: Fix component unbinding and component master deletion For our component-backed driver to be properly removed, we need to delete the component master in sun4i_drv_remove and make sure to call component_unbind_all in the master's unbind so that all components are unbound when the master is. Fixes: 9026e0d122ac ("drm: Add Allwinner A10 Display Engine support") Signed-off-by: Paul Kocialkowski Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20190418132727.5128-4-paul.kocialkowski@bootlin.com --- drivers/gpu/drm/sun4i/sun4i_drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c index 8abaabde08ab..843b86661833 100644 --- a/drivers/gpu/drm/sun4i/sun4i_drv.c +++ b/drivers/gpu/drm/sun4i/sun4i_drv.c @@ -151,6 +151,8 @@ static void sun4i_drv_unbind(struct device *dev) drm_mode_config_cleanup(drm); of_reserved_mem_device_release(dev); drm_dev_put(drm); + + component_unbind_all(dev, NULL); } static const struct component_master_ops sun4i_drv_master_ops = { @@ -399,6 +401,8 @@ static int sun4i_drv_probe(struct platform_device *pdev) static int sun4i_drv_remove(struct platform_device *pdev) { + component_master_del(&pdev->dev, &sun4i_drv_master_ops); + return 0; } From fc834e607ae3d18e1a20bca3f9a2d7f52ea7a2be Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 18 Apr 2019 13:12:07 -0400 Subject: [PATCH 039/181] USB: dummy-hcd: Fix failure to give back unlinked URBs The syzkaller USB fuzzer identified a failure mode in which dummy-hcd would never give back an unlinked URB. This causes usb_kill_urb() to hang, leading to WARNINGs and unkillable threads. In dummy-hcd, all URBs are given back by the dummy_timer() routine as it scans through the list of pending URBS. Failure to give back URBs can be caused by failure to start or early exit from the scanning loop. The code currently has two such pathways: One is triggered when an unsupported bus transfer speed is encountered, and the other by exhausting the simulated bandwidth for USB transfers during a frame. This patch removes those two paths, thereby allowing all unlinked URBs to be given back in a timely manner. It adds a check for the bus speed when the gadget first starts running, so that dummy_timer() will never thereafter encounter an unsupported speed. And it prevents the loop from exiting as soon as the total bandwidth has been used up (the scanning loop continues, giving back unlinked URBs as they are found, but not transferring any more data). Thanks to Andrey Konovalov for manually running the syzkaller fuzzer to help track down the source of the bug. Signed-off-by: Alan Stern Reported-and-tested-by: syzbot+d919b0f29d7b5a4994b9@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index baf72f95f0f1..213b52508621 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -979,8 +979,18 @@ static int dummy_udc_start(struct usb_gadget *g, struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(g); struct dummy *dum = dum_hcd->dum; - if (driver->max_speed == USB_SPEED_UNKNOWN) + switch (g->speed) { + /* All the speeds we support */ + case USB_SPEED_LOW: + case USB_SPEED_FULL: + case USB_SPEED_HIGH: + case USB_SPEED_SUPER: + break; + default: + dev_err(dummy_dev(dum_hcd), "Unsupported driver max speed %d\n", + driver->max_speed); return -EINVAL; + } /* * SLAVE side init ... the layer above hardware, which @@ -1784,9 +1794,10 @@ static void dummy_timer(struct timer_list *t) /* Bus speed is 500000 bytes/ms, so use a little less */ total = 490000; break; - default: + default: /* Can't happen */ dev_err(dummy_dev(dum_hcd), "bogus device speed\n"); - return; + total = 0; + break; } /* FIXME if HZ != 1000 this will probably misbehave ... */ @@ -1828,7 +1839,7 @@ restart: /* Used up this frame's bandwidth? */ if (total <= 0) - break; + continue; /* find the gadget's ep for this request (if configured) */ address = usb_pipeendpoint (urb->pipe); From bd4264112f93045704731850c5e4d85db981cd85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 16 Apr 2019 11:49:17 +0200 Subject: [PATCH 040/181] drm/ttm: fix re-init of global structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a driver unloads without unloading TTM we don't correctly clear the global structures leading to errors on re-init. Next step should probably be to remove the global structures and kobjs all together, but this is tricky since we need to maintain backward compatibility. Signed-off-by: Christian König Reviewed-by: Karol Herbst Tested-by: Karol Herbst CC: stable@vger.kernel.org # 5.0.x Signed-off-by: Alex Deucher --- drivers/gpu/drm/ttm/ttm_bo.c | 10 +++++----- drivers/gpu/drm/ttm/ttm_memory.c | 5 +++-- include/drm/ttm/ttm_bo_driver.h | 1 - 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 0fa5034b9f9e..1a01669b159a 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -49,9 +49,8 @@ static void ttm_bo_global_kobj_release(struct kobject *kobj); * ttm_global_mutex - protecting the global BO state */ DEFINE_MUTEX(ttm_global_mutex); -struct ttm_bo_global ttm_bo_glob = { - .use_count = 0 -}; +unsigned ttm_bo_glob_use_count; +struct ttm_bo_global ttm_bo_glob; static struct attribute ttm_bo_count = { .name = "bo_count", @@ -1531,12 +1530,13 @@ static void ttm_bo_global_release(void) struct ttm_bo_global *glob = &ttm_bo_glob; mutex_lock(&ttm_global_mutex); - if (--glob->use_count > 0) + if (--ttm_bo_glob_use_count > 0) goto out; kobject_del(&glob->kobj); kobject_put(&glob->kobj); ttm_mem_global_release(&ttm_mem_glob); + memset(glob, 0, sizeof(*glob)); out: mutex_unlock(&ttm_global_mutex); } @@ -1548,7 +1548,7 @@ static int ttm_bo_global_init(void) unsigned i; mutex_lock(&ttm_global_mutex); - if (++glob->use_count > 1) + if (++ttm_bo_glob_use_count > 1) goto out; ret = ttm_mem_global_init(&ttm_mem_glob); diff --git a/drivers/gpu/drm/ttm/ttm_memory.c b/drivers/gpu/drm/ttm/ttm_memory.c index f1567c353b54..9a0909decb36 100644 --- a/drivers/gpu/drm/ttm/ttm_memory.c +++ b/drivers/gpu/drm/ttm/ttm_memory.c @@ -461,8 +461,8 @@ out_no_zone: void ttm_mem_global_release(struct ttm_mem_global *glob) { - unsigned int i; struct ttm_mem_zone *zone; + unsigned int i; /* let the page allocator first stop the shrink work. */ ttm_page_alloc_fini(); @@ -475,9 +475,10 @@ void ttm_mem_global_release(struct ttm_mem_global *glob) zone = glob->zones[i]; kobject_del(&zone->kobj); kobject_put(&zone->kobj); - } + } kobject_del(&glob->kobj); kobject_put(&glob->kobj); + memset(glob, 0, sizeof(*glob)); } static void ttm_check_swapping(struct ttm_mem_global *glob) diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index cbf3180cb612..668ad971cd7b 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -420,7 +420,6 @@ extern struct ttm_bo_global { /** * Protected by ttm_global_mutex. */ - unsigned int use_count; struct list_head device_list; /** From c2b71462d294cf517a0bc6e4fd6424d7cee5596f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 19 Apr 2019 13:52:38 -0400 Subject: [PATCH 041/181] USB: core: Fix bug caused by duplicate interface PM usage counter The syzkaller fuzzer reported a bug in the USB hub driver which turned out to be caused by a negative runtime-PM usage counter. This allowed a hub to be runtime suspended at a time when the driver did not expect it. The symptom is a WARNING issued because the hub's status URB is submitted while it is already active: URB 0000000031fb463e submitted while active WARNING: CPU: 0 PID: 2917 at drivers/usb/core/urb.c:363 The negative runtime-PM usage count was caused by an unfortunate design decision made when runtime PM was first implemented for USB. At that time, USB class drivers were allowed to unbind from their interfaces without balancing the usage counter (i.e., leaving it with a positive count). The core code would take care of setting the counter back to 0 before allowing another driver to bind to the interface. Later on when runtime PM was implemented for the entire kernel, the opposite decision was made: Drivers were required to balance their runtime-PM get and put calls. In order to maintain backward compatibility, however, the USB subsystem adapted to the new implementation by keeping an independent usage counter for each interface and using it to automatically adjust the normal usage counter back to 0 whenever a driver was unbound. This approach involves duplicating information, but what is worse, it doesn't work properly in cases where a USB class driver delays decrementing the usage counter until after the driver's disconnect() routine has returned and the counter has been adjusted back to 0. Doing so would cause the usage counter to become negative. There's even a warning about this in the USB power management documentation! As it happens, this is exactly what the hub driver does. The kick_hub_wq() routine increments the runtime-PM usage counter, and the corresponding decrement is carried out by hub_event() in the context of the hub_wq work-queue thread. This work routine may sometimes run after the driver has been unbound from its interface, and when it does it causes the usage counter to go negative. It is not possible for hub_disconnect() to wait for a pending hub_event() call to finish, because hub_disconnect() is called with the device lock held and hub_event() acquires that lock. The only feasible fix is to reverse the original design decision: remove the duplicate interface-specific usage counter and require USB drivers to balance their runtime PM gets and puts. As far as I know, all existing drivers currently do this. Signed-off-by: Alan Stern Reported-and-tested-by: syzbot+7634edaea4d0b341c625@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/usb/power-management.rst | 14 +++++++++----- drivers/usb/core/driver.c | 13 ------------- drivers/usb/storage/realtek_cr.c | 13 +++++-------- include/linux/usb.h | 2 -- 4 files changed, 14 insertions(+), 28 deletions(-) diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst index 79beb807996b..4a74cf6f2797 100644 --- a/Documentation/driver-api/usb/power-management.rst +++ b/Documentation/driver-api/usb/power-management.rst @@ -370,11 +370,15 @@ autosuspend the interface's device. When the usage counter is = 0 then the interface is considered to be idle, and the kernel may autosuspend the device. -Drivers need not be concerned about balancing changes to the usage -counter; the USB core will undo any remaining "get"s when a driver -is unbound from its interface. As a corollary, drivers must not call -any of the ``usb_autopm_*`` functions after their ``disconnect`` -routine has returned. +Drivers must be careful to balance their overall changes to the usage +counter. Unbalanced "get"s will remain in effect when a driver is +unbound from its interface, preventing the device from going into +runtime suspend should the interface be bound to a driver again. On +the other hand, drivers are allowed to achieve this balance by calling +the ``usb_autopm_*`` functions even after their ``disconnect`` routine +has returned -- say from within a work-queue routine -- provided they +retain an active reference to the interface (via ``usb_get_intf`` and +``usb_put_intf``). Drivers using the async routines are responsible for their own synchronization and mutual exclusion. diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 8987cec9549d..ebcadaad89d1 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -473,11 +473,6 @@ static int usb_unbind_interface(struct device *dev) pm_runtime_disable(dev); pm_runtime_set_suspended(dev); - /* Undo any residual pm_autopm_get_interface_* calls */ - for (r = atomic_read(&intf->pm_usage_cnt); r > 0; --r) - usb_autopm_put_interface_no_suspend(intf); - atomic_set(&intf->pm_usage_cnt, 0); - if (!error) usb_autosuspend_device(udev); @@ -1633,7 +1628,6 @@ void usb_autopm_put_interface(struct usb_interface *intf) int status; usb_mark_last_busy(udev); - atomic_dec(&intf->pm_usage_cnt); status = pm_runtime_put_sync(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), @@ -1662,7 +1656,6 @@ void usb_autopm_put_interface_async(struct usb_interface *intf) int status; usb_mark_last_busy(udev); - atomic_dec(&intf->pm_usage_cnt); status = pm_runtime_put(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), @@ -1684,7 +1677,6 @@ void usb_autopm_put_interface_no_suspend(struct usb_interface *intf) struct usb_device *udev = interface_to_usbdev(intf); usb_mark_last_busy(udev); - atomic_dec(&intf->pm_usage_cnt); pm_runtime_put_noidle(&intf->dev); } EXPORT_SYMBOL_GPL(usb_autopm_put_interface_no_suspend); @@ -1715,8 +1707,6 @@ int usb_autopm_get_interface(struct usb_interface *intf) status = pm_runtime_get_sync(&intf->dev); if (status < 0) pm_runtime_put_sync(&intf->dev); - else - atomic_inc(&intf->pm_usage_cnt); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); @@ -1750,8 +1740,6 @@ int usb_autopm_get_interface_async(struct usb_interface *intf) status = pm_runtime_get(&intf->dev); if (status < 0 && status != -EINPROGRESS) pm_runtime_put_noidle(&intf->dev); - else - atomic_inc(&intf->pm_usage_cnt); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); @@ -1775,7 +1763,6 @@ void usb_autopm_get_interface_no_resume(struct usb_interface *intf) struct usb_device *udev = interface_to_usbdev(intf); usb_mark_last_busy(udev); - atomic_inc(&intf->pm_usage_cnt); pm_runtime_get_noresume(&intf->dev); } EXPORT_SYMBOL_GPL(usb_autopm_get_interface_no_resume); diff --git a/drivers/usb/storage/realtek_cr.c b/drivers/usb/storage/realtek_cr.c index 31b024441938..cc794e25a0b6 100644 --- a/drivers/usb/storage/realtek_cr.c +++ b/drivers/usb/storage/realtek_cr.c @@ -763,18 +763,16 @@ static void rts51x_suspend_timer_fn(struct timer_list *t) break; case RTS51X_STAT_IDLE: case RTS51X_STAT_SS: - usb_stor_dbg(us, "RTS51X_STAT_SS, intf->pm_usage_cnt:%d, power.usage:%d\n", - atomic_read(&us->pusb_intf->pm_usage_cnt), + usb_stor_dbg(us, "RTS51X_STAT_SS, power.usage:%d\n", atomic_read(&us->pusb_intf->dev.power.usage_count)); - if (atomic_read(&us->pusb_intf->pm_usage_cnt) > 0) { + if (atomic_read(&us->pusb_intf->dev.power.usage_count) > 0) { usb_stor_dbg(us, "Ready to enter SS state\n"); rts51x_set_stat(chip, RTS51X_STAT_SS); /* ignore mass storage interface's children */ pm_suspend_ignore_children(&us->pusb_intf->dev, true); usb_autopm_put_interface_async(us->pusb_intf); - usb_stor_dbg(us, "RTS51X_STAT_SS 01, intf->pm_usage_cnt:%d, power.usage:%d\n", - atomic_read(&us->pusb_intf->pm_usage_cnt), + usb_stor_dbg(us, "RTS51X_STAT_SS 01, power.usage:%d\n", atomic_read(&us->pusb_intf->dev.power.usage_count)); } break; @@ -807,11 +805,10 @@ static void rts51x_invoke_transport(struct scsi_cmnd *srb, struct us_data *us) int ret; if (working_scsi(srb)) { - usb_stor_dbg(us, "working scsi, intf->pm_usage_cnt:%d, power.usage:%d\n", - atomic_read(&us->pusb_intf->pm_usage_cnt), + usb_stor_dbg(us, "working scsi, power.usage:%d\n", atomic_read(&us->pusb_intf->dev.power.usage_count)); - if (atomic_read(&us->pusb_intf->pm_usage_cnt) <= 0) { + if (atomic_read(&us->pusb_intf->dev.power.usage_count) <= 0) { ret = usb_autopm_get_interface(us->pusb_intf); usb_stor_dbg(us, "working scsi, ret=%d\n", ret); } diff --git a/include/linux/usb.h b/include/linux/usb.h index 5e49e82c4368..ff010d1fd1c7 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -200,7 +200,6 @@ usb_find_last_int_out_endpoint(struct usb_host_interface *alt, * @dev: driver model's view of this device * @usb_dev: if an interface is bound to the USB major, this will point * to the sysfs representation for that device. - * @pm_usage_cnt: PM usage counter for this interface * @reset_ws: Used for scheduling resets from atomic context. * @resetting_device: USB core reset the device, so use alt setting 0 as * current; needs bandwidth alloc after reset. @@ -257,7 +256,6 @@ struct usb_interface { struct device dev; /* interface specific device info */ struct device *usb_dev; - atomic_t pm_usage_cnt; /* usage counter for autosuspend */ struct work_struct reset_ws; /* for resets in atomic context */ }; #define to_usb_interface(d) container_of(d, struct usb_interface, dev) From 36f0c423552dacaca152324b8e9bda42a6d88865 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 19 Apr 2019 15:40:14 +0200 Subject: [PATCH 042/181] x86/boot: Disable RSDP parsing temporarily The original intention to move RDSP parsing very early, before KASLR does its ranges selection, was to accommodate movable memory regions machines (CONFIG_MEMORY_HOTREMOVE) to still be able to do memory hotplug. However, that broke kexec'ing a kernel on EFI machines because depending on where the EFI systab was mapped, on at least one machine it isn't present in the kexec mapping of the second kernel, leading to a triple fault in the early code. Fixing this properly requires significantly involved surgery and we cannot allow ourselves to do that, that close to the merge window. So disable the RSDP parsing code temporarily until it is fixed properly in the next release cycle. Signed-off-by: Borislav Petkov Cc: Ard Biesheuvel Cc: Baoquan He Cc: Chao Fan Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: indou.takao@jp.fujitsu.com Cc: Ingo Molnar Cc: Juergen Gross Cc: kasong@redhat.com Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: msys.mizuma@gmail.com Cc: Thomas Gleixner Cc: Tom Lendacky Cc: x86-ml Link: https://lkml.kernel.org/r/20190419141952.GE10324@zn.tnic --- arch/x86/boot/compressed/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index c0d6c560df69..5a237e8dbf8d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -352,7 +352,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, boot_params->hdr.loadflags &= ~KASLR_FLAG; /* Save RSDP address for later use. */ - boot_params->acpi_rsdp_addr = get_rsdp_addr(); + /* boot_params->acpi_rsdp_addr = get_rsdp_addr(); */ sanitize_boot_params(boot_params); From 35fa71a030caa50458a043560d4814ea9bcd639f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Apr 2019 10:23:23 -0600 Subject: [PATCH 043/181] io_uring: fail io_uring_register(2) on a dying io_uring instance If we have multiple threads doing io_uring_register(2) on an io_uring fd, then we can potentially try and kill the percpu reference while someone else has already killed it. Prevent this race by failing io_uring_register(2) if the ref is marked dying. This is safe since we're inside the io_uring mutex. Fixes: b19062a56726 ("io_uring: fix possible deadlock between io_uring_{enter,register}") Reported-by: syzbot Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index f65f85d89217..a2f39faed6a7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2934,6 +2934,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, { int ret; + /* + * We're inside the ring mutex, if the ref is already dying, then + * someone else killed the ctx or is already going through + * io_uring_register(). + */ + if (percpu_ref_is_dying(&ctx->refs)) + return -ENXIO; + percpu_ref_kill(&ctx->refs); /* From e523a29c4f2703bdb98f68ce1bb256e259fd8d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Fri, 19 Apr 2019 11:57:44 +0200 Subject: [PATCH 044/181] io_uring: fix race condition reading SQ entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A read memory barrier is required between reading SQ tail and reading the actual data belonging to the SQ entry. Userspace needs a matching write barrier between writing SQ entries and updating SQ tail (using smp_store_release to update tail will do). Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a2f39faed6a7..41e3a6f6a096 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1739,7 +1739,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) head = ctx->cached_sq_head; /* See comment at the top of this file */ smp_rmb(); - if (head == READ_ONCE(ring->r.tail)) + /* make sure SQ entry isn't read before tail */ + if (head == smp_load_acquire(&ring->r.tail)) return false; head = READ_ONCE(ring->array[head & ctx->sq_mask]); From 0d7bae69c574c5f25802f8a71252e7d66933a3ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Fri, 19 Apr 2019 11:57:45 +0200 Subject: [PATCH 045/181] io_uring: fix race condition when sq threads goes sleeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reading the SQ tail needs to come after setting IORING_SQ_NEED_WAKEUP in flags; there is no cheap barrier for ordering a store before a load, a full memory barrier is required. Userspace needs a full memory barrier between updating SQ tail and checking for the IORING_SQ_NEED_WAKEUP too. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 41e3a6f6a096..69910fd9ccca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1865,7 +1865,8 @@ static int io_sq_thread(void *data) /* Tell userspace we may need a wakeup call */ ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; - smp_wmb(); + /* make sure to read SQ tail after writing flags */ + smp_mb(); if (!io_get_sqring(ctx, &sqes[0])) { if (kthread_should_stop()) { From fb775faa9e46ff481e4ced11116c9bd45359cb43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Fri, 19 Apr 2019 11:57:46 +0200 Subject: [PATCH 046/181] io_uring: fix poll full SQ detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit io_uring_poll shouldn't signal EPOLLOUT | EPOLLWRNORM if the queue is full; the old check would always signal EPOLLOUT | EPOLLWRNORM (unless there were U32_MAX - 1 entries in the SQ queue). Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 69910fd9ccca..b998e98acd01 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2576,7 +2576,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) poll_wait(file, &ctx->cq_wait, wait); /* See comment at the top of this file */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != + ctx->sq_ring->ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; From f147384774a7b24dda4783a3dcd61af272757ea8 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Mon, 1 Apr 2019 20:38:19 +0200 Subject: [PATCH 047/181] dmaengine: bcm2835: Avoid GFP_KERNEL in device_prep_slave_sg The commit af19b7ce76ba ("mmc: bcm2835: Avoid possible races on data requests") introduces a possible circular locking dependency, which is triggered by swapping to the sdhost interface. So instead of reintroduce the race condition again, we could also avoid this situation by using GFP_NOWAIT for the allocation of the DMA buffer descriptors. Reported-by: Aaro Koskinen Signed-off-by: Stefan Wahren Fixes: af19b7ce76ba ("mmc: bcm2835: Avoid possible races on data requests") Link: http://lists.infradead.org/pipermail/linux-rpi-kernel/2019-March/008615.html Signed-off-by: Vinod Koul --- drivers/dma/bcm2835-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/bcm2835-dma.c b/drivers/dma/bcm2835-dma.c index ec8a291d62ba..54093ffd0aef 100644 --- a/drivers/dma/bcm2835-dma.c +++ b/drivers/dma/bcm2835-dma.c @@ -671,7 +671,7 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_slave_sg( d = bcm2835_dma_create_cb_chain(chan, direction, false, info, extra, frames, src, dst, 0, 0, - GFP_KERNEL); + GFP_NOWAIT); if (!d) return NULL; From 907bd68a2edc491849e2fdcfe52c4596627bca94 Mon Sep 17 00:00:00 2001 From: Dirk Behme Date: Fri, 12 Apr 2019 07:29:13 +0200 Subject: [PATCH 048/181] dmaengine: sh: rcar-dmac: With cyclic DMA residue 0 is valid Having a cyclic DMA, a residue 0 is not an indication of a completed DMA. In case of cyclic DMA make sure that dma_set_residue() is called and with this a residue of 0 is forwarded correctly to the caller. Fixes: 3544d2878817 ("dmaengine: rcar-dmac: use result of updated get_residue in tx_status") Signed-off-by: Dirk Behme Signed-off-by: Achim Dahlhoff Signed-off-by: Hiroyuki Yokoyama Signed-off-by: Yao Lihua Reviewed-by: Yoshihiro Shimoda Reviewed-by: Laurent Pinchart Cc: # v4.8+ Signed-off-by: Vinod Koul --- drivers/dma/sh/rcar-dmac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index 2b4f25698169..54810ffd95e2 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -1368,6 +1368,7 @@ static enum dma_status rcar_dmac_tx_status(struct dma_chan *chan, enum dma_status status; unsigned long flags; unsigned int residue; + bool cyclic; status = dma_cookie_status(chan, cookie, txstate); if (status == DMA_COMPLETE || !txstate) @@ -1375,10 +1376,11 @@ static enum dma_status rcar_dmac_tx_status(struct dma_chan *chan, spin_lock_irqsave(&rchan->lock, flags); residue = rcar_dmac_chan_get_residue(rchan, cookie); + cyclic = rchan->desc.running ? rchan->desc.running->cyclic : false; spin_unlock_irqrestore(&rchan->lock, flags); /* if there's no residue, the cookie is complete */ - if (!residue) + if (!residue && !cyclic) return DMA_COMPLETE; dma_set_residue(txstate, residue); From 6e7da74775348d96e2d7efaf3f91410e18c481ef Mon Sep 17 00:00:00 2001 From: Achim Dahlhoff Date: Fri, 12 Apr 2019 07:29:14 +0200 Subject: [PATCH 049/181] dmaengine: sh: rcar-dmac: Fix glitch in dmaengine_tx_status The tx_status poll in the rcar_dmac driver reads the status register which indicates which chunk is busy (DMACHCRB). Afterwards the point inside the chunk is read from DMATCRB. It is possible that the chunk has changed between the two reads. The result is a non-monotonous increase of the residue. Fix this by introducing a 'safe read' logic. Fixes: 73a47bd0da66 ("dmaengine: rcar-dmac: use TCRB instead of TCR for residue") Signed-off-by: Achim Dahlhoff Signed-off-by: Dirk Behme Reviewed-by: Yoshihiro Shimoda Cc: # v4.16+ Signed-off-by: Vinod Koul --- drivers/dma/sh/rcar-dmac.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index 54810ffd95e2..e2a5398f89b5 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -1282,6 +1282,9 @@ static unsigned int rcar_dmac_chan_get_residue(struct rcar_dmac_chan *chan, enum dma_status status; unsigned int residue = 0; unsigned int dptr = 0; + unsigned int chcrb; + unsigned int tcrb; + unsigned int i; if (!desc) return 0; @@ -1329,6 +1332,24 @@ static unsigned int rcar_dmac_chan_get_residue(struct rcar_dmac_chan *chan, return 0; } + /* + * We need to read two registers. + * Make sure the control register does not skip to next chunk + * while reading the counter. + * Trying it 3 times should be enough: Initial read, retry, retry + * for the paranoid. + */ + for (i = 0; i < 3; i++) { + chcrb = rcar_dmac_chan_read(chan, RCAR_DMACHCRB) & + RCAR_DMACHCRB_DPTR_MASK; + tcrb = rcar_dmac_chan_read(chan, RCAR_DMATCRB); + /* Still the same? */ + if (chcrb == (rcar_dmac_chan_read(chan, RCAR_DMACHCRB) & + RCAR_DMACHCRB_DPTR_MASK)) + break; + } + WARN_ONCE(i >= 3, "residue might be not continuous!"); + /* * In descriptor mode the descriptor running pointer is not maintained * by the interrupt handler, find the running descriptor from the @@ -1336,8 +1357,7 @@ static unsigned int rcar_dmac_chan_get_residue(struct rcar_dmac_chan *chan, * mode just use the running descriptor pointer. */ if (desc->hwdescs.use) { - dptr = (rcar_dmac_chan_read(chan, RCAR_DMACHCRB) & - RCAR_DMACHCRB_DPTR_MASK) >> RCAR_DMACHCRB_DPTR_SHIFT; + dptr = chcrb >> RCAR_DMACHCRB_DPTR_SHIFT; if (dptr == 0) dptr = desc->nchunks; dptr--; @@ -1355,7 +1375,7 @@ static unsigned int rcar_dmac_chan_get_residue(struct rcar_dmac_chan *chan, } /* Add the residue for the current chunk. */ - residue += rcar_dmac_chan_read(chan, RCAR_DMATCRB) << desc->xfer_shift; + residue += tcrb << desc->xfer_shift; return residue; } From d4d18e3ec6091843f607e8929a56723e28f393a6 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 17 Apr 2019 21:29:29 -0700 Subject: [PATCH 050/181] arm64: mm: Ensure tail of unaligned initrd is reserved In the event that the start address of the initrd is not aligned, but has an aligned size, the base + size will not cover the entire initrd image and there is a chance that the kernel will corrupt the tail of the image. By aligning the end of the initrd to a page boundary and then subtracting the adjusted start address the memblock reservation will cover all pages that contains the initrd. Fixes: c756c592e442 ("arm64: Utilize phys_initrd_start/phys_initrd_size") Cc: stable@vger.kernel.org Acked-by: Will Deacon Signed-off-by: Bjorn Andersson Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6bc135042f5e..7cae155e81a5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -363,7 +363,7 @@ void __init arm64_memblock_init(void) * Otherwise, this is a no-op */ u64 base = phys_initrd_start & PAGE_MASK; - u64 size = PAGE_ALIGN(phys_initrd_size); + u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; /* * We can only add back the initrd memory if we don't end up From f1267cf3c01b12e0f843fb6a7450a7f0b2efab8a Mon Sep 17 00:00:00 2001 From: Bhagavathi Perumal S Date: Tue, 16 Apr 2019 12:54:40 +0530 Subject: [PATCH 051/181] mac80211: Fix kernel panic due to use of txq after free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The txq of vif is added to active_txqs list for ATF TXQ scheduling in the function ieee80211_queue_skb(), but it was not properly removed before freeing the txq object. It was causing use after free of the txq objects from the active_txqs list, result was kernel panic due to invalid memory access. Fix kernel invalid memory access by properly removing txq object from active_txqs list before free the object. Signed-off-by: Bhagavathi Perumal S Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 4a6ff1482a9f..02d2e6f11e93 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1908,6 +1908,9 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); + if (sdata->vif.txq) + ieee80211_txq_purge(sdata->local, to_txq_info(sdata->vif.txq)); + synchronize_rcu(); if (sdata->dev) { From 8772eed9a95abd82cf188c93edb9645543ca4418 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Tue, 16 Apr 2019 11:16:33 +0530 Subject: [PATCH 052/181] cfg80211: Notify previous user request during self managed wiphy registration Commit c82c06ce43d3("cfg80211: Notify all User Hints To self managed wiphys") notified all new user hints to self managed wiphy's after device registration. But it didn't do this for anything other than cell base hints done before registration. This needs to be done during wiphy registration of a self managed device also, so that the previous user settings are retained. Fixes: c82c06ce43d3 ("cfg80211: Notify all User Hints To self managed wiphys") Signed-off-by: Sriram R Signed-off-by: Johannes Berg --- net/wireless/reg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 0ba778f371cb..a6fd5ce199da 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3778,10 +3778,9 @@ void wiphy_regulatory_register(struct wiphy *wiphy) /* * The last request may have been received before this * registration call. Call the driver notifier if - * initiator is USER and user type is CELL_BASE. + * initiator is USER. */ - if (lr->initiator == NL80211_REGDOM_SET_BY_USER && - lr->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE) + if (lr->initiator == NL80211_REGDOM_SET_BY_USER) reg_call_notifier(wiphy, lr); } From 517879147493a5e1df6b89a50f708f1133fcaddb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Apr 2019 11:39:33 +0200 Subject: [PATCH 053/181] mac80211: don't attempt to rename ERR_PTR() debugfs dirs We need to dereference the directory to get its parent to be able to rename it, so it's clearly not safe to try to do this with ERR_PTR() pointers. Skip in this case. It seems that this is most likely what was causing the report by syzbot, but I'm not entirely sure as it didn't come with a reproducer this time. Cc: stable@vger.kernel.org Reported-by: syzbot+4ece1a28b8f4730547c9@syzkaller.appspotmail.com Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index cff0fb3578c9..deb3faf08337 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -841,7 +841,7 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) dir = sdata->vif.debugfs_dir; - if (!dir) + if (IS_ERR_OR_NULL(dir)) return; sprintf(buf, "netdev:%s", sdata->name); From 4e69ecf4da1ee0b2ac735e1f1bb13935acd5a38d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 12 Apr 2019 23:59:25 -0700 Subject: [PATCH 054/181] arm64/module: ftrace: deal with place relative nature of PLTs Another bodge for the ftrace PLT code: plt_entries_equal() now takes the place relative nature of the ADRP/ADD based PLT entries into account, which means that a struct trampoline instance on the stack is no longer equal to the same set of opcodes in the module struct, given that they don't point to the same place in memory anymore. Work around this by using memcmp() in the ftrace PLT handling code. Acked-by: Will Deacon Tested-by: dann frazier Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/kernel/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 07b298120182..65a51331088e 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -103,10 +103,15 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) * to be revisited if support for multiple ftrace entry points * is added in the future, but for now, the pr_err() below * deals with a theoretical issue only. + * + * Note that PLTs are place relative, and plt_entries_equal() + * checks whether they point to the same target. Here, we need + * to check if the actual opcodes are in fact identical, + * regardless of the offset in memory so use memcmp() instead. */ trampoline = get_plt_entry(addr, mod->arch.ftrace_trampoline); - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &trampoline)) { + if (memcmp(mod->arch.ftrace_trampoline, &trampoline, + sizeof(trampoline))) { if (plt_entry_is_initialized(mod->arch.ftrace_trampoline)) { pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n"); return -EINVAL; From d4fad0a426c6e26f48c9a7cdd21a7fe9c198d645 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 12 Apr 2019 17:59:40 +0200 Subject: [PATCH 055/181] gpu: ipu-v3: dp: fix CSC handling Initialize the flow input colorspaces to unknown and reset to that value when the channel gets disabled. This avoids the state getting mixed up with a previous mode. Also keep the CSC settings for the background flow intact when disabling the foreground flow. Root-caused-by: Jonathan Marek Signed-off-by: Lucas Stach Signed-off-by: Philipp Zabel --- drivers/gpu/ipu-v3/ipu-dp.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/ipu-v3/ipu-dp.c b/drivers/gpu/ipu-v3/ipu-dp.c index 9b2b3fa479c4..5e44ff1f2085 100644 --- a/drivers/gpu/ipu-v3/ipu-dp.c +++ b/drivers/gpu/ipu-v3/ipu-dp.c @@ -195,7 +195,8 @@ int ipu_dp_setup_channel(struct ipu_dp *dp, ipu_dp_csc_init(flow, flow->foreground.in_cs, flow->out_cs, DP_COM_CONF_CSC_DEF_BOTH); } else { - if (flow->foreground.in_cs == flow->out_cs) + if (flow->foreground.in_cs == IPUV3_COLORSPACE_UNKNOWN || + flow->foreground.in_cs == flow->out_cs) /* * foreground identical to output, apply color * conversion on background @@ -261,6 +262,8 @@ void ipu_dp_disable_channel(struct ipu_dp *dp, bool sync) struct ipu_dp_priv *priv = flow->priv; u32 reg, csc; + dp->in_cs = IPUV3_COLORSPACE_UNKNOWN; + if (!dp->foreground) return; @@ -268,8 +271,9 @@ void ipu_dp_disable_channel(struct ipu_dp *dp, bool sync) reg = readl(flow->base + DP_COM_CONF); csc = reg & DP_COM_CONF_CSC_DEF_MASK; - if (csc == DP_COM_CONF_CSC_DEF_FG) - reg &= ~DP_COM_CONF_CSC_DEF_MASK; + reg &= ~DP_COM_CONF_CSC_DEF_MASK; + if (csc == DP_COM_CONF_CSC_DEF_BOTH || csc == DP_COM_CONF_CSC_DEF_BG) + reg |= DP_COM_CONF_CSC_DEF_BG; reg &= ~DP_COM_CONF_FG_EN; writel(reg, flow->base + DP_COM_CONF); @@ -347,6 +351,8 @@ int ipu_dp_init(struct ipu_soc *ipu, struct device *dev, unsigned long base) mutex_init(&priv->mutex); for (i = 0; i < IPUV3_NUM_FLOWS; i++) { + priv->flow[i].background.in_cs = IPUV3_COLORSPACE_UNKNOWN; + priv->flow[i].foreground.in_cs = IPUV3_COLORSPACE_UNKNOWN; priv->flow[i].foreground.foreground = true; priv->flow[i].base = priv->base + ipu_dp_flow_base[i]; priv->flow[i].priv = priv; From 7bcde275eb1d0ac8793c77c7e666a886eb16633d Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 12 Apr 2019 17:59:41 +0200 Subject: [PATCH 056/181] drm/imx: don't skip DP channel disable for background plane In order to make sure that the plane color space gets reset correctly. Signed-off-by: Lucas Stach Signed-off-by: Philipp Zabel --- drivers/gpu/drm/imx/ipuv3-crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/imx/ipuv3-crtc.c b/drivers/gpu/drm/imx/ipuv3-crtc.c index ec3602ebbc1c..54011df8c2e8 100644 --- a/drivers/gpu/drm/imx/ipuv3-crtc.c +++ b/drivers/gpu/drm/imx/ipuv3-crtc.c @@ -71,7 +71,7 @@ static void ipu_crtc_disable_planes(struct ipu_crtc *ipu_crtc, if (disable_partial) ipu_plane_disable(ipu_crtc->plane[1], true); if (disable_full) - ipu_plane_disable(ipu_crtc->plane[0], false); + ipu_plane_disable(ipu_crtc->plane[0], true); } static void ipu_crtc_atomic_disable(struct drm_crtc *crtc, From 8358e3a8264a228cf2dfb6f3a05c0328f4118f12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Apr 2019 08:17:58 -0600 Subject: [PATCH 057/181] io_uring: remove 'state' argument from io_{read,write} path Since commit 09bb839434b we don't use the state argument for any sort of on-stack caching in the io read and write path. Remove the stale and unused argument from them, and bubble it up to __io_submit_sqe() and down to io_prep_rw(). Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b998e98acd01..0e9fb2cb1984 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -740,7 +740,7 @@ static bool io_file_supports_async(struct file *file) } static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; @@ -938,7 +938,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -947,7 +947,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; file = kiocb->ki_filp; @@ -985,7 +985,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, } static int io_write(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -994,7 +994,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; @@ -1336,8 +1336,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) } static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct sqe_submit *s, bool force_nonblock, - struct io_submit_state *state) + const struct sqe_submit *s, bool force_nonblock) { int ret, opcode; @@ -1353,18 +1352,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_READV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITEV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_READ_FIXED: - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITE_FIXED: - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_FSYNC: ret = io_fsync(req, s->sqe, force_nonblock); @@ -1457,7 +1456,7 @@ restart: s->has_user = cur_mm != NULL; s->needs_lock = true; do { - ret = __io_submit_sqe(ctx, req, s, false, NULL); + ret = __io_submit_sqe(ctx, req, s, false); /* * We can get EAGAIN for polled IO even though * we're forcing a sync submission from here, @@ -1623,7 +1622,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(ret)) goto out; - ret = __io_submit_sqe(ctx, req, s, true, state); + ret = __io_submit_sqe(ctx, req, s, true); if (ret == -EAGAIN) { struct io_uring_sqe *sqe_copy; From f5d356328d676deca698d01324000e0d98fba643 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sat, 20 Apr 2019 14:50:50 +0200 Subject: [PATCH 058/181] drm/sched: Fix description of drm_sched_stop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 222b5f044159 ("drm/sched: Refactor ring mirror list handling."), drm_sched_hw_job_reset is no longer there, so let's adjust the doc comment accordingly. Reviewed-by: Andrey Grodzovsky Signed-off-by: Jonathan Neuschäfer Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/sched_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 19fc601c9eeb..a1bec2779e76 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -366,10 +366,9 @@ void drm_sched_increase_karma(struct drm_sched_job *bad) EXPORT_SYMBOL(drm_sched_increase_karma); /** - * drm_sched_hw_job_reset - stop the scheduler if it contains the bad job + * drm_sched_stop - stop the scheduler * * @sched: scheduler instance - * @bad: bad scheduler job * */ void drm_sched_stop(struct drm_gpu_scheduler *sched) From 503621628b32782a07b2318e4112bd4372aa3401 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 23 Apr 2019 17:09:38 +0100 Subject: [PATCH 059/181] ARM: fix function graph tracer and unwinder dependencies Naresh Kamboju recently reported that the function-graph tracer crashes on ARM. The function-graph tracer assumes that the kernel is built with frame pointers. We explicitly disabled the function-graph tracer when building Thumb2, since the Thumb2 ABI doesn't have frame pointers. We recently changed the way the unwinder method was selected, which seems to have made it more likely that we can end up with the function- graph tracer enabled but without the kernel built with frame pointers. Fix up the function graph tracer dependencies so the option is not available when we have no possibility of having frame pointers, and adjust the dependencies on the unwinder option to hide the non-frame pointer unwinder options if the function-graph tracer is enabled. Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Russell King --- arch/arm/Kconfig | 2 +- arch/arm/Kconfig.debug | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 054ead960f98..5bd8d781b9ff 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -73,7 +73,7 @@ config ARM select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL - select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL + select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 6d6e0330930b..e388af4594a6 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -47,8 +47,8 @@ config DEBUG_WX choice prompt "Choose kernel unwinder" - default UNWINDER_ARM if AEABI && !FUNCTION_GRAPH_TRACER - default UNWINDER_FRAME_POINTER if !AEABI || FUNCTION_GRAPH_TRACER + default UNWINDER_ARM if AEABI + default UNWINDER_FRAME_POINTER if !AEABI help This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc//stack, @@ -65,7 +65,7 @@ config UNWINDER_FRAME_POINTER config UNWINDER_ARM bool "ARM EABI stack unwinder" - depends on AEABI + depends on AEABI && !FUNCTION_GRAPH_TRACER select ARM_UNWIND help This option enables stack unwinding support in the kernel From c3143967807adb1357c36b68a7563fc0c4e1f615 Mon Sep 17 00:00:00 2001 From: Tigran Tadevosyan Date: Fri, 5 Apr 2019 14:16:13 +0100 Subject: [PATCH 060/181] ARM: 8856/1: NOMMU: Fix CCR register faulty initialization when MPU is disabled When CONFIG_ARM_MPU is not defined, the base address of v7M SCB register is not initialized with correct value. This prevents enabling I/D caches when the L1 cache poilcy is applied in kernel. Fixes: 3c24121039c9da14692eb48f6e39565b28c0f3cf ("ARM: 8756/1: NOMMU: Postpone MPU activation till __after_proc_init") Signed-off-by: Tigran Tadevosyan Signed-off-by: Vladimir Murzin Signed-off-by: Russell King --- arch/arm/kernel/head-nommu.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index c08d2d890f7b..b38bbd011b35 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -133,9 +133,9 @@ __secondary_data: */ .text __after_proc_init: -#ifdef CONFIG_ARM_MPU M_CLASS(movw r12, #:lower16:BASEADDR_V7M_SCB) M_CLASS(movt r12, #:upper16:BASEADDR_V7M_SCB) +#ifdef CONFIG_ARM_MPU M_CLASS(ldr r3, [r12, 0x50]) AR_CLASS(mrc p15, 0, r3, c0, c1, 4) @ Read ID_MMFR0 and r3, r3, #(MMFR0_PMSA) @ PMSA field From e17b1af96b2afc38e684aa2f1033387e2ed10029 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 12 Apr 2019 22:34:18 +0100 Subject: [PATCH 061/181] ARM: 8857/1: efi: enable CP15 DMB instructions before cleaning the cache The EFI stub is entered with the caches and MMU enabled by the firmware, and once the stub is ready to hand over to the decompressor, we clean and disable the caches. The cache clean routines use CP15 barrier instructions, which can be disabled via SCTLR. Normally, when using the provided cache handling routines to enable the caches and MMU, this bit is enabled as well. However, but since we entered the stub with the caches already enabled, this routine is not executed before we call the cache clean routines, resulting in undefined instruction exceptions if the firmware never enabled this bit. So set the bit explicitly in the EFI entry code, but do so in a way that guarantees that the resulting code can still run on v6 cores as well (which are guaranteed to have CP15 barriers enabled) Cc: # v4.9+ Acked-by: Marc Zyngier Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/boot/compressed/head.S | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 6c7ccb428c07..7135820f76d4 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -1438,7 +1438,21 @@ ENTRY(efi_stub_entry) @ Preserve return value of efi_entry() in r4 mov r4, r0 - bl cache_clean_flush + + @ our cache maintenance code relies on CP15 barrier instructions + @ but since we arrived here with the MMU and caches configured + @ by UEFI, we must check that the CP15BEN bit is set in SCTLR. + @ Note that this bit is RAO/WI on v6 and earlier, so the ISB in + @ the enable path will be executed on v7+ only. + mrc p15, 0, r1, c1, c0, 0 @ read SCTLR + tst r1, #(1 << 5) @ CP15BEN bit set? + bne 0f + orr r1, r1, #(1 << 5) @ CP15 barrier instructions + mcr p15, 0, r1, c1, c0, 0 @ write SCTLR + ARM( .inst 0xf57ff06f @ v7+ isb ) + THUMB( isb ) + +0: bl cache_clean_flush bl cache_off @ Set parameters for booting zImage according to boot protocol From 1bcb344086f3ecf8d6705f6d708441baa823beb3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 15 Apr 2019 12:00:42 -0400 Subject: [PATCH 062/181] ceph: only use d_name directly when parent is locked Ben reported tripping the BUG_ON in create_request_message during some performance testing. Analysis of the vmcore showed that the length of the r_dentry->d_name string changed after we allocated the buffer, but before we encoded it. build_dentry_path returns pointers to d_name in the common case of non-snapped dentries, but this optimization isn't safe unless the parent directory is locked. When it isn't, have the code make a copy of the d_name while holding the d_lock. Cc: stable@vger.kernel.org Reported-by: Ben England Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 61 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 21c33ed048ed..bc5d70d6bfe6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2161,10 +2161,39 @@ retry: return path; } +/* Duplicate the dentry->d_name.name safely */ +static int clone_dentry_name(struct dentry *dentry, const char **ppath, + int *ppathlen) +{ + u32 len; + char *name; + +retry: + len = READ_ONCE(dentry->d_name.len); + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + return -ENOMEM; + + spin_lock(&dentry->d_lock); + if (dentry->d_name.len != len) { + spin_unlock(&dentry->d_lock); + kfree(name); + goto retry; + } + memcpy(name, dentry->d_name.name, len); + spin_unlock(&dentry->d_lock); + + name[len] = '\0'; + *ppath = name; + *ppathlen = len; + return 0; +} + static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath, bool parent_locked) { + int ret; char *path; rcu_read_lock(); @@ -2173,8 +2202,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (dir && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + if (parent_locked) { + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + } else { + ret = clone_dentry_name(dentry, ppath, ppathlen); + if (ret) + return ret; + *pfreepath = true; + } return 0; } rcu_read_unlock(); @@ -2182,13 +2218,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath) { struct dentry *dentry; char *path; @@ -2204,7 +2240,7 @@ static int build_inode_path(struct inode *inode, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } @@ -2215,7 +2251,7 @@ static int build_inode_path(struct inode *inode, static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, const char *rpath, u64 rino, const char **ppath, int *pathlen, - u64 *ino, int *freepath) + u64 *ino, bool *freepath, bool parent_locked) { int r = 0; @@ -2225,7 +2261,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, ceph_snap(rinode)); } else if (rdentry) { r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, - freepath); + freepath, parent_locked); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -2251,7 +2287,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; - int freepath1 = 0, freepath2 = 0; + bool freepath1 = false, freepath2 = false; int len; u16 releases; void *p, *end; @@ -2259,16 +2295,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1); + &path1, &pathlen1, &ino1, &freepath1, + test_bit(CEPH_MDS_R_PARENT_LOCKED, + &req->r_req_flags)); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* If r_old_dentry is set, then assume that its parent is locked */ ret = set_request_path_attr(NULL, req->r_old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2); + &path2, &pathlen2, &ino2, &freepath2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; From 76a495d666e5043ffc315695f8241f5e94a98849 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 17 Apr 2019 12:58:28 -0400 Subject: [PATCH 063/181] ceph: ensure d_name stability in ceph_dentry_hash() Take the d_lock here to ensure that d_name doesn't change. Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a8f429882249..0637149fb9f9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1766,6 +1766,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { struct ceph_inode_info *dci = ceph_inode(dir); + unsigned hash; switch (dci->i_dir_layout.dl_dir_hash) { case 0: /* for backward compat */ @@ -1773,8 +1774,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) return dn->d_name.hash; default: - return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + spin_lock(&dn->d_lock); + hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash, dn->d_name.name, dn->d_name.len); + spin_unlock(&dn->d_lock); + return hash; } } From 4b8222870032715f9d995f3eb7c7acd8379a275d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 15 Apr 2019 12:00:42 -0400 Subject: [PATCH 064/181] ceph: handle the case where a dentry has been renamed on outstanding req It's possible for us to issue a lookup to revalidate a dentry concurrently with a rename. If done in the right order, then we could end up processing dentry info in the reply that no longer reflects the state of the dentry. If req->r_dentry->d_name differs from the one in the trace, then just ignore the trace in the reply. We only need to do this however if the parent's i_rwsem is not held. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2d61ddda9bf5..c2feb310ac1e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1163,6 +1163,19 @@ static int splice_dentry(struct dentry **pdn, struct inode *in) return 0; } +static int d_name_cmp(struct dentry *dentry, const char *name, size_t len) +{ + int ret; + + /* take d_lock to ensure dentry->d_name stability */ + spin_lock(&dentry->d_lock); + ret = dentry->d_name.len - len; + if (!ret) + ret = memcmp(dentry->d_name.name, name, len); + spin_unlock(&dentry->d_lock); + return ret; +} + /* * Incorporate results into the local cache. This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., @@ -1412,7 +1425,8 @@ retry_lookup: err = splice_dentry(&req->r_dentry, in); if (err < 0) goto done; - } else if (rinfo->head->is_dentry) { + } else if (rinfo->head->is_dentry && + !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) { struct ceph_vino *ptvino = NULL; if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || From 37659182bff1eeaaeadcfc8f853c6d2b6dbc3f47 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 18 Apr 2019 11:24:57 +0800 Subject: [PATCH 065/181] ceph: fix ci->i_head_snapc leak We missed two places that i_wrbuffer_ref_head, i_wr_ref, i_dirty_caps and i_flushing_caps may change. When they are all zeros, we should free i_head_snapc. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/38224 Reported-and-tested-by: Luis Henriques Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 9 +++++++++ fs/ceph/snap.c | 7 ++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bc5d70d6bfe6..9049c2a3e972 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1414,6 +1414,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } + + if (drop && + ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 89aa37fa0f84..b26e12cd8ec3 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -572,7 +572,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) old_snapc = NULL; update_snapc: - if (ci->i_head_snapc) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } From d08106796a78a4273e39e1bbdf538dc4334b2635 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 1 Mar 2019 13:56:11 +0100 Subject: [PATCH 066/181] drm/vc4: Fix memory leak during gpu reset. __drm_atomic_helper_crtc_destroy_state does not free memory, it only cleans it up. Fix this by calling the functions own destroy function. Fixes: 6d6e50039187 ("drm/vc4: Allocate the right amount of space for boot-time CRTC state.") Cc: Eric Anholt Cc: # v4.6+ Reviewed-by: Eric Anholt Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20190301125627.7285-2-maarten.lankhorst@linux.intel.com --- drivers/gpu/drm/vc4/vc4_crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c index 730008d3da76..e7c04a9eb219 100644 --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -1042,7 +1042,7 @@ static void vc4_crtc_reset(struct drm_crtc *crtc) { if (crtc->state) - __drm_atomic_helper_crtc_destroy_state(crtc->state); + vc4_crtc_destroy_state(crtc->state); crtc->state = kzalloc(sizeof(struct vc4_crtc_state), GFP_KERNEL); if (crtc->state) From 0d02113b31b2017dd349ec9df2314e798a90fa6e Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 23 Apr 2019 12:58:11 -0400 Subject: [PATCH 067/181] x86/mm: Fix a crash with kmemleak_scan() The first kmemleak_scan() call after boot would trigger the crash below because this callpath: kernel_init free_initmem mem_encrypt_free_decrypted_mem free_init_pages unmaps memory inside the .bss when DEBUG_PAGEALLOC=y. kmemleak_init() will register the .data/.bss sections and then kmemleak_scan() will scan those addresses and dereference them looking for pointer references. If free_init_pages() frees and unmaps pages in those sections, kmemleak_scan() will crash if referencing one of those addresses: BUG: unable to handle kernel paging request at ffffffffbd402000 CPU: 12 PID: 325 Comm: kmemleak Not tainted 5.1.0-rc4+ #4 RIP: 0010:scan_block Call Trace: scan_gray_list kmemleak_scan kmemleak_scan_thread kthread ret_from_fork Since kmemleak_free_part() is tolerant to unknown objects (not tracked by kmemleak), it is fine to call it from free_init_pages() even if not all address ranges passed to this function are known to kmemleak. [ bp: Massage. ] Fixes: b3f0907c71e0 ("x86/mm: Add .bss..decrypted section to hold shared variables") Signed-off-by: Qian Cai Signed-off-by: Borislav Petkov Reviewed-by: Catalin Marinas Cc: Andy Lutomirski Cc: Brijesh Singh Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190423165811.36699-1-cai@lca.pw --- arch/x86/mm/init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f905a2371080..8dacdb96899e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -766,6 +767,11 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) if (debug_pagealloc_enabled()) { pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", begin, end - 1); + /* + * Inform kmemleak about the hole in the memory since the + * corresponding pages will be unmapped. + */ + kmemleak_free_part((void *)begin, end - begin); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); } else { /* From e02bc29b2cfa7806830d6da8b2322cddd67e8dfe Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Wed, 24 Apr 2019 11:04:13 +0200 Subject: [PATCH 068/181] drm/sun4i: Unbind components before releasing DRM and memory Our components may still be using the DRM device driver (if only to access our driver's private data), so make sure to unbind them before the final drm_dev_put. Also release our reserved memory after component unbind instead of before to match reverse creation order. Fixes: f5a9ed867c83 ("drm/sun4i: Fix component unbinding and component master deletion") Signed-off-by: Paul Kocialkowski Reviewed-by: Chen-Yu Tsai Link: https://patchwork.freedesktop.org/patch/msgid/20190424090413.6918-1-paul.kocialkowski@bootlin.com --- drivers/gpu/drm/sun4i/sun4i_drv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c index 843b86661833..29258b404e54 100644 --- a/drivers/gpu/drm/sun4i/sun4i_drv.c +++ b/drivers/gpu/drm/sun4i/sun4i_drv.c @@ -149,10 +149,11 @@ static void sun4i_drv_unbind(struct device *dev) drm_kms_helper_poll_fini(drm); drm_atomic_helper_shutdown(drm); drm_mode_config_cleanup(drm); - of_reserved_mem_device_release(dev); - drm_dev_put(drm); component_unbind_all(dev, NULL); + of_reserved_mem_device_release(dev); + + drm_dev_put(drm); } static const struct component_master_ops sun4i_drv_master_ops = { From 462ce5d963f18b71c63f6b7730a35a2ee5273540 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 24 Apr 2019 17:06:29 +0200 Subject: [PATCH 069/181] drm/vc4: Fix compilation error reported by kbuild test bot A pointer to crtc was missing, resulting in the following build error: drivers/gpu/drm/vc4/vc4_crtc.c:1045:44: sparse: sparse: incorrect type in argument 1 (different base types) drivers/gpu/drm/vc4/vc4_crtc.c:1045:44: sparse: expected struct drm_crtc *crtc drivers/gpu/drm/vc4/vc4_crtc.c:1045:44: sparse: got struct drm_crtc_state *state drivers/gpu/drm/vc4/vc4_crtc.c:1045:39: sparse: sparse: not enough arguments for function vc4_crtc_destroy_state Signed-off-by: Maarten Lankhorst Reported-by: kbuild test robot Cc: Eric Anholt Link: https://patchwork.freedesktop.org/patch/msgid/2b6ed5e6-81b0-4276-8860-870b54ca3262@linux.intel.com Fixes: d08106796a78 ("drm/vc4: Fix memory leak during gpu reset.") Cc: # v4.6+ Acked-by: Daniel Vetter --- drivers/gpu/drm/vc4/vc4_crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c index e7c04a9eb219..1baa10e94484 100644 --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -1042,7 +1042,7 @@ static void vc4_crtc_reset(struct drm_crtc *crtc) { if (crtc->state) - vc4_crtc_destroy_state(crtc->state); + vc4_crtc_destroy_state(crtc, crtc->state); crtc->state = kzalloc(sizeof(struct vc4_crtc_state), GFP_KERNEL); if (crtc->state) From c660133c339f9ab684fdf568c0d51b9ae5e86002 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Apr 2019 14:07:25 +0300 Subject: [PATCH 070/181] RDMA/mlx5: Do not allow the user to write to the clock page The intent of this VMA was to be read-only from user space, but the VM_MAYWRITE masking was missed, so mprotect could make it writable. Cc: stable@vger.kernel.org Fixes: 5c99eaecb1fc ("IB/mlx5: Mmap the HCA's clock info to user-space") Signed-off-by: Jason Gunthorpe Reviewed-by: Haggai Eran Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 241d9ead79eb..12a75dfc992b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2068,6 +2068,7 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (vma->vm_flags & VM_WRITE) return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; if (!dev->mdev->clock_info_page) return -EOPNOTSUPP; @@ -2233,6 +2234,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_flags & VM_WRITE) return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) From d5e560d3f72382ac4e3bfe4e0f0420e6a220b039 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Apr 2019 14:07:26 +0300 Subject: [PATCH 071/181] RDMA/mlx5: Use rdma_user_map_io for mapping BAR pages Since mlx5 supports device disassociate it must use this API for all BAR page mmaps, otherwise the pages can remain mapped after the device is unplugged causing a system crash. Cc: stable@vger.kernel.org Fixes: 5f9794dc94f5 ("RDMA/ucontext: Add a core API for mmaping driver IO memory") Signed-off-by: Jason Gunthorpe Reviewed-by: Haggai Eran Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 12a75dfc992b..d3dd290ae1b1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2240,14 +2240,12 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (PAGE_SIZE > 4096) return -EOPNOTSUPP; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); pfn = (dev->mdev->iseg_base + offsetof(struct mlx5_init_seg, internal_timer_h)) >> PAGE_SHIFT; - if (io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - break; + return rdma_user_mmap_io(&context->ibucontext, vma, pfn, + PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); case MLX5_IB_MMAP_CLOCK_INFO: return mlx5_ib_mmap_clock_info_page(dev, vma, context); From f06eba72274788db6a43012a05a99915c0283aef Mon Sep 17 00:00:00 2001 From: Jacky Bai Date: Fri, 5 Apr 2019 10:31:09 -0700 Subject: [PATCH 072/181] Input: snvs_pwrkey - make it depend on ARCH_MXC The SNVS power key is not only used on i.MX6SX and i.MX7D, it is also used by i.MX6UL and NXP's latest ARMv8 based i.MX8M series SOC. So update the config dependency to use ARCH_MXC, and add the COMPILE_TEST too. Signed-off-by: Jacky Bai Reviewed-by: Dong Aisheng Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index a878351f1643..52d7f55fca32 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -420,7 +420,7 @@ config KEYBOARD_MPR121 config KEYBOARD_SNVS_PWRKEY tristate "IMX SNVS Power Key Driver" - depends on SOC_IMX6SX || SOC_IMX7D + depends on ARCH_MXC || COMPILE_TEST depends on OF help This is the snvs powerkey driver for the Freescale i.MX application From 67f269b37f9b4d52c5e7f97acea26c0852e9b8a1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Apr 2019 14:07:28 +0300 Subject: [PATCH 073/181] RDMA/ucontext: Fix regression with disassociate When this code was consolidated the intention was that the VMA would become backed by anonymous zero pages after the zap_vma_pte - however this very subtly relied on setting the vm_ops = NULL and clearing the VM_SHARED bits to transform the VMA into an anonymous VMA. Since the vm_ops was removed this broke. Now userspace gets a SIGBUS if it touches the vma after disassociation. Instead of converting the VMA to anonymous provide a fault handler that puts a zero'd page into the VMA when user-space touches it after disassociation. Cc: stable@vger.kernel.org Suggested-by: Andrea Arcangeli Fixes: 5f9794dc94f5 ("RDMA/ucontext: Add a core API for mmaping driver IO memory") Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_main.c | 52 +++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index ea0bc6885517..32cc8fe7902f 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -160,6 +160,7 @@ struct ib_uverbs_file { struct mutex umap_lock; struct list_head umaps; + struct page *disassociate_page; struct idr idr; /* spinlock protects write access to idr */ diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 70b7d80431a9..db20b6e0f253 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -208,6 +208,9 @@ void ib_uverbs_release_file(struct kref *ref) kref_put(&file->async_file->ref, ib_uverbs_release_async_event_file); put_device(&file->device->dev); + + if (file->disassociate_page) + __free_pages(file->disassociate_page, 0); kfree(file); } @@ -877,9 +880,50 @@ static void rdma_umap_close(struct vm_area_struct *vma) kfree(priv); } +/* + * Once the zap_vma_ptes has been called touches to the VMA will come here and + * we return a dummy writable zero page for all the pfns. + */ +static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) +{ + struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data; + struct rdma_umap_priv *priv = vmf->vma->vm_private_data; + vm_fault_t ret = 0; + + if (!priv) + return VM_FAULT_SIGBUS; + + /* Read only pages can just use the system zero page. */ + if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { + vmf->page = ZERO_PAGE(vmf->vm_start); + get_page(vmf->page); + return 0; + } + + mutex_lock(&ufile->umap_lock); + if (!ufile->disassociate_page) + ufile->disassociate_page = + alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0); + + if (ufile->disassociate_page) { + /* + * This VMA is forced to always be shared so this doesn't have + * to worry about COW. + */ + vmf->page = ufile->disassociate_page; + get_page(vmf->page); + } else { + ret = VM_FAULT_SIGBUS; + } + mutex_unlock(&ufile->umap_lock); + + return ret; +} + static const struct vm_operations_struct rdma_umap_ops = { .open = rdma_umap_open, .close = rdma_umap_close, + .fault = rdma_umap_fault, }; static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, @@ -889,6 +933,9 @@ static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, struct ib_uverbs_file *ufile = ucontext->ufile; struct rdma_umap_priv *priv; + if (!(vma->vm_flags & VM_SHARED)) + return ERR_PTR(-EINVAL); + if (vma->vm_end - vma->vm_start != size) return ERR_PTR(-EINVAL); @@ -992,7 +1039,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) * at a time to get the lock ordering right. Typically there * will only be one mm, so no big deal. */ - down_write(&mm->mmap_sem); + down_read(&mm->mmap_sem); mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { @@ -1004,10 +1051,9 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); } mutex_unlock(&ufile->umap_lock); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); mmput(mm); } } From bce1a78423961fce676ac65540a31b6ffd179e6d Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 19 Apr 2019 07:39:00 +0000 Subject: [PATCH 074/181] Input: synaptics-rmi4 - fix possible double free The RMI4 function structure has been released in rmi_register_function if error occurs. However, it will be released again in the function rmi_create_function, which may result in a double-free bug. Signed-off-by: Pan Bian Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_driver.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/input/rmi4/rmi_driver.c b/drivers/input/rmi4/rmi_driver.c index fc3ab93b7aea..7fb358f96195 100644 --- a/drivers/input/rmi4/rmi_driver.c +++ b/drivers/input/rmi4/rmi_driver.c @@ -860,7 +860,7 @@ static int rmi_create_function(struct rmi_device *rmi_dev, error = rmi_register_function(fn); if (error) - goto err_put_fn; + return error; if (pdt->function_number == 0x01) data->f01_container = fn; @@ -870,10 +870,6 @@ static int rmi_create_function(struct rmi_device *rmi_dev, list_add_tail(&fn->node, &data->function_list); return RMI_SCAN_CONTINUE; - -err_put_fn: - put_device(&fn->dev); - return error; } void rmi_enable_irq(struct rmi_device *rmi_dev, bool clear_wake) From 05fd5c2c61732152a6bddc318aae62d7e436629b Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 23 Apr 2019 16:39:45 +1000 Subject: [PATCH 075/181] cifs: fix memory leak in SMB2_read Commit 088aaf17aa79300cab14dbee2569c58cfafd7d6e introduced a leak where if SMB2_read() returned an error we would return without freeing the request buffer. Cc: Stable Signed-off-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b8f7262ac354..a37774a55f3a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3466,6 +3466,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->tcon->tid, ses->Suid, io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + cifs_small_buf_release(req); return rc == -ENODATA ? 0 : rc; } else trace_smb3_read_done(xid, req->PersistentFileId, From 652727bbe1b17993636346716ae5867627793647 Mon Sep 17 00:00:00 2001 From: Frank Sorenson Date: Tue, 16 Apr 2019 08:37:27 -0500 Subject: [PATCH 076/181] cifs: do not attempt cifs operation on smb2+ rename error A path-based rename returning EBUSY will incorrectly try opening the file with a cifs (NT Create AndX) operation on an smb2+ mount, which causes the server to force a session close. If the mount is smb2+, skip the fallback. Signed-off-by: Frank Sorenson Signed-off-by: Steve French CC: Stable Reviewed-by: Ronnie Sahlberg --- fs/cifs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 53fdb5df0d2e..538fd7d807e4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1735,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (rc == 0 || rc != -EBUSY) goto do_rename_exit; + /* Don't fall back to using SMB on SMB 2+ mount */ + if (server->vals->protocol_id != 0) + goto do_rename_exit; + /* open-file renames don't work across directories */ if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; From 13f5938d8264b5501368523c4513ff26608a33e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Wed, 10 Apr 2019 15:37:47 -0400 Subject: [PATCH 077/181] cifs: fix page reference leak with readv/writev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CIFS can leak pages reference gotten through GUP (get_user_pages*() through iov_iter_get_pages()). This happen if cifs_send_async_read() or cifs_write_from_iter() calls fail from within __cifs_readv() and __cifs_writev() respectively. This patch move page unreference to cifs_aio_ctx_release() which will happens on all code paths this is all simpler to follow for correctness. Signed-off-by: Jérôme Glisse Cc: Steve French Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: Linus Torvalds Cc: Stable Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/file.c | 15 +-------------- fs/cifs/misc.c | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9c0ccc06d172..7037a137fa53 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2877,7 +2877,6 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct dentry *dentry = ctx->cfile->dentry; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -2941,10 +2940,6 @@ restart_loop: kref_put(&wdata->refcount, cifs_uncached_writedata_release); } - if (!ctx->direct_io) - for (i = 0; i < ctx->npages; i++) - put_page(ctx->bv[i].bv_page); - cifs_stats_bytes_written(tcon, ctx->total_len); set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags); @@ -3582,7 +3577,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx) struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -3666,15 +3660,8 @@ again: kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - if (!ctx->direct_io) { - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - put_page(ctx->bv[i].bv_page); - } - + if (!ctx->direct_io) ctx->total_len = ctx->len - iov_iter_count(to); - } /* mask nodata case */ if (rc == -ENODATA) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1e1626a2cfc3..0dc6f08020ac 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -789,6 +789,11 @@ cifs_aio_ctx_alloc(void) { struct cifs_aio_ctx *ctx; + /* + * Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io + * to false so that we know when we have to unreference pages within + * cifs_aio_ctx_release() + */ ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); if (!ctx) return NULL; @@ -807,7 +812,23 @@ cifs_aio_ctx_release(struct kref *refcount) struct cifs_aio_ctx, refcount); cifsFileInfo_put(ctx->cfile); - kvfree(ctx->bv); + + /* + * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly + * which means that iov_iter_get_pages() was a success and thus that + * we have taken reference on pages. + */ + if (ctx->bv) { + unsigned i; + + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } + kvfree(ctx->bv); + } + kfree(ctx); } From 22e8860cf8f777fbf6a83f2fb7127f682a8e9de4 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Sun, 24 Mar 2019 18:18:56 -0500 Subject: [PATCH 078/181] net: ieee802154: fix missing checks for regmap_update_bits regmap_update_bits could fail and deserves a check. The patch adds the checks and if it fails, returns its error code upstream. Signed-off-by: Kangjie Lu Reviewed-by: Mukesh Ojha Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mcr20a.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ieee802154/mcr20a.c b/drivers/net/ieee802154/mcr20a.c index c589f5ae75bb..8bb53ec8d9cf 100644 --- a/drivers/net/ieee802154/mcr20a.c +++ b/drivers/net/ieee802154/mcr20a.c @@ -533,6 +533,8 @@ mcr20a_start(struct ieee802154_hw *hw) dev_dbg(printdev(lp), "no slotted operation\n"); ret = regmap_update_bits(lp->regmap_dar, DAR_PHY_CTRL1, DAR_PHY_CTRL1_SLOTTED, 0x0); + if (ret < 0) + return ret; /* enable irq */ enable_irq(lp->spi->irq); @@ -540,11 +542,15 @@ mcr20a_start(struct ieee802154_hw *hw) /* Unmask SEQ interrupt */ ret = regmap_update_bits(lp->regmap_dar, DAR_PHY_CTRL2, DAR_PHY_CTRL2_SEQMSK, 0x0); + if (ret < 0) + return ret; /* Start the RX sequence */ dev_dbg(printdev(lp), "start the RX sequence\n"); ret = regmap_update_bits(lp->regmap_dar, DAR_PHY_CTRL1, DAR_PHY_CTRL1_XCVSEQ_MASK, MCR20A_XCVSEQ_RX); + if (ret < 0) + return ret; return 0; } From 82c99f7a81f28f8c1be5f701c8377d14c4075b10 Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Wed, 24 Apr 2019 22:50:33 +0800 Subject: [PATCH 079/181] perf/x86/intel: Update KBL Package C-state events to also include PC8/PC9/PC10 counters Kaby Lake (and Coffee Lake) has PC8/PC9/PC10 residency counters. This patch updates the list of Kaby/Coffee Lake PMU event counters from the snb_cstates[] list of events to the hswult_cstates[] list of events, which keeps all previously supported events and also adds the PKG_C8, PKG_C9 and PKG_C10 residency counters. This allows user space tools to profile them through the perf interface. Signed-off-by: Harry Pan Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: gs0622@gmail.com Link: http://lkml.kernel.org/r/20190424145033.1924-1-harry.pan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 94a4b7fc75d0..d41de9af7a39 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -76,15 +76,15 @@ * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,CNL + * Available model: HSW ULT,KBL,CNL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,CNL + * Available model: HSW ULT,KBL,CNL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT,GLM,CNL + * Available model: HSW ULT,KBL,GLM,CNL * Scope: Package (physical package) * */ @@ -566,8 +566,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, hswult_cstates), X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_MOBILE, cnl_cstates), From 81103355b1e23345dbcdeccad59962a424da4a34 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Tue, 23 Apr 2019 14:02:57 +0200 Subject: [PATCH 080/181] drm/vmwgfx: Fix dma API layer violation Remove the check for IOMMU presence since it was considered a layer violation. This means we have no reliable way to destinguish between coherent hardware IOMMU DMA address translations and incoherent SWIOTLB DMA address translations, which we can't handle. So always presume the former. This means that if anybody forces SWIOTLB without also setting the vmw_force_coherent=1 vmwgfx option, driver operation will fail, like it will on most other graphics drivers. Signed-off-by: Thomas Hellstrom Reviewed-by: Christoph Hellwig --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 33 +++++------------------------ 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 7ef5dcb06104..2d066383d7fd 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -545,30 +545,14 @@ static void vmw_get_initial_size(struct vmw_private *dev_priv) dev_priv->initial_height = height; } -/** - * vmw_assume_iommu - Figure out whether coherent dma-remapping might be - * taking place. - * @dev: Pointer to the struct drm_device. - * - * Return: true if iommu present, false otherwise. - */ -static bool vmw_assume_iommu(struct drm_device *dev) -{ - const struct dma_map_ops *ops = get_dma_ops(dev->dev); - - return !dma_is_direct(ops) && ops && - ops->map_page != dma_direct_map_page; -} - /** * vmw_dma_select_mode - Determine how DMA mappings should be set up for this * system. * * @dev_priv: Pointer to a struct vmw_private * - * This functions tries to determine the IOMMU setup and what actions - * need to be taken by the driver to make system pages visible to the - * device. + * This functions tries to determine what actions need to be taken by the + * driver to make system pages visible to the device. * If this function decides that DMA is not possible, it returns -EINVAL. * The driver may then try to disable features of the device that require * DMA. @@ -578,23 +562,16 @@ static int vmw_dma_select_mode(struct vmw_private *dev_priv) static const char *names[vmw_dma_map_max] = { [vmw_dma_phys] = "Using physical TTM page addresses.", [vmw_dma_alloc_coherent] = "Using coherent TTM pages.", - [vmw_dma_map_populate] = "Keeping DMA mappings.", + [vmw_dma_map_populate] = "Caching DMA mappings.", [vmw_dma_map_bind] = "Giving up DMA mappings early."}; if (vmw_force_coherent) dev_priv->map_mode = vmw_dma_alloc_coherent; - else if (vmw_assume_iommu(dev_priv->dev)) - dev_priv->map_mode = vmw_dma_map_populate; - else if (!vmw_force_iommu) - dev_priv->map_mode = vmw_dma_phys; - else if (IS_ENABLED(CONFIG_SWIOTLB) && swiotlb_nr_tbl()) - dev_priv->map_mode = vmw_dma_alloc_coherent; + else if (vmw_restrict_iommu) + dev_priv->map_mode = vmw_dma_map_bind; else dev_priv->map_mode = vmw_dma_map_populate; - if (dev_priv->map_mode == vmw_dma_map_populate && vmw_restrict_iommu) - dev_priv->map_mode = vmw_dma_map_bind; - /* No TTM coherent page pool? FIXME: Ask TTM instead! */ if (!(IS_ENABLED(CONFIG_SWIOTLB) || IS_ENABLED(CONFIG_INTEL_IOMMU)) && (dev_priv->map_mode == vmw_dma_alloc_coherent)) From 357798909164bf423eac6a78ff7da7e98d2d7f7f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 24 Apr 2019 15:59:33 +0200 Subject: [PATCH 081/181] gpio: Fix gpiochip_add_data_with_key() error path The err_remove_chip block is too coarse, and may perform cleanup that must not be done. E.g. if of_gpiochip_add() fails, of_gpiochip_remove() is still called, causing: OF: ERROR: Bad of_node_put() on /soc/gpio@e6050000 CPU: 1 PID: 20 Comm: kworker/1:1 Not tainted 5.1.0-rc2-koelsch+ #407 Hardware name: Generic R-Car Gen2 (Flattened Device Tree) Workqueue: events deferred_probe_work_func [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x7c/0x9c) [] (dump_stack) from [] (kobject_put+0x94/0xbc) [] (kobject_put) from [] (gpiochip_add_data_with_key+0x8d8/0xa3c) [] (gpiochip_add_data_with_key) from [] (gpio_rcar_probe+0x1d4/0x314) [] (gpio_rcar_probe) from [] (platform_drv_probe+0x48/0x94) and later, if a GPIO consumer tries to use a GPIO from a failed controller: WARNING: CPU: 0 PID: 1 at lib/refcount.c:156 kobject_get+0x38/0x4c refcount_t: increment on 0; use-after-free. Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.1.0-rc2-koelsch+ #407 Hardware name: Generic R-Car Gen2 (Flattened Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x7c/0x9c) [] (dump_stack) from [] (__warn+0xd0/0xec) [] (__warn) from [] (warn_slowpath_fmt+0x44/0x6c) [] (warn_slowpath_fmt) from [] (kobject_get+0x38/0x4c) [] (kobject_get) from [] (of_node_get+0x14/0x1c) [] (of_node_get) from [] (of_find_node_by_phandle+0xc0/0xf0) [] (of_find_node_by_phandle) from [] (of_phandle_iterator_next+0x68/0x154) [] (of_phandle_iterator_next) from [] (__of_parse_phandle_with_args+0x40/0xd0) [] (__of_parse_phandle_with_args) from [] (of_parse_phandle_with_args_map+0x100/0x3ac) [] (of_parse_phandle_with_args_map) from [] (of_get_named_gpiod_flags+0x38/0x380) [] (of_get_named_gpiod_flags) from [] (gpiod_get_from_of_node+0x24/0xd8) [] (gpiod_get_from_of_node) from [] (devm_fwnode_get_index_gpiod_from_child+0xa0/0x144) [] (devm_fwnode_get_index_gpiod_from_child) from [] (gpio_keys_probe+0x418/0x7bc) [] (gpio_keys_probe) from [] (platform_drv_probe+0x48/0x94) Fix this by splitting the cleanup block, and adding a missing call to gpiochip_irqchip_remove(). Fixes: 28355f81969962cf ("gpio: defer probe if pinctrl cannot be found") Signed-off-by: Geert Uytterhoeven Reviewed-by: Mukesh Ojha Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 0495bf1d480a..bca3e7740ef6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1379,7 +1379,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, status = gpiochip_add_irqchip(chip, lock_key, request_key); if (status) - goto err_remove_chip; + goto err_free_gpiochip_mask; status = of_gpiochip_add(chip); if (status) @@ -1387,7 +1387,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, status = gpiochip_init_valid_mask(chip); if (status) - goto err_remove_chip; + goto err_remove_of_chip; for (i = 0; i < chip->ngpio; i++) { struct gpio_desc *desc = &gdev->descs[i]; @@ -1415,14 +1415,18 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, if (gpiolib_initialized) { status = gpiochip_setup_dev(gdev); if (status) - goto err_remove_chip; + goto err_remove_acpi_chip; } return 0; -err_remove_chip: +err_remove_acpi_chip: acpi_gpiochip_remove(chip); +err_remove_of_chip: gpiochip_free_hogs(chip); of_gpiochip_remove(chip); +err_remove_chip: + gpiochip_irqchip_remove(chip); +err_free_gpiochip_mask: gpiochip_free_valid_mask(chip); err_remove_irqchip_mask: gpiochip_irqchip_free_valid_mask(chip); From c4cba44eeecab9d5ccd3dd2d5520a7d1e5be544f Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Sun, 21 Apr 2019 08:25:50 +0000 Subject: [PATCH 082/181] drm/bridge: dw-hdmi: fix SCDC configuration for ddc-i2c-bus When ddc-i2c-bus property is used, a NULL pointer dereference is reported: [ 31.041669] Unable to handle kernel NULL pointer dereference at virtual address 00000008 [ 31.041671] pgd = 4d3c16f6 [ 31.041673] [00000008] *pgd=00000000 [ 31.041678] Internal error: Oops: 5 [#1] SMP ARM [ 31.041711] Hardware name: Rockchip (Device Tree) [ 31.041718] PC is at i2c_transfer+0x8/0xe4 [ 31.041721] LR is at drm_scdc_read+0x54/0x84 [ 31.041723] pc : [] lr : [] psr: 280f0013 [ 31.041725] sp : edffdad0 ip : 5ccb5511 fp : 00000058 [ 31.041727] r10: 00000780 r9 : edf91608 r8 : c11b0f48 [ 31.041728] r7 : 00000438 r6 : 00000000 r5 : 00000000 r4 : 00000000 [ 31.041730] r3 : edffdae7 r2 : 00000002 r1 : edffdaec r0 : 00000000 [ 31.041908] [] (i2c_transfer) from [] (drm_scdc_read+0x54/0x84) [ 31.041913] [] (drm_scdc_read) from [] (drm_scdc_set_scrambling+0x30/0xbc) [ 31.041919] [] (drm_scdc_set_scrambling) from [] (dw_hdmi_update_power+0x1440/0x1610) [ 31.041926] [] (dw_hdmi_update_power) from [] (dw_hdmi_bridge_enable+0x2c/0x70) [ 31.041932] [] (dw_hdmi_bridge_enable) from [] (drm_bridge_enable+0x24/0x34) [ 31.041938] [] (drm_bridge_enable) from [] (drm_atomic_helper_commit_modeset_enables+0x114/0x220) [ 31.041943] [] (drm_atomic_helper_commit_modeset_enables) from [] (rockchip_atomic_helper_commit_tail_rpm+0x28/0x64) hdmi->i2c may not be set when ddc-i2c-bus property is used in device tree. Fix this by using hdmi->ddc as the i2c adapter when calling drm_scdc_*(). Also report that SCDC is not supported when there is no DDC bus. Fixes: 264fce6cc2c1 ("drm/bridge: dw-hdmi: Add SCDC and TMDS Scrambling support") Signed-off-by: Jonas Karlman Reviewed-by: Heiko Stuebner Reviewed-by: Neil Armstrong Reviewed-by: Laurent Pinchart Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/VE1PR03MB59031814B5BCAB2152923BDAAC210@VE1PR03MB5903.eurprd03.prod.outlook.com --- drivers/gpu/drm/bridge/synopsys/dw-hdmi.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c index b74ccb6a24c2..ab7968c8f6a2 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c @@ -1046,6 +1046,10 @@ static bool dw_hdmi_support_scdc(struct dw_hdmi *hdmi) if (hdmi->version < 0x200a) return false; + /* Disable if no DDC bus */ + if (!hdmi->ddc) + return false; + /* Disable if SCDC is not supported, or if an HF-VSDB block is absent */ if (!display->hdmi.scdc.supported || !display->hdmi.scdc.scrambling.supported) @@ -1684,13 +1688,13 @@ static void hdmi_av_composer(struct dw_hdmi *hdmi, * Source Devices compliant shall set the * Source Version = 1. */ - drm_scdc_readb(&hdmi->i2c->adap, SCDC_SINK_VERSION, + drm_scdc_readb(hdmi->ddc, SCDC_SINK_VERSION, &bytes); - drm_scdc_writeb(&hdmi->i2c->adap, SCDC_SOURCE_VERSION, + drm_scdc_writeb(hdmi->ddc, SCDC_SOURCE_VERSION, min_t(u8, bytes, SCDC_MIN_SOURCE_VERSION)); /* Enabled Scrambling in the Sink */ - drm_scdc_set_scrambling(&hdmi->i2c->adap, 1); + drm_scdc_set_scrambling(hdmi->ddc, 1); /* * To activate the scrambler feature, you must ensure @@ -1706,7 +1710,7 @@ static void hdmi_av_composer(struct dw_hdmi *hdmi, hdmi_writeb(hdmi, 0, HDMI_FC_SCRAMBLER_CTRL); hdmi_writeb(hdmi, (u8)~HDMI_MC_SWRSTZ_TMDSSWRST_REQ, HDMI_MC_SWRSTZ); - drm_scdc_set_scrambling(&hdmi->i2c->adap, 0); + drm_scdc_set_scrambling(hdmi->ddc, 0); } } From c409ca3be3c6ff3a1eeb303b191184e80d412862 Mon Sep 17 00:00:00 2001 From: Malte Leip Date: Sun, 14 Apr 2019 12:00:12 +0200 Subject: [PATCH 083/181] usb: usbip: fix isoc packet num validation in get_pipe Change the validation of number_of_packets in get_pipe to compare the number of packets to a fixed maximum number of packets allowed, set to be 1024. This number was chosen due to it being used by other drivers as well, for example drivers/usb/host/uhci-q.c Background/reason: The get_pipe function in stub_rx.c validates the number of packets in isochronous mode and aborts with an error if that number is too large, in order to prevent malicious input from possibly triggering large memory allocations. This was previously done by checking whether pdu->u.cmd_submit.number_of_packets is bigger than the number of packets that would be needed for pdu->u.cmd_submit.transfer_buffer_length bytes if all except possibly the last packet had maximum length, given by usb_endpoint_maxp(epd) * usb_endpoint_maxp_mult(epd). This leads to an error if URBs with packets shorter than the maximum possible length are submitted, which is allowed according to Documentation/driver-api/usb/URB.rst and occurs for example with the snd-usb-audio driver. Fixes: c6688ef9f297 ("usbip: fix stub_rx: harden CMD_SUBMIT path to handle malicious input") Signed-off-by: Malte Leip Cc: stable Acked-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_rx.c | 12 +++--------- drivers/usb/usbip/usbip_common.h | 7 +++++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 97b09a42a10c..dbfb2f24d71e 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -361,16 +361,10 @@ static int get_pipe(struct stub_device *sdev, struct usbip_header *pdu) } if (usb_endpoint_xfer_isoc(epd)) { - /* validate packet size and number of packets */ - unsigned int maxp, packets, bytes; - - maxp = usb_endpoint_maxp(epd); - maxp *= usb_endpoint_maxp_mult(epd); - bytes = pdu->u.cmd_submit.transfer_buffer_length; - packets = DIV_ROUND_UP(bytes, maxp); - + /* validate number of packets */ if (pdu->u.cmd_submit.number_of_packets < 0 || - pdu->u.cmd_submit.number_of_packets > packets) { + pdu->u.cmd_submit.number_of_packets > + USBIP_MAX_ISO_PACKETS) { dev_err(&sdev->udev->dev, "CMD_SUBMIT: isoc invalid num packets %d\n", pdu->u.cmd_submit.number_of_packets); diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h index bf8afe9b5883..8be857a4fa13 100644 --- a/drivers/usb/usbip/usbip_common.h +++ b/drivers/usb/usbip/usbip_common.h @@ -121,6 +121,13 @@ extern struct device_attribute dev_attr_usbip_debug; #define USBIP_DIR_OUT 0x00 #define USBIP_DIR_IN 0x01 +/* + * Arbitrary limit for the maximum number of isochronous packets in an URB, + * compare for example the uhci_submit_isochronous function in + * drivers/usb/host/uhci-q.c + */ +#define USBIP_MAX_ISO_PACKETS 1024 + /** * struct usbip_header_basic - data pertinent to every request * @command: the usbip request type From ef61eb43ada6c1d6b94668f0f514e4c268093ff3 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 23 Apr 2019 14:48:29 -0400 Subject: [PATCH 084/181] USB: yurex: Fix protection fault after device removal The syzkaller USB fuzzer found a general-protection-fault bug in the yurex driver. The fault occurs when a device has been unplugged; the driver's interrupt-URB handler logs an error message referring to the device by name, after the device has been unregistered and its name deallocated. This problem is caused by the fact that the interrupt URB isn't cancelled until the driver's private data structure is released, which can happen long after the device is gone. The cure is to make sure that the interrupt URB is killed before yurex_disconnect() returns; this is exactly the sort of thing that usb_poison_urb() was meant for. Signed-off-by: Alan Stern Reported-and-tested-by: syzbot+2eb9121678bdb36e6d57@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/yurex.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/misc/yurex.c b/drivers/usb/misc/yurex.c index 6d9fd5f64903..7b306aa22d25 100644 --- a/drivers/usb/misc/yurex.c +++ b/drivers/usb/misc/yurex.c @@ -314,6 +314,7 @@ static void yurex_disconnect(struct usb_interface *interface) usb_deregister_dev(interface, &yurex_class); /* prevent more I/O from starting */ + usb_poison_urb(dev->urb); mutex_lock(&dev->io_mutex); dev->interface = NULL; mutex_unlock(&dev->io_mutex); From c114944d7d67f24e71562fcfc18d550ab787e4d4 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 22 Apr 2019 11:16:04 -0400 Subject: [PATCH 085/181] USB: w1 ds2490: Fix bug caused by improper use of altsetting array The syzkaller USB fuzzer spotted a slab-out-of-bounds bug in the ds2490 driver. This bug is caused by improper use of the altsetting array in the usb_interface structure (the array's entries are not always stored in numerical order), combined with a naive assumption that all interfaces probed by the driver will have the expected number of altsettings. The bug can be fixed by replacing references to the possibly non-existent intf->altsetting[alt] entry with the guaranteed-to-exist intf->cur_altsetting entry. Signed-off-by: Alan Stern Reported-and-tested-by: syzbot+d65f673b847a1a96cdba@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- drivers/w1/masters/ds2490.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/w1/masters/ds2490.c b/drivers/w1/masters/ds2490.c index 0f4ecfcdb549..a9fb77585272 100644 --- a/drivers/w1/masters/ds2490.c +++ b/drivers/w1/masters/ds2490.c @@ -1016,15 +1016,15 @@ static int ds_probe(struct usb_interface *intf, /* alternative 3, 1ms interrupt (greatly speeds search), 64 byte bulk */ alt = 3; err = usb_set_interface(dev->udev, - intf->altsetting[alt].desc.bInterfaceNumber, alt); + intf->cur_altsetting->desc.bInterfaceNumber, alt); if (err) { dev_err(&dev->udev->dev, "Failed to set alternative setting %d " "for %d interface: err=%d.\n", alt, - intf->altsetting[alt].desc.bInterfaceNumber, err); + intf->cur_altsetting->desc.bInterfaceNumber, err); goto err_out_clear; } - iface_desc = &intf->altsetting[alt]; + iface_desc = intf->cur_altsetting; if (iface_desc->desc.bNumEndpoints != NUM_EP-1) { pr_info("Num endpoints=%d. It is not DS9490R.\n", iface_desc->desc.bNumEndpoints); From b82d6c1f8f8288f744a9dcc16cd3085d535decca Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 3 Apr 2019 21:01:06 -0700 Subject: [PATCH 086/181] mwifiex: Make resume actually do something useful again on SDIO cards The commit fc3a2fcaa1ba ("mwifiex: use atomic bitops to represent adapter status variables") had a fairly straightforward bug in it. It contained this bit of diff: - if (!adapter->is_suspended) { + if (test_bit(MWIFIEX_IS_SUSPENDED, &adapter->work_flags)) { As you can see the patch missed the "!" when converting to the atomic bitops. This meant that the resume hasn't done anything at all since that commit landed and suspend/resume for mwifiex SDIO cards has been totally broken. After fixing this mwifiex suspend/resume appears to work again, at least with the simple testing I've done. Fixes: fc3a2fcaa1ba ("mwifiex: use atomic bitops to represent adapter status variables") Cc: Signed-off-by: Douglas Anderson Reviewed-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/sdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/marvell/mwifiex/sdio.c b/drivers/net/wireless/marvell/mwifiex/sdio.c index a85648342d15..d5a70340a945 100644 --- a/drivers/net/wireless/marvell/mwifiex/sdio.c +++ b/drivers/net/wireless/marvell/mwifiex/sdio.c @@ -181,7 +181,7 @@ static int mwifiex_sdio_resume(struct device *dev) adapter = card->adapter; - if (test_bit(MWIFIEX_IS_SUSPENDED, &adapter->work_flags)) { + if (!test_bit(MWIFIEX_IS_SUSPENDED, &adapter->work_flags)) { mwifiex_dbg(adapter, WARN, "device already resumed\n"); return 0; From a3d46aea46f99d134b4e0726e4826b824c3e5980 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 1 Apr 2019 11:29:58 +0300 Subject: [PATCH 087/181] btrfs: Switch memory allocations in async csum calculation path to kvmalloc Recent multi-page biovec rework allowed creation of bios that can span large regions - up to 128 megabytes in the case of btrfs. OTOH btrfs' submission path currently allocates a contiguous array to store the checksums for every bio submitted. This means we can request up to (128mb / BTRFS_SECTOR_SIZE) * 4 bytes + 32bytes of memory from kmalloc. On busy systems with possibly fragmented memory said kmalloc can fail which will trigger BUG_ON due to improper error handling IO submission context in btrfs. Until error handling is improved or bios in btrfs limited to a more manageable size (e.g. 1m) let's use kvmalloc to fallback to vmalloc for such large allocations. There is no hard requirement that the memory allocated for checksums during IO submission has to be contiguous, but this is a simple fix that does not require several non-contiguous allocations. For small writes this is unlikely to have any visible effect since kmalloc will still satisfy allocation requests as usual. For larger requests the code will just fallback to vmalloc. We've performed evaluation on several workload types and there was no significant difference kmalloc vs kvmalloc. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 15 +++++++++++---- fs/btrfs/ordered-data.c | 3 ++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 920bf3b4b0ef..cccc75d15970 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -427,9 +428,13 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, unsigned long this_sum_bytes = 0; int i; u64 offset; + unsigned nofs_flag; + + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), + GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); - sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), - GFP_NOFS); if (!sums) return BLK_STS_RESOURCE; @@ -472,8 +477,10 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, bytes_left = bio->bi_iter.bi_size - total_bytes; - sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left), - GFP_NOFS); + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, + bytes_left), GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); BUG_ON(!sums); /* -ENOMEM */ sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6fde2b2741ef..45e3cfd1198b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" @@ -442,7 +443,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); list_del(&sum->list); - kfree(sum); + kvfree(sum); } kmem_cache_free(btrfs_ordered_extent_cache, entry); } From 2557fabd6e29f349bfa0ac13f38ac98aa5eafc74 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Tue, 23 Apr 2019 17:30:26 +0800 Subject: [PATCH 088/181] RDMA/hns: Bugfix for mapping user db When the maximum send wr delivered by the user is zero, the qp does not have a sq. When allocating the sq db buffer to store the user sq pi pointer and map it to the kernel mode, max_send_wr is used as the trigger condition, while the kernel does not consider the max_send_wr trigger condition when mapmping db. It will cause sq record doorbell map fail and create qp fail. The failed print information as follows: hns3 0000:7d:00.1: Send cmd: tail - 418, opcode - 0x8504, flag - 0x0011, retval - 0x0000 hns3 0000:7d:00.1: Send cmd: 0xe59dc000 0x00000000 0x00000000 0x00000000 0x00000116 0x0000ffff hns3 0000:7d:00.1: sq record doorbell map failed! hns3 0000:7d:00.1: Create RC QP failed Fixes: 0425e3e6e0c7 ("RDMA/hns: Support flush cqe for hip08 in kernel space") Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 66cdf625534f..60cf9f03e941 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -533,7 +533,7 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, static int hns_roce_qp_has_sq(struct ib_qp_init_attr *attr) { - if (attr->qp_type == IB_QPT_XRC_TGT) + if (attr->qp_type == IB_QPT_XRC_TGT || !attr->cap.max_send_wr) return 0; return 1; From a860fa7b96e1a1c974556327aa1aee852d434c21 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Sat, 20 Apr 2019 16:34:16 +0800 Subject: [PATCH 089/181] sched/numa: Fix a possible divide-by-zero sched_clock_cpu() may not be consistent between CPUs. If a task migrates to another CPU, then se.exec_start is set to that CPU's rq_clock_task() by update_stats_curr_start(). Specifically, the new value might be before the old value due to clock skew. So then if in numa_get_avg_runtime() the expression: 'now - p->last_task_numa_placement' ends up as -1, then the divider '*period + 1' in task_numa_placement() is 0 and things go bang. Similar to update_curr(), check if time goes backwards to avoid this. [ peterz: Wrote new changelog. ] [ mingo: Tweaked the code comment. ] Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cj.chengjian@huawei.com Cc: Link: http://lkml.kernel.org/r/20190425080016.GX11158@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a4d9e14bf138..35f3ea375084 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid time going backwards, prevent potential divide error: */ + if (unlikely((s64)*period < 0)) + *period = 0; } else { delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; From 9a8f612ca0d6a436e6471c9bed516d34a2cc626f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 8 Apr 2019 10:31:45 +0200 Subject: [PATCH 090/181] mtd: rawnand: marvell: Clean the controller state before each operation Since the migration of the driver to stop using the legacy ->select_chip() hook, there is nothing deselecting the target anymore, thus the selection is not forced at the next access. Ensure the ND_RUN bit and the interrupts are always in a clean state. Cc: Daniel Mack Cc: stable@vger.kernel.org Fixes: b25251414f6e00 ("mtd: rawnand: marvell: Stop implementing ->select_chip()") Suggested-by: Boris Brezillon Signed-off-by: Miquel Raynal Tested-by: Daniel Mack Reviewed-by: Boris Brezillon Signed-off-by: Richard Weinberger --- drivers/mtd/nand/raw/marvell_nand.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/nand/raw/marvell_nand.c b/drivers/mtd/nand/raw/marvell_nand.c index f38e5c1b87e4..d984538980e2 100644 --- a/drivers/mtd/nand/raw/marvell_nand.c +++ b/drivers/mtd/nand/raw/marvell_nand.c @@ -722,12 +722,6 @@ static void marvell_nfc_select_target(struct nand_chip *chip, struct marvell_nfc *nfc = to_marvell_nfc(chip->controller); u32 ndcr_generic; - if (chip == nfc->selected_chip && die_nr == marvell_nand->selected_die) - return; - - writel_relaxed(marvell_nand->ndtr0, nfc->regs + NDTR0); - writel_relaxed(marvell_nand->ndtr1, nfc->regs + NDTR1); - /* * Reset the NDCR register to a clean state for this particular chip, * also clear ND_RUN bit. @@ -739,6 +733,12 @@ static void marvell_nfc_select_target(struct nand_chip *chip, /* Also reset the interrupt status register */ marvell_nfc_clear_int(nfc, NDCR_ALL_INT); + if (chip == nfc->selected_chip && die_nr == marvell_nand->selected_die) + return; + + writel_relaxed(marvell_nand->ndtr0, nfc->regs + NDTR0); + writel_relaxed(marvell_nand->ndtr1, nfc->regs + NDTR1); + nfc->selected_chip = chip; marvell_nand->selected_die = die_nr; } From 349ced9984ff540ce74ca8a0b2e9b03dc434b9dd Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Wed, 24 Apr 2019 00:16:10 -0700 Subject: [PATCH 091/181] power: supply: sysfs: prevent endless uevent loop with CONFIG_POWER_SUPPLY_DEBUG Fix a similar endless event loop as was done in commit 8dcf32175b4e ("i2c: prevent endless uevent loop with CONFIG_I2C_DEBUG_CORE"): The culprit is the dev_dbg printk in the i2c uevent handler. If this is activated (for instance by CONFIG_I2C_DEBUG_CORE) it results in an endless loop with systemd-journald. This happens if user-space scans the system log and reads the uevent file to get information about a newly created device, which seems fair use to me. Unfortunately reading the "uevent" file uses the same function that runs for creating the uevent for a new device, generating the next syslog entry Both CONFIG_I2C_DEBUG_CORE and CONFIG_POWER_SUPPLY_DEBUG were reported in https://bugs.freedesktop.org/show_bug.cgi?id=76886 but only former seems to have been fixed. Drop debug prints as it was done in I2C subsystem to resolve the issue. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: linux-pm@vger.kernel.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/power_supply_sysfs.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index dce24f596160..5358a80d854f 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -383,15 +383,11 @@ int power_supply_uevent(struct device *dev, struct kobj_uevent_env *env) char *prop_buf; char *attrname; - dev_dbg(dev, "uevent\n"); - if (!psy || !psy->desc) { dev_dbg(dev, "No power supply yet\n"); return ret; } - dev_dbg(dev, "POWER_SUPPLY_NAME=%s\n", psy->desc->name); - ret = add_uevent_var(env, "POWER_SUPPLY_NAME=%s", psy->desc->name); if (ret) return ret; @@ -427,8 +423,6 @@ int power_supply_uevent(struct device *dev, struct kobj_uevent_env *env) goto out; } - dev_dbg(dev, "prop %s=%s\n", attrname, prop_buf); - ret = add_uevent_var(env, "POWER_SUPPLY_%s=%s", attrname, prop_buf); kfree(attrname); if (ret) From 4ee0776760af03f181e6b80baf5fb1cc1a980f50 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 24 Apr 2019 09:32:55 -0700 Subject: [PATCH 092/181] selftests/seccomp: Prepare for exclusive seccomp flags Some seccomp flags will become exclusive, so the selftest needs to be adjusted to mask those out and test them individually for the "all flags" tests. Cc: stable@vger.kernel.org # v5.0+ Signed-off-by: Kees Cook Reviewed-by: Tycho Andersen Acked-by: James Morris --- tools/testing/selftests/seccomp/seccomp_bpf.c | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index f69d2ee29742..5019cdae5d0b 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -2166,11 +2166,14 @@ TEST(detect_seccomp_filter_flags) SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_SPEC_ALLOW, SECCOMP_FILTER_FLAG_NEW_LISTENER }; - unsigned int flag, all_flags; + unsigned int exclusive[] = { + SECCOMP_FILTER_FLAG_TSYNC, + SECCOMP_FILTER_FLAG_NEW_LISTENER }; + unsigned int flag, all_flags, exclusive_mask; int i; long ret; - /* Test detection of known-good filter flags */ + /* Test detection of individual known-good filter flags */ for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) { int bits = 0; @@ -2197,16 +2200,29 @@ TEST(detect_seccomp_filter_flags) all_flags |= flag; } - /* Test detection of all known-good filter flags */ - ret = seccomp(SECCOMP_SET_MODE_FILTER, all_flags, NULL); - EXPECT_EQ(-1, ret); - EXPECT_EQ(EFAULT, errno) { - TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!", - all_flags); + /* + * Test detection of all known-good filter flags combined. But + * for the exclusive flags we need to mask them out and try them + * individually for the "all flags" testing. + */ + exclusive_mask = 0; + for (i = 0; i < ARRAY_SIZE(exclusive); i++) + exclusive_mask |= exclusive[i]; + for (i = 0; i < ARRAY_SIZE(exclusive); i++) { + flag = all_flags & ~exclusive_mask; + flag |= exclusive[i]; + + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!", + flag); + } } - /* Test detection of an unknown filter flag */ + /* Test detection of an unknown filter flags, without exclusives. */ flag = -1; + flag &= ~exclusive_mask; ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno) { From 7a0df7fbc14505e2e2be19ed08654a09e1ed5bf6 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 6 Mar 2019 13:14:13 -0700 Subject: [PATCH 093/181] seccomp: Make NEW_LISTENER and TSYNC flags exclusive As the comment notes, the return codes for TSYNC and NEW_LISTENER conflict, because they both return positive values, one in the case of success and one in the case of error. So, let's disallow both of these flags together. While this is technically a userspace break, all the users I know of are still waiting on me to land this feature in libseccomp, so I think it'll be safe. Also, at present my use case doesn't require TSYNC at all, so this isn't a big deal to disallow. If someone wanted to support this, a path forward would be to add a new flag like TSYNC_AND_LISTENER_YES_I_UNDERSTAND_THAT_TSYNC_WILL_JUST_RETURN_EAGAIN, but the use cases are so different I don't see it really happening. Finally, it's worth noting that this does actually fix a UAF issue: at the end of seccomp_set_mode_filter(), we have: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { if (ret < 0) { listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); } else { fd_install(listener, listener_f); ret = listener; } } out_free: seccomp_filter_free(prepared); But if ret > 0 because TSYNC raced, we'll install the listener fd and then free the filter out from underneath it, causing a UAF when the task closes it or dies. This patch also switches the condition to be simply if (ret), so that if someone does add the flag mentioned above, they won't have to remember to fix this too. Reported-by: syzbot+b562969adb2e04af3442@syzkaller.appspotmail.com Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") CC: stable@vger.kernel.org # v5.0+ Signed-off-by: Tycho Andersen Signed-off-by: Kees Cook Acked-by: James Morris --- kernel/seccomp.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..4b51d9cbcf06 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -502,7 +502,10 @@ out: * * Caller must be holding current->sighand->siglock lock. * - * Returns 0 on success, -ve on error. + * Returns 0 on success, -ve on error, or + * - in TSYNC mode: the pid of a thread which was either not in the correct + * seccomp mode or did not have an ancestral seccomp filter + * - in NEW_LISTENER mode: the fd of the new listener */ static long seccomp_attach_filter(unsigned int flags, struct seccomp_filter *filter) @@ -1258,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags, if (flags & ~SECCOMP_FILTER_FLAG_MASK) return -EINVAL; + /* + * In the successful case, NEW_LISTENER returns the new listener fd. + * But in the failure case, TSYNC returns the thread that died. If you + * combine these two flags, there's no way to tell whether something + * succeeded or failed. So, let's disallow this combination. + */ + if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && + (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) + return -EINVAL; + /* Prepare the new filter before holding any locks. */ prepared = seccomp_prepare_user_filter(filter); if (IS_ERR(prepared)) @@ -1304,7 +1317,7 @@ out: mutex_unlock(¤t->signal->cred_guard_mutex); out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { - if (ret < 0) { + if (ret) { listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); From ecfc3fcabbb5291d1e61600a3dac6cdbfdb04cb1 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 10 Apr 2019 21:49:23 +0800 Subject: [PATCH 094/181] MIPS: eBPF: Make ebpf_to_mips_reg() static Fix sparse warning: arch/mips/net/ebpf_jit.c:196:5: warning: symbol 'ebpf_to_mips_reg' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/mips/net/ebpf_jit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 0effd3cba9a7..98bf0c222b5f 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -186,8 +186,9 @@ enum which_ebpf_reg { * separate frame pointer, so BPF_REG_10 relative accesses are * adjusted to be $sp relative. */ -int ebpf_to_mips_reg(struct jit_ctx *ctx, const struct bpf_insn *insn, - enum which_ebpf_reg w) +static int ebpf_to_mips_reg(struct jit_ctx *ctx, + const struct bpf_insn *insn, + enum which_ebpf_reg w) { int ebpf_reg = (w == src_reg || w == src_reg_no_fp) ? insn->src_reg : insn->dst_reg; From 8694d8c1f82cccec9380e0d3720b84eee315dfb7 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Fri, 12 Apr 2019 14:40:50 +0200 Subject: [PATCH 095/181] tools: bpftool: fix infinite loop in map create "bpftool map create" has an infinite loop on "while (argc)". The error case is missing. Symptoms: when forgetting to type the keyword 'type' in front of 'hash': $ sudo bpftool map create /sys/fs/bpf/dir/foobar hash key 8 value 8 entries 128 (infinite loop, taking all the CPU) ^C After the patch: $ sudo bpftool map create /sys/fs/bpf/dir/foobar hash key 8 value 8 entries 128 Error: unknown arg hash Fixes: 0b592b5a01be ("tools: bpftool: add map create command") Signed-off-by: Alban Crequy Reviewed-by: Quentin Monnet Acked-by: Song Liu Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/map.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index e0c650d91784..994a7e0d16fb 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1151,6 +1151,9 @@ static int do_create(int argc, char **argv) return -1; } NEXT_ARG(); + } else { + p_err("unknown arg %s", *argv); + return -1; } } From 39391377f8ecf2fa4569e2fede624dc091bcd859 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Sat, 13 Apr 2019 03:37:32 +0200 Subject: [PATCH 096/181] libbpf: add binary to gitignore Some binaries are generated when building libbpf from tools/lib/bpf/, namely libbpf.so.0.0.2 and libbpf.so.0. Add them to the local .gitignore. Signed-off-by: Matteo Croce Reviewed-by: Jakub Kicinski Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/.gitignore b/tools/lib/bpf/.gitignore index 4db74758c674..fecb78afea3f 100644 --- a/tools/lib/bpf/.gitignore +++ b/tools/lib/bpf/.gitignore @@ -1,3 +1,4 @@ libbpf_version.h FEATURE-DUMP.libbpf test_libbpf +libbpf.so.* From c6a9efa1d8353d8960d152e7d469d952b01495c0 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 24 Apr 2019 21:50:42 +0200 Subject: [PATCH 097/181] bpf: mark registers in all frames after pkt/null checks In case of a null check on a pointer inside a subprog, we should mark all registers with this pointer as either safe or unknown, in both the current and previous frames. Currently, only spilled registers and registers in the current frame are marked. Packet bound checks in subprogs have the same issue. This patch fixes it to mark registers in previous frames as well. A good reproducer for null checks looks as follow: 1: ptr = bpf_map_lookup_elem(map, &key); 2: ret = subprog(ptr) { 3: return ptr != NULL; 4: } 5: if (ret) 6: value = *ptr; With the above, the verifier will complain on line 6 because it sees ptr as map_value_or_null despite the null check in subprog 1. Note that this patch fixes another resulting bug when using bpf_sk_release(): 1: sk = bpf_sk_lookup_tcp(...); 2: subprog(sk) { 3: if (sk) 4: bpf_sk_release(sk); 5: } 6: if (!sk) 7: return 0; 8: return 1; In the above, mark_ptr_or_null_regs will warn on line 6 because it will try to free the reference state, even though it was already freed on line 3. Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Paul Chaignon Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 76 ++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6c5a41f7f338..09d5d972c9ff 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4138,15 +4138,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static void __find_good_pkt_pointers(struct bpf_func_state *state, + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type, u16 new_range) +{ + struct bpf_reg_state *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + reg = &state->regs[i]; + if (reg->type == type && reg->id == dst_reg->id) + /* keep the maximum range already checked */ + reg->range = max(reg->range, new_range); + } + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } +} + static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i, j; + int i; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -4211,20 +4231,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. */ - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == type && regs[i].id == dst_reg->id) - /* keep the maximum range already checked */ - regs[i].range = max(regs[i].range, new_range); - - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } - } + for (i = 0; i <= vstate->curframe; i++) + __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, + new_range); } /* compute branch direction of the expression "if (reg opcode val) goto target;" @@ -4698,6 +4707,22 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } +static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, + bool is_null) +{ + struct bpf_reg_state *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + mark_ptr_or_null_reg(state, reg, id, is_null); + } +} + /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ @@ -4705,10 +4730,10 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *regs = state->regs; u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i, j; + int i; if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. @@ -4717,17 +4742,8 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, */ WARN_ON_ONCE(release_reference_state(state, id)); - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, ®s[i], id, is_null); - - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } - } + for (i = 0; i <= vstate->curframe; i++) + __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, From 6dd7f14080473b655c247863e61b7c34424f0c83 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 24 Apr 2019 21:51:26 +0200 Subject: [PATCH 098/181] selftests/bpf: test cases for pkt/null checks in subprogs The first test case, for pointer null checks, is equivalent to the following pseudo-code. It checks that the verifier does not complain on line 6 and recognizes that ptr isn't null. 1: ptr = bpf_map_lookup_elem(map, &key); 2: ret = subprog(ptr) { 3: return ptr != NULL; 4: } 5: if (ret) 6: value = *ptr; The second test case, for packet bound checks, is equivalent to the following pseudo-code. It checks that the verifier does not complain on line 7 and recognizes that the packet is at least 1 byte long. 1: pkt_end = ctx.pkt_end; 2: ptr = ctx.pkt + 8; 3: ret = subprog(ptr, pkt_end) { 4: return ptr <= pkt_end; 5: } 6: if (ret) 7: value = *(u8 *)ctx.pkt; Signed-off-by: Paul Chaignon Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/calls.c | 25 +++++++++++++++++++ .../bpf/verifier/direct_packet_access.c | 22 ++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index fb11240b758b..9093a8f64dc6 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -374,6 +374,31 @@ .prog_type = BPF_PROG_TYPE_XDP, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, +{ + "calls: ptr null check in subprog", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .fixup_map_hash_48b = { 3 }, + .result_unpriv = REJECT, + .result = ACCEPT, + .retval = 0, +}, { "calls: two calls with args", .insns = { diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c index e3fc22e672c2..d5c596fdc4b9 100644 --- a/tools/testing/selftests/bpf/verifier/direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c @@ -631,3 +631,25 @@ .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, +{ + "direct packet access: test29 (reg > pkt_end in subprog)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, +}, From 0edd6b64d1939e9e9168ff27947995bb7751db5d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Apr 2019 21:55:59 +0200 Subject: [PATCH 099/181] bpf: Fix preempt_enable_no_resched() abuse Unless the very next line is schedule(), or implies it, one must not use preempt_enable_no_resched(). It can cause a preemption to go missing and thereby cause arbitrary delays, breaking the PREEMPT=y invariant. Cc: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f02367faa58d..944ccc310201 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -510,7 +510,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } \ _out: \ rcu_read_unlock(); \ - preempt_enable_no_resched(); \ + preempt_enable(); \ _ret; \ }) From 5bb5c3a3ac102158b799bf5eda871223aa5e9c25 Mon Sep 17 00:00:00 2001 From: Shun-Chih Yu Date: Thu, 25 Apr 2019 11:53:50 +0800 Subject: [PATCH 100/181] dmaengine: mediatek-cqdma: fix wrong register usage in mtk_cqdma_start This patch fixes wrong register usage in the mtk_cqdma_start. The destination register should be MTK_CQDMA_DST2 instead. Fixes: b1f01e48df5a ("dmaengine: mediatek: Add MediaTek Command-Queue DMA controller for MT6765 SoC") Signed-off-by: Shun-Chih Yu Cc: stable@vger.kernel.org Signed-off-by: Vinod Koul --- drivers/dma/mediatek/mtk-cqdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/mediatek/mtk-cqdma.c b/drivers/dma/mediatek/mtk-cqdma.c index 131f3974740d..814853842e29 100644 --- a/drivers/dma/mediatek/mtk-cqdma.c +++ b/drivers/dma/mediatek/mtk-cqdma.c @@ -253,7 +253,7 @@ static void mtk_cqdma_start(struct mtk_cqdma_pchan *pc, #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT mtk_dma_set(pc, MTK_CQDMA_DST2, cvd->dest >> MTK_CQDMA_ADDR2_SHFIT); #else - mtk_dma_set(pc, MTK_CQDMA_SRC2, 0); + mtk_dma_set(pc, MTK_CQDMA_DST2, 0); #endif /* setup the length */ From c1c477217882c610a2ba0268f5faf36c9c092528 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Apr 2019 09:43:26 -0700 Subject: [PATCH 101/181] l2tp: use rcu_dereference_sk_user_data() in l2tp_udp_encap_recv() Canonical way to fetch sk_user_data from an encap_rcv() handler called from UDP stack in rcu protected section is to use rcu_dereference_sk_user_data(), otherwise compiler might read it multiple times. Fixes: d00fa9adc528 ("il2tp: fix races with tunnel socket close") Signed-off-by: Eric Dumazet Cc: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index fed6becc5daf..aee33d132018 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -909,7 +909,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_tunnel *tunnel; - tunnel = l2tp_tunnel(sk); + tunnel = rcu_dereference_sk_user_data(sk); if (tunnel == NULL) goto pass_up; From 56c5bc1849de1311eda5bc506bddad504bfd14fc Mon Sep 17 00:00:00 2001 From: Fabien Dessenne Date: Wed, 24 Apr 2019 11:35:49 +0200 Subject: [PATCH 102/181] net: ethernet: stmmac: manage the get_irq probe defer case Manage the -EPROBE_DEFER error case for "stm32_pwr_wakeup" IRQ. Signed-off-by: Fabien Dessenne Acked-by: Alexandre TORGUE Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c index 062a600fa5a7..21428537e231 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c @@ -333,6 +333,9 @@ static int stm32mp1_parse_data(struct stm32_dwmac *dwmac, */ dwmac->irq_pwr_wakeup = platform_get_irq_byname(pdev, "stm32_pwr_wakeup"); + if (dwmac->irq_pwr_wakeup == -EPROBE_DEFER) + return -EPROBE_DEFER; + if (!dwmac->clk_eth_ck && dwmac->irq_pwr_wakeup >= 0) { err = device_init_wakeup(&pdev->dev, true); if (err) { From 88ef66a28391ea7b624bfb7508a5b015c13b28f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 24 Apr 2019 19:12:46 +0200 Subject: [PATCH 103/181] qmi_wwan: new Wistron, ZTE and D-Link devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding device entries found in vendor modified versions of this driver. Function maps for some of the devices follow: WNC D16Q1, D16Q5, D18Q1 LTE CAT3 module (1435:0918) MI_00 Qualcomm HS-USB Diagnostics MI_01 Android Debug interface MI_02 Qualcomm HS-USB Modem MI_03 Qualcomm Wireless HS-USB Ethernet Adapter MI_04 Qualcomm Wireless HS-USB Ethernet Adapter MI_05 Qualcomm Wireless HS-USB Ethernet Adapter MI_06 USB Mass Storage Device T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1435 ProdID=0918 Rev= 2.32 S: Manufacturer=Android S: Product=Android S: SerialNumber=0123456789ABCDEF C:* #Ifs= 7 Cfg#= 1 Atr=80 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=32ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=32ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=88(I) Atr=03(Int.) MxPS= 64 Ivl=32ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=8a(I) Atr=03(Int.) MxPS= 64 Ivl=32ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms WNC D18 LTE CAT3 module (1435:d182) MI_00 Qualcomm HS-USB Diagnostics MI_01 Androd Debug interface MI_02 Qualcomm HS-USB Modem MI_03 Qualcomm HS-USB NMEA MI_04 Qualcomm Wireless HS-USB Ethernet Adapter MI_05 Qualcomm Wireless HS-USB Ethernet Adapter MI_06 USB Mass Storage Device ZM8510/ZM8620/ME3960 (19d2:0396) MI_00 ZTE Mobile Broadband Diagnostics Port MI_01 ZTE Mobile Broadband AT Port MI_02 ZTE Mobile Broadband Modem MI_03 ZTE Mobile Broadband NDIS Port (qmi_wwan) MI_04 ZTE Mobile Broadband ADB Port ME3620_X (19d2:1432) MI_00 ZTE Diagnostics Device MI_01 ZTE UI AT Interface MI_02 ZTE Modem Device MI_03 ZTE Mobile Broadband Network Adapter MI_04 ZTE Composite ADB Interface Reported-by: Lars Melin Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 9195f3476b1d..679e404a5224 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1122,9 +1122,16 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x0846, 0x68d3, 8)}, /* Netgear Aircard 779S */ {QMI_FIXED_INTF(0x12d1, 0x140c, 1)}, /* Huawei E173 */ {QMI_FIXED_INTF(0x12d1, 0x14ac, 1)}, /* Huawei E1820 */ + {QMI_FIXED_INTF(0x1435, 0x0918, 3)}, /* Wistron NeWeb D16Q1 */ + {QMI_FIXED_INTF(0x1435, 0x0918, 4)}, /* Wistron NeWeb D16Q1 */ + {QMI_FIXED_INTF(0x1435, 0x0918, 5)}, /* Wistron NeWeb D16Q1 */ + {QMI_FIXED_INTF(0x1435, 0x3185, 4)}, /* Wistron NeWeb M18Q5 */ + {QMI_FIXED_INTF(0x1435, 0xd111, 4)}, /* M9615A DM11-1 D51QC */ {QMI_FIXED_INTF(0x1435, 0xd181, 3)}, /* Wistron NeWeb D18Q1 */ {QMI_FIXED_INTF(0x1435, 0xd181, 4)}, /* Wistron NeWeb D18Q1 */ {QMI_FIXED_INTF(0x1435, 0xd181, 5)}, /* Wistron NeWeb D18Q1 */ + {QMI_FIXED_INTF(0x1435, 0xd182, 4)}, /* Wistron NeWeb D18 */ + {QMI_FIXED_INTF(0x1435, 0xd182, 5)}, /* Wistron NeWeb D18 */ {QMI_FIXED_INTF(0x1435, 0xd191, 4)}, /* Wistron NeWeb D19Q1 */ {QMI_QUIRK_SET_DTR(0x1508, 0x1001, 4)}, /* Fibocom NL668 series */ {QMI_FIXED_INTF(0x16d8, 0x6003, 0)}, /* CMOTech 6003 */ @@ -1180,6 +1187,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x0265, 4)}, /* ONDA MT8205 4G LTE */ {QMI_FIXED_INTF(0x19d2, 0x0284, 4)}, /* ZTE MF880 */ {QMI_FIXED_INTF(0x19d2, 0x0326, 4)}, /* ZTE MF821D */ + {QMI_FIXED_INTF(0x19d2, 0x0396, 3)}, /* ZTE ZM8620 */ {QMI_FIXED_INTF(0x19d2, 0x0412, 4)}, /* Telewell TW-LTE 4G */ {QMI_FIXED_INTF(0x19d2, 0x1008, 4)}, /* ZTE (Vodafone) K3570-Z */ {QMI_FIXED_INTF(0x19d2, 0x1010, 4)}, /* ZTE (Vodafone) K3571-Z */ @@ -1200,7 +1208,9 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x1425, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1426, 2)}, /* ZTE MF91 */ {QMI_FIXED_INTF(0x19d2, 0x1428, 2)}, /* Telewell TW-LTE 4G v2 */ + {QMI_FIXED_INTF(0x19d2, 0x1432, 3)}, /* ZTE ME3620 */ {QMI_FIXED_INTF(0x19d2, 0x2002, 4)}, /* ZTE (Vodafone) K3765-Z */ + {QMI_FIXED_INTF(0x2001, 0x7e16, 3)}, /* D-Link DWM-221 */ {QMI_FIXED_INTF(0x2001, 0x7e19, 4)}, /* D-Link DWM-221 B1 */ {QMI_FIXED_INTF(0x2001, 0x7e35, 4)}, /* D-Link DWM-222 */ {QMI_FIXED_INTF(0x2020, 0x2031, 4)}, /* Olicard 600 */ From b987222654f84f7b4ca95b3a55eca784cb30235b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Apr 2019 23:59:25 +0200 Subject: [PATCH 104/181] tracing: Fix buffer_ref pipe ops This fixes multiple issues in buffer_pipe_buf_ops: - The ->steal() handler must not return zero unless the pipe buffer has the only reference to the page. But generic_pipe_buf_steal() assumes that every reference to the pipe is tracked by the page's refcount, which isn't true for these buffers - buffer_pipe_buf_get(), which duplicates a buffer, doesn't touch the page's refcount. Fix it by using generic_pipe_buf_nosteal(), which refuses every attempted theft. It should be easy to actually support ->steal, but the only current users of pipe_buf_steal() are the virtio console and FUSE, and they also only use it as an optimization. So it's probably not worth the effort. - The ->get() and ->release() handlers can be invoked concurrently on pipe buffers backed by the same struct buffer_ref. Make them safe against concurrency by using refcount_t. - The pointers stored in ->private were only zeroed out when the last reference to the buffer_ref was dropped. As far as I know, this shouldn't be necessary anyway, but if we do it, let's always do it. Link: http://lkml.kernel.org/r/20190404215925.253531-1-jannh@google.com Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Al Viro Cc: stable@vger.kernel.org Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer") Signed-off-by: Jann Horn Signed-off-by: Steven Rostedt (VMware) --- fs/splice.c | 4 ++-- include/linux/pipe_fs_i.h | 1 + kernel/trace/trace.c | 28 ++++++++++++++-------------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 3ee7e82df48f..e75807380caa 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -330,8 +330,8 @@ const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return 1; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 787d224ff43e..a830e9a00eb9 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -174,6 +174,7 @@ void free_pipe_info(struct pipe_inode_info *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_nosteal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..0cfa13a60086 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7025,19 +7025,23 @@ struct buffer_ref { struct ring_buffer *buffer; void *page; int cpu; - int ref; + refcount_t refcount; }; +static void buffer_ref_release(struct buffer_ref *ref) +{ + if (!refcount_dec_and_test(&ref->refcount)) + return; + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); + kfree(ref); +} + static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); buf->private = 0; } @@ -7046,14 +7050,14 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - ref->ref++; + refcount_inc(&ref->refcount); } /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; @@ -7066,11 +7070,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); spd->partial[i].private = 0; } @@ -7125,7 +7125,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - ref->ref = 1; + refcount_set(&ref->refcount, 1); ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { From 91862cc7867bba4ee5c8fcf0ca2f1d30427b6129 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 19 Apr 2019 21:22:59 -0500 Subject: [PATCH 105/181] tracing: Fix a memory leak by early error exit in trace_pid_write() In trace_pid_write(), the buffer for trace parser is allocated through kmalloc() in trace_parser_get_init(). Later on, after the buffer is used, it is then freed through kfree() in trace_parser_put(). However, it is possible that trace_pid_write() is terminated due to unexpected errors, e.g., ENOMEM. In that case, the allocated buffer will not be freed, which is a memory leak bug. To fix this issue, free the allocated buffer when an error is encountered. Link: http://lkml.kernel.org/r/1555726979-15633-1-git-send-email-wang6495@umn.edu Fixes: f4d34a87e9c10 ("tracing: Use pid bitmap instead of a pid array for set_event_pid") Cc: stable@vger.kernel.org Signed-off-by: Wenwen Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0cfa13a60086..46f68fad6373 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -496,8 +496,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * not modified. */ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) + if (!pid_list) { + trace_parser_put(&parser); return -ENOMEM; + } pid_list->pid_max = READ_ONCE(pid_max); @@ -507,6 +509,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); if (!pid_list->pids) { + trace_parser_put(&parser); kfree(pid_list); return -ENOMEM; } From d6097c9e4454adf1f8f2c9547c2fa6060d55d952 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Apr 2019 22:03:18 +0200 Subject: [PATCH 106/181] trace: Fix preempt_enable_no_resched() abuse Unless the very next line is schedule(), or implies it, one must not use preempt_enable_no_resched(). It can cause a preemption to go missing and thereby cause arbitrary delays, breaking the PREEMPT=y invariant. Link: http://lkml.kernel.org/r/20190423200318.GY14281@hirez.programming.kicks-ass.net Cc: Waiman Long Cc: Linus Torvalds Cc: Ingo Molnar Cc: Will Deacon Cc: Thomas Gleixner Cc: the arch/x86 maintainers Cc: Davidlohr Bueso Cc: Tim Chen Cc: huang ying Cc: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: stable@vger.kernel.org Fixes: 2c2d7329d8af ("tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp()") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41b6f96e5366..4ee8d8aa3d0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) preempt_disable_notrace(); time = rb_time_stamp(buffer); - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return time; } From 4e43df38a2e6c876d3c8ecc4196ed67a895c425d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 24 Apr 2019 22:18:53 +0200 Subject: [PATCH 107/181] genetlink: use idr_alloc_cyclic for family->id assignment When allocating the next family->id it makes more sense to use idr_alloc_cyclic to avoid re-using a previously used family->id as much as possible. Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f0ec068e1d02..cb69d35c8e6a 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -362,8 +362,8 @@ int genl_register_family(struct genl_family *family) } else family->attrbuf = NULL; - family->id = idr_alloc(&genl_fam_idr, family, - start, end + 1, GFP_KERNEL); + family->id = idr_alloc_cyclic(&genl_fam_idr, family, + start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; goto errout_free; From fdfdf86720a34527f777cbe0d8599bf0528fa146 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 25 Apr 2019 00:33:00 +0200 Subject: [PATCH 108/181] net: phy: marvell: Fix buffer overrun with stats counters marvell_get_sset_count() returns how many statistics counters there are. If the PHY supports fibre, there are 3, otherwise two. marvell_get_strings() does not make this distinction, and always returns 3 strings. This then often results in writing past the end of the buffer for the strings. Fixes: 2170fef78a40 ("Marvell phy: add field to get errors from fiber link.") Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 3ccba37bd6dd..f76c4048b978 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1489,9 +1489,10 @@ static int marvell_get_sset_count(struct phy_device *phydev) static void marvell_get_strings(struct phy_device *phydev, u8 *data) { + int count = marvell_get_sset_count(phydev); int i; - for (i = 0; i < ARRAY_SIZE(marvell_hw_stats); i++) { + for (i = 0; i < count; i++) { strlcpy(data + i * ETH_GSTRING_LEN, marvell_hw_stats[i].string, ETH_GSTRING_LEN); } @@ -1519,9 +1520,10 @@ static u64 marvell_get_stat(struct phy_device *phydev, int i) static void marvell_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data) { + int count = marvell_get_sset_count(phydev); int i; - for (i = 0; i < ARRAY_SIZE(marvell_hw_stats); i++) + for (i = 0; i < count; i++) data[i] = marvell_get_stat(phydev, i); } From 89c02e69fc5245f8a2f34b58b42d43a737af1a5e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Apr 2019 22:23:37 -0700 Subject: [PATCH 109/181] mm/memory_hotplug.c: drop memory device reference after find_memory_block() Right now we are using find_memory_block() to get the node id for the pfn range to online. We are missing to drop a reference to the memory block device. While the device still gets unregistered via device_unregister(), resulting in no user visible problem, the device is never released via device_release(), resulting in a memory leak. Fix that by properly using a put_device(). Link: http://lkml.kernel.org/r/20190411110955.1430-1-david@redhat.com Fixes: d0dc12e86b31 ("mm/memory_hotplug: optimize memory hotplug") Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: David Hildenbrand Cc: Pavel Tatashin Cc: Qian Cai Cc: Arun KS Cc: Mathieu Malaterre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0082d699be94..b236069ff0d8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -874,6 +874,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ mem = find_memory_block(__pfn_to_section(pfn)); nid = mem->nid; + put_device(&mem->dev); /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); From e153abc0739ff77bd89c9ba1688cdb963464af97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Thu, 25 Apr 2019 22:23:41 -0700 Subject: [PATCH 110/181] zram: pass down the bvec we need to read into in the work struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When scheduling work item to read page we need to pass down the proper bvec struct which points to the page to read into. Before this patch it uses a randomly initialized bvec (only if PAGE_SIZE != 4096) which is wrong. Note that without this patch on arch/kernel where PAGE_SIZE != 4096 userspace could read random memory through a zram block device (thought userspace probably would have no control on the address being read). Link: http://lkml.kernel.org/r/20190408183219.26377-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Andrew Morton Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 399cad7daae7..d58a359a6622 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -774,18 +774,18 @@ struct zram_work { struct zram *zram; unsigned long entry; struct bio *bio; + struct bio_vec bvec; }; #if PAGE_SIZE != 4096 static void zram_sync_read(struct work_struct *work) { - struct bio_vec bvec; struct zram_work *zw = container_of(work, struct zram_work, work); struct zram *zram = zw->zram; unsigned long entry = zw->entry; struct bio *bio = zw->bio; - read_from_bdev_async(zram, &bvec, entry, bio); + read_from_bdev_async(zram, &zw->bvec, entry, bio); } /* @@ -798,6 +798,7 @@ static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, { struct zram_work work; + work.bvec = *bvec; work.zram = zram; work.entry = entry; work.bio = bio; From ae3d6a323347940f0548bbb4b17f0bb2e9164169 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 25 Apr 2019 22:23:44 -0700 Subject: [PATCH 111/181] lib/Kconfig.debug: fix build error without CONFIG_BLOCK If CONFIG_TEST_KMOD is set to M, while CONFIG_BLOCK is not set, XFS and BTRFS can not be compiled successly. Link: http://lkml.kernel.org/r/20190410075434.35220-1-yuehaibing@huawei.com Fixes: d9c6a72d6fa2 ("kmod: add test driver to stress test the module loader") Signed-off-by: YueHaibing Reported-by: Hulk Robot Reviewed-by: Kees Cook Cc: Masahiro Yamada Cc: Petr Mladek Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Joe Lawrence Cc: Robin Murphy Cc: Luis Chamberlain Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 00dbcdbc9a0d..d5a4a4036d2f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1929,6 +1929,7 @@ config TEST_KMOD depends on m depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS depends on NETDEVICES && NET_CORE && INET # for TUN + depends on BLOCK select TEST_LKM select XFS_FS select TUN From e789803507b2e154ed452865ee981b038654e98d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 25 Apr 2019 22:23:47 -0700 Subject: [PATCH 112/181] lib/test_vmalloc.c: do not create cpumask_t variable on stack On my "Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz" system(12 CPUs) i get the warning from the compiler about frame size: warning: the frame size of 1096 bytes is larger than 1024 bytes [-Wframe-larger-than=] the size of cpumask_t depends on number of CPUs, therefore just make use of cpumask_of() in set_cpus_allowed_ptr() as a second argument. Link: http://lkml.kernel.org/r/20190418193925.9361-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Andrew Morton Reviewed-by: Roman Gushchin Cc: Uladzislau Rezki Cc: Michal Hocko Cc: Matthew Wilcox Cc: Thomas Garnier Cc: Oleksiy Avramchenko Cc: Steven Rostedt Cc: Joel Fernandes Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_vmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 83cdcaa82bf6..f832b095afba 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -383,14 +383,14 @@ static void shuffle_array(int *arr, int n) static int test_func(void *private) { struct test_driver *t = private; - cpumask_t newmask = CPU_MASK_NONE; int random_array[ARRAY_SIZE(test_case_array)]; int index, i, j, ret; ktime_t kt; u64 delta; - cpumask_set_cpu(t->cpu, &newmask); - set_cpus_allowed_ptr(current, &newmask); + ret = set_cpus_allowed_ptr(current, cpumask_of(t->cpu)); + if (ret < 0) + pr_err("Failed to set affinity to %d CPU\n", t->cpu); for (i = 0; i < ARRAY_SIZE(test_case_array); i++) random_array[i] = i; From 24512228b7a3f412b5a51f189df302616b021c33 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Apr 2019 22:23:51 -0700 Subject: [PATCH 113/181] mm: do not boost watermarks to avoid fragmentation for the DISCONTIG memory model Mikulas Patocka reported that commit 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") "broke" memory management on parisc. The machine is not NUMA but the DISCONTIG model creates three pgdats even though it's a UMA machine for the following ranges 0) Start 0x0000000000000000 End 0x000000003fffffff Size 1024 MB 1) Start 0x0000000100000000 End 0x00000001bfdfffff Size 3070 MB 2) Start 0x0000004040000000 End 0x00000040ffffffff Size 3072 MB Mikulas reported: With the patch 1c30844d2, the kernel will incorrectly reclaim the first zone when it fills up, ignoring the fact that there are two completely free zones. Basiscally, it limits cache size to 1GiB. For example, if I run: # dd if=/dev/sda of=/dev/null bs=1M count=2048 - with the proper kernel, there should be "Buffers - 2GiB" when this command finishes. With the patch 1c30844d2, buffers will consume just 1GiB or slightly more, because the kernel was incorrectly reclaiming them. The page allocator and reclaim makes assumptions that pgdats really represent NUMA nodes and zones represent ranges and makes decisions on that basis. Watermark boosting for small pgdats leads to unexpected results even though this would have behaved reasonably on SPARSEMEM. DISCONTIG is essentially deprecated and even parisc plans to move to SPARSEMEM so there is no need to be fancy, this patch simply disables watermark boosting by default on DISCONTIGMEM. Link: http://lkml.kernel.org/r/20190419094335.GJ18914@techsingularity.net Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") Signed-off-by: Mel Gorman Reported-by: Mikulas Patocka Tested-by: Mikulas Patocka Acked-by: Vlastimil Babka Cc: James Bottomley Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 16 ++++++++-------- mm/page_alloc.c | 13 +++++++++++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6af24cdb25cc..3f13d8599337 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -866,14 +866,14 @@ The intent is that compaction has less work to do in the future and to increase the success rate of future high-order allocations such as SLUB allocations, THP and hugetlbfs pages. -To make it sensible with respect to the watermark_scale_factor parameter, -the unit is in fractions of 10,000. The default value of 15,000 means -that up to 150% of the high watermark will be reclaimed in the event of -a pageblock being mixed due to fragmentation. The level of reclaim is -determined by the number of fragmentation events that occurred in the -recent past. If this value is smaller than a pageblock then a pageblocks -worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor -of 0 will disable the feature. +To make it sensible with respect to the watermark_scale_factor +parameter, the unit is in fractions of 10,000. The default value of +15,000 on !DISCONTIGMEM configurations means that up to 150% of the high +watermark will be reclaimed in the event of a pageblock being mixed due +to fragmentation. The level of reclaim is determined by the number of +fragmentation events that occurred in the recent past. If this value is +smaller than a pageblock then a pageblocks worth of pages will be reclaimed +(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature. ============================================================= diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c6ce20aaf80b..6c6d9f1c404e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -266,7 +266,20 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_DISCONTIGMEM +/* + * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges + * are not on separate NUMA nodes. Functionally this works but with + * watermark_boost_factor, it can reclaim prematurely as the ranges can be + * quite small. By default, do not boost watermarks on discontigmem as in + * many cases very high-order allocations like THP are likely to be + * unsupported and the premature reclaim offsets the advantage of long-term + * fragmentation avoidance. + */ +int watermark_boost_factor __read_mostly; +#else int watermark_boost_factor __read_mostly = 15000; +#endif int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; From ee8ab0eeb49bd3982090c8f14dc9cc65bcd13c5c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Apr 2019 22:23:54 -0700 Subject: [PATCH 114/181] mm, page_alloc: always use a captured page regardless of compaction result During the development of commit 5e1f0f098b46 ("mm, compaction: capture a page under direct compaction"), a paranoid check was added to ensure that if a captured page was available after compaction that it was consistent with the final state of compaction. The intent was to catch serious programming bugs such as using a stale page pointer and causing corruption problems. However, it is possible to get a captured page even if compaction was unsuccessful if an interrupt triggered and happened to free pages in interrupt context that got merged into a suitable high-order page. It's highly unlikely but Li Wang did report the following warning on s390 occuring when testing OOM handling. Note that the warning is slightly edited for clarity. WARNING: CPU: 0 PID: 9783 at mm/page_alloc.c:3777 __alloc_pages_direct_compact+0x182/0x190 Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache sunrpc pkey ghash_s390 prng xts aes_s390 des_s390 des_generic sha512_s390 zcrypt_cex4 zcrypt vmur binfmt_misc ip_tables xfs libcrc32c dasd_fba_mod qeth_l2 dasd_eckd_mod dasd_mod qeth qdio lcs ctcm ccwgroup fsm dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 9783 Comm: copy.sh Kdump: loaded Not tainted 5.1.0-rc 5 #1 This patch simply removes the check entirely instead of trying to be clever about pages freed from interrupt context. If a serious programming error was introduced, it is highly likely to be caught by prep_new_page() instead. Link: http://lkml.kernel.org/r/20190419085133.GH18914@techsingularity.net Fixes: 5e1f0f098b46 ("mm, compaction: capture a page under direct compaction") Signed-off-by: Mel Gorman Reported-by: Li Wang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6c6d9f1c404e..d167c48d913c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3786,11 +3786,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); - if (*compact_result <= COMPACT_INACTIVE) { - WARN_ON_ONCE(page); - return NULL; - } - /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall From 8139ad043d632c0e9e12d760068a7a8e91659aa1 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 25 Apr 2019 22:23:58 -0700 Subject: [PATCH 115/181] mm/page_alloc.c: avoid potential NULL pointer dereference ac.preferred_zoneref->zone passed to alloc_flags_nofragment() can be NULL. 'zone' pointer unconditionally derefernced in alloc_flags_nofragment(). Bail out on NULL zone to avoid potential crash. Currently we don't see any crashes only because alloc_flags_nofragment() has another bug which allows compiler to optimize away all accesses to 'zone'. Link: http://lkml.kernel.org/r/20190423120806.3503-1-aryabinin@virtuozzo.com Fixes: 6bb154504f8b ("mm, page_alloc: spread allocations across zones before introducing fragmentation") Signed-off-by: Andrey Ryabinin Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d167c48d913c..9992ca7f29f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3432,6 +3432,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) alloc_flags |= ALLOC_KSWAPD; #ifdef CONFIG_ZONE_DMA32 + if (!zone) + return alloc_flags; + if (zone_idx(zone) != ZONE_NORMAL) goto out; From 8118b82eb756e271929697e8ada5f637dc443af1 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 25 Apr 2019 22:24:01 -0700 Subject: [PATCH 116/181] mm/page_alloc.c: fix never set ALLOC_NOFRAGMENT flag Commit 0a79cdad5eb2 ("mm: use alloc_flags to record if kswapd can wake") removed setting of the ALLOC_NOFRAGMENT flag. Bring it back. The runtime effect is that ALLOC_NOFRAGMENT behaviour is restored so that allocations are spread across local zones to avoid fragmentation due to mixing pageblocks as long as possible. Link: http://lkml.kernel.org/r/20190423120806.3503-2-aryabinin@virtuozzo.com Fixes: 0a79cdad5eb2 ("mm: use alloc_flags to record if kswapd can wake") Signed-off-by: Andrey Ryabinin Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9992ca7f29f1..c02cff1ed56e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3436,7 +3436,7 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) return alloc_flags; if (zone_idx(zone) != ZONE_NORMAL) - goto out; + return alloc_flags; /* * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and @@ -3445,9 +3445,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) */ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); if (nr_online_nodes > 1 && !populated_zone(--zone)) - goto out; + return alloc_flags; -out: + alloc_flags |= ALLOC_NOFRAGMENT; #endif /* CONFIG_ZONE_DMA32 */ return alloc_flags; } From 89189557b47b35683a27c80ee78aef18248eefb4 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 25 Apr 2019 22:24:05 -0700 Subject: [PATCH 117/181] fs/proc/proc_sysctl.c: Fix a NULL pointer dereference Syzkaller report this: sysctl could not get directory: /net//bridge -12 kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI CPU: 1 PID: 7027 Comm: syz-executor.0 Tainted: G C 5.1.0-rc3+ #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 RIP: 0010:__write_once_size include/linux/compiler.h:220 [inline] RIP: 0010:__rb_change_child include/linux/rbtree_augmented.h:144 [inline] RIP: 0010:__rb_erase_augmented include/linux/rbtree_augmented.h:186 [inline] RIP: 0010:rb_erase+0x5f4/0x19f0 lib/rbtree.c:459 Code: 00 0f 85 60 13 00 00 48 89 1a 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 89 f2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 75 0c 00 00 4d 85 ed 4c 89 2e 74 ce 4c 89 ea 48 RSP: 0018:ffff8881bb507778 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: ffff8881f224b5b8 RCX: ffffffff818f3f6a RDX: 000000000000000a RSI: 0000000000000050 RDI: ffff8881f224b568 RBP: 0000000000000000 R08: ffffed10376a0ef4 R09: ffffed10376a0ef4 R10: 0000000000000001 R11: ffffed10376a0ef4 R12: ffff8881f224b558 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f3e7ce13700(0000) GS:ffff8881f7300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fd60fbe9398 CR3: 00000001cb55c001 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: erase_entry fs/proc/proc_sysctl.c:178 [inline] erase_header+0xe3/0x160 fs/proc/proc_sysctl.c:207 start_unregistering fs/proc/proc_sysctl.c:331 [inline] drop_sysctl_table+0x558/0x880 fs/proc/proc_sysctl.c:1631 get_subdir fs/proc/proc_sysctl.c:1022 [inline] __register_sysctl_table+0xd65/0x1090 fs/proc/proc_sysctl.c:1335 br_netfilter_init+0x68/0x1000 [br_netfilter] do_one_initcall+0xbc/0x47d init/main.c:901 do_init_module+0x1b5/0x547 kernel/module.c:3456 load_module+0x6405/0x8c10 kernel/module.c:3804 __do_sys_finit_module+0x162/0x190 kernel/module.c:3898 do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Modules linked in: br_netfilter(+) backlight comedi(C) hid_sensor_hub max3100 ti_ads8688 udc_core fddi snd_mona leds_gpio rc_streamzap mtd pata_netcell nf_log_common rc_winfast udp_tunnel snd_usbmidi_lib snd_usb_toneport snd_usb_line6 snd_rawmidi snd_seq_device snd_hwdep videobuf2_v4l2 videobuf2_common videodev media videobuf2_vmalloc videobuf2_memops rc_gadmei_rm008z 8250_of smm665 hid_tmff hid_saitek hwmon_vid rc_ati_tv_wonder_hd_600 rc_core pata_pdc202xx_old dn_rtmsg as3722 ad714x_i2c ad714x snd_soc_cs4265 hid_kensington panel_ilitek_ili9322 drm drm_panel_orientation_quirks ipack cdc_phonet usbcore phonet hid_jabra hid extcon_arizona can_dev industrialio_triggered_buffer kfifo_buf industrialio adm1031 i2c_mux_ltc4306 i2c_mux ipmi_msghandler mlxsw_core snd_soc_cs35l34 snd_soc_core snd_pcm_dmaengine snd_pcm snd_timer ac97_bus snd_compress snd soundcore gpio_da9055 uio ecdh_generic mdio_thunder of_mdio fixed_phy libphy mdio_cavium iptable_security iptable_raw iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bpfilter ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel hsr veth netdevsim vxcan batman_adv cfg80211 rfkill chnl_net caif nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun joydev mousedev ppdev tpm kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ide_pci_generic piix aes_x86_64 crypto_simd cryptd ide_core glue_helper input_leds psmouse intel_agp intel_gtt serio_raw ata_generic i2c_piix4 agpgart pata_acpi parport_pc parport floppy rtc_cmos sch_fq_codel ip_tables x_tables sha1_ssse3 sha1_generic ipv6 [last unloaded: br_netfilter] Dumping ftrace buffer: (ftrace buffer empty) ---[ end trace 68741688d5fbfe85 ]--- commit 23da9588037e ("fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links") forgot to handle start_unregistering() case, while header->parent is NULL, it calls erase_header() and as seen in the above syzkaller call trace, accessing &header->parent->root will trigger a NULL pointer dereference. As that commit explained, there is also no need to call start_unregistering() if header->parent is NULL. Link: http://lkml.kernel.org/r/20190409153622.28112-1-yuehaibing@huawei.com Fixes: 23da9588037e ("fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links") Fixes: 0e47c99d7fe25 ("sysctl: Replace root_list with links between sysctl_table_sets") Signed-off-by: YueHaibing Reported-by: Hulk Robot Reviewed-by: Kees Cook Cc: Luis Chamberlain Cc: Alexey Dobriyan Cc: Al Viro Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d65390727541..7325baa8f9d4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1626,9 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - if (parent) + if (parent) { put_links(header); - start_unregistering(header); + start_unregistering(header); + } + if (!--header->count) kfree_rcu(header, rcu); From 3a349763cf11e63534b8f2d302f2d0c790566497 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 26 Apr 2019 17:22:01 -0700 Subject: [PATCH 118/181] Input: synaptics-rmi4 - write config register values to the right offset Currently any changed config register values don't take effect, as the function to write them back is called with the wrong register offset. Fixes: ff8f83708b3e (Input: synaptics-rmi4 - add support for 2D sensors and F11) Signed-off-by: Lucas Stach Reviewed-by: Philipp Zabel Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f11.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/rmi4/rmi_f11.c b/drivers/input/rmi4/rmi_f11.c index df64d6aed4f7..93901ebd122a 100644 --- a/drivers/input/rmi4/rmi_f11.c +++ b/drivers/input/rmi4/rmi_f11.c @@ -1230,7 +1230,7 @@ static int rmi_f11_initialize(struct rmi_function *fn) } rc = f11_write_control_regs(fn, &f11->sens_query, - &f11->dev_controls, fn->fd.query_base_addr); + &f11->dev_controls, fn->fd.control_base_addr); if (rc) dev_warn(&fn->dev, "Failed to write control registers\n"); From baf76f0c58aec435a3a864075b8f6d8ee5d1f17e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Apr 2019 16:13:58 -0700 Subject: [PATCH 119/181] slip: make slhc_free() silently accept an error pointer This way, slhc_free() accepts what slhc_init() returns, whether that is an error or not. In particular, the pattern in sl_alloc_bufs() is slcomp = slhc_init(16, 16); ... slhc_free(slcomp); for the error handling path, and rather than complicate that code, just make it ok to always free what was returned by the init function. That's what the code used to do before commit 4ab42d78e37a ("ppp, slip: Validate VJ compression slot parameters completely") when slhc_init() just returned NULL for the error case, with no actual indication of the details of the error. Reported-by: syzbot+45474c076a4927533d2e@syzkaller.appspotmail.com Fixes: 4ab42d78e37a ("ppp, slip: Validate VJ compression slot parameters completely") Acked-by: Ben Hutchings Cc: David Miller Signed-off-by: Linus Torvalds --- drivers/net/slip/slhc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index f4e93f5fc204..ea90db3c7705 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -153,7 +153,7 @@ out_fail: void slhc_free(struct slcompress *comp) { - if ( comp == NULLSLCOMPR ) + if ( IS_ERR_OR_NULL(comp) ) return; if ( comp->tstate != NULLSLSTATE ) From b4e30e8e7ea1d1e35ffd64ca46f7d9a7f227b4bf Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Apr 2019 22:31:50 -0400 Subject: [PATCH 120/181] bnxt_en: Improve multicast address setup logic. The driver builds a list of multicast addresses and sends it to the firmware when the driver's ndo_set_rx_mode() is called. In rare cases, the firmware can fail this call if internal resources to add multicast addresses are exhausted. In that case, we should try the call again by setting the ALL_MCAST flag which is more guaranteed to succeed. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4c586ba4364b..42fd273d457e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8961,8 +8961,15 @@ static int bnxt_cfg_rx_mode(struct bnxt *bp) skip_uc: rc = bnxt_hwrm_cfa_l2_set_rx_mask(bp, 0); + if (rc && vnic->mc_list_count) { + netdev_info(bp->dev, "Failed setting MC filters rc: %d, turning on ALL_MCAST mode\n", + rc); + vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST; + vnic->mc_list_count = 0; + rc = bnxt_hwrm_cfa_l2_set_rx_mask(bp, 0); + } if (rc) - netdev_err(bp->dev, "HWRM cfa l2 rx mask failure rc: %x\n", + netdev_err(bp->dev, "HWRM cfa l2 rx mask failure rc: %d\n", rc); return rc; From f9099d611449836a51a65f40ea7dc9cb5f2f665e Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Thu, 25 Apr 2019 22:31:51 -0400 Subject: [PATCH 121/181] bnxt_en: Free short FW command HWRM memory in error path in bnxt_init_one() In the bnxt_init_one() error path, short FW command request memory is not freed. This patch fixes it. Fixes: e605db801bde ("bnxt_en: Support for Short Firmware Message") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 42fd273d457e..5d02f5999df7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -10692,6 +10692,7 @@ init_err_cleanup_tc: bnxt_clear_int_mode(bp); init_err_pci_clean: + bnxt_free_hwrm_short_cmd_req(bp); bnxt_free_hwrm_resources(bp); bnxt_free_ctx_mem(bp); kfree(bp->ctx); From 1f83391bd6fc48f92f627b0ec0bce686d100c6a5 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Apr 2019 22:31:52 -0400 Subject: [PATCH 122/181] bnxt_en: Fix possible crash in bnxt_hwrm_ring_free() under error conditions. If we encounter errors during open and proceed to clean up, bnxt_hwrm_ring_free() may crash if the rings we try to free have never been allocated. bnxt_cp_ring_for_rx() or bnxt_cp_ring_for_tx() may reference pointers that have not been allocated. Fix it by checking for valid fw_ring_id first before calling bnxt_cp_ring_for_rx() or bnxt_cp_ring_for_tx(). Fixes: 2c61d2117ecb ("bnxt_en: Add helper functions to get firmware CP ring ID.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 5d02f5999df7..b03669f67a0c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5135,10 +5135,10 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path) for (i = 0; i < bp->tx_nr_rings; i++) { struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; struct bnxt_ring_struct *ring = &txr->tx_ring_struct; - u32 cmpl_ring_id; - cmpl_ring_id = bnxt_cp_ring_for_tx(bp, txr); if (ring->fw_ring_id != INVALID_HW_RING_ID) { + u32 cmpl_ring_id = bnxt_cp_ring_for_tx(bp, txr); + hwrm_ring_free_send_msg(bp, ring, RING_FREE_REQ_RING_TYPE_TX, close_path ? cmpl_ring_id : @@ -5151,10 +5151,10 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path) struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; struct bnxt_ring_struct *ring = &rxr->rx_ring_struct; u32 grp_idx = rxr->bnapi->index; - u32 cmpl_ring_id; - cmpl_ring_id = bnxt_cp_ring_for_rx(bp, rxr); if (ring->fw_ring_id != INVALID_HW_RING_ID) { + u32 cmpl_ring_id = bnxt_cp_ring_for_rx(bp, rxr); + hwrm_ring_free_send_msg(bp, ring, RING_FREE_REQ_RING_TYPE_RX, close_path ? cmpl_ring_id : @@ -5173,10 +5173,10 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path) struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; struct bnxt_ring_struct *ring = &rxr->rx_agg_ring_struct; u32 grp_idx = rxr->bnapi->index; - u32 cmpl_ring_id; - cmpl_ring_id = bnxt_cp_ring_for_rx(bp, rxr); if (ring->fw_ring_id != INVALID_HW_RING_ID) { + u32 cmpl_ring_id = bnxt_cp_ring_for_rx(bp, rxr); + hwrm_ring_free_send_msg(bp, ring, type, close_path ? cmpl_ring_id : INVALID_HW_RING_ID); From ad361adf0d08f1135f3845c6b3a36be7cc0bfda5 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Apr 2019 22:31:53 -0400 Subject: [PATCH 123/181] bnxt_en: Pass correct extended TX port statistics size to firmware. If driver determines that extended TX port statistics are not supported or allocation of the data structure fails, make sure to pass 0 TX stats size to firmware to disable it. The firmware returned TX stats size should also be set to 0 for consistency. This will prevent bnxt_get_ethtool_stats() from accessing the NULL TX stats pointer in case there is mismatch between firmware and driver. Fixes: 36e53349b60b ("bnxt_en: Add additional extended port statistics.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b03669f67a0c..a9172b2feb1d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6753,6 +6753,7 @@ static int bnxt_hwrm_port_qstats_ext(struct bnxt *bp) struct hwrm_queue_pri2cos_qcfg_input req2 = {0}; struct hwrm_port_qstats_ext_input req = {0}; struct bnxt_pf_info *pf = &bp->pf; + u32 tx_stat_size; int rc; if (!(bp->flags & BNXT_FLAG_PORT_STATS_EXT)) @@ -6762,13 +6763,16 @@ static int bnxt_hwrm_port_qstats_ext(struct bnxt *bp) req.port_id = cpu_to_le16(pf->port_id); req.rx_stat_size = cpu_to_le16(sizeof(struct rx_port_stats_ext)); req.rx_stat_host_addr = cpu_to_le64(bp->hw_rx_port_stats_ext_map); - req.tx_stat_size = cpu_to_le16(sizeof(struct tx_port_stats_ext)); + tx_stat_size = bp->hw_tx_port_stats_ext ? + sizeof(*bp->hw_tx_port_stats_ext) : 0; + req.tx_stat_size = cpu_to_le16(tx_stat_size); req.tx_stat_host_addr = cpu_to_le64(bp->hw_tx_port_stats_ext_map); mutex_lock(&bp->hwrm_cmd_lock); rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); if (!rc) { bp->fw_rx_stats_ext_size = le16_to_cpu(resp->rx_stat_size) / 8; - bp->fw_tx_stats_ext_size = le16_to_cpu(resp->tx_stat_size) / 8; + bp->fw_tx_stats_ext_size = tx_stat_size ? + le16_to_cpu(resp->tx_stat_size) / 8 : 0; } else { bp->fw_rx_stats_ext_size = 0; bp->fw_tx_stats_ext_size = 0; From 3f93cd3f098e284c851acb89265ebe35b994a5c8 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Apr 2019 22:31:54 -0400 Subject: [PATCH 124/181] bnxt_en: Fix statistics context reservation logic. In an earlier commit that fixes the number of stats contexts to reserve for the RDMA driver, we added a function parameter to pass in the number of stats contexts to all the relevant functions. The passed in parameter should have been used to set the enables field of the firmware message. Fixes: 780baad44f0f ("bnxt_en: Reserve 1 stat_ctx for RDMA driver.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a9172b2feb1d..b6cb7b8ceab9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5315,17 +5315,16 @@ __bnxt_hwrm_reserve_pf_rings(struct bnxt *bp, struct hwrm_func_cfg_input *req, req->num_tx_rings = cpu_to_le16(tx_rings); if (BNXT_NEW_RM(bp)) { enables |= rx_rings ? FUNC_CFG_REQ_ENABLES_NUM_RX_RINGS : 0; + enables |= stats ? FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; if (bp->flags & BNXT_FLAG_CHIP_P5) { enables |= cp_rings ? FUNC_CFG_REQ_ENABLES_NUM_MSIX : 0; enables |= tx_rings + ring_grps ? - FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS | - FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; + FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS : 0; enables |= rx_rings ? FUNC_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS : 0; } else { enables |= cp_rings ? - FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS | - FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; + FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS : 0; enables |= ring_grps ? FUNC_CFG_REQ_ENABLES_NUM_HW_RING_GRPS | FUNC_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS : 0; @@ -5365,14 +5364,13 @@ __bnxt_hwrm_reserve_vf_rings(struct bnxt *bp, enables |= tx_rings ? FUNC_VF_CFG_REQ_ENABLES_NUM_TX_RINGS : 0; enables |= rx_rings ? FUNC_VF_CFG_REQ_ENABLES_NUM_RX_RINGS | FUNC_VF_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS : 0; + enables |= stats ? FUNC_VF_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; if (bp->flags & BNXT_FLAG_CHIP_P5) { enables |= tx_rings + ring_grps ? - FUNC_VF_CFG_REQ_ENABLES_NUM_CMPL_RINGS | - FUNC_VF_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; + FUNC_VF_CFG_REQ_ENABLES_NUM_CMPL_RINGS : 0; } else { enables |= cp_rings ? - FUNC_VF_CFG_REQ_ENABLES_NUM_CMPL_RINGS | - FUNC_VF_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; + FUNC_VF_CFG_REQ_ENABLES_NUM_CMPL_RINGS : 0; enables |= ring_grps ? FUNC_VF_CFG_REQ_ENABLES_NUM_HW_RING_GRPS : 0; } From 0b397b17a4120cb80f7bf89eb30587b3dd9b0d1d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Apr 2019 22:31:55 -0400 Subject: [PATCH 125/181] bnxt_en: Fix uninitialized variable usage in bnxt_rx_pkt(). In bnxt_rx_pkt(), if the driver encounters BD errors, it will recycle the buffers and jump to the end where the uninitailized variable "len" is referenced. Fix it by adding a new jump label that will skip the length update. This is the most correct fix since the length may not be valid when we get this type of error. Fixes: 6a8788f25625 ("bnxt_en: add support for software dynamic interrupt moderation") Reported-by: Nathan Chancellor Cc: Nathan Chancellor Signed-off-by: Michael Chan Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b6cb7b8ceab9..52ade133b57c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1625,7 +1625,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, netdev_warn(bp->dev, "RX buffer error %x\n", rx_err); bnxt_sched_reset(bp, rxr); } - goto next_rx; + goto next_rx_no_len; } len = le32_to_cpu(rxcmp->rx_cmp_len_flags_type) >> RX_CMP_LEN_SHIFT; @@ -1706,12 +1706,13 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, rc = 1; next_rx: - rxr->rx_prod = NEXT_RX(prod); - rxr->rx_next_cons = NEXT_RX(cons); - cpr->rx_packets += 1; cpr->rx_bytes += len; +next_rx_no_len: + rxr->rx_prod = NEXT_RX(prod); + rxr->rx_next_cons = NEXT_RX(cons); + next_rx_no_prod_no_len: *raw_cons = tmp_raw_cons; From 97e1caa517e22d62a283b876fb8aa5f4672c83dd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 17:35:09 -0700 Subject: [PATCH 126/181] net/tls: don't copy negative amounts of data in reencrypt There is no guarantee the record starts before the skb frags. If we don't check for this condition copy amount will get negative, leading to reads and writes to random memory locations. Familiar hilarity ensues. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Signed-off-by: David S. Miller --- net/tls/tls_device.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index cc0256939eb6..96357060addc 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -628,14 +628,16 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) else err = 0; - copy = min_t(int, skb_pagelen(skb) - offset, - rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + if (skb_pagelen(skb) > offset) { + copy = min_t(int, skb_pagelen(skb) - offset, + rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); - if (skb->decrypted) - skb_store_bits(skb, offset, buf, copy); + if (skb->decrypted) + skb_store_bits(skb, offset, buf, copy); - offset += copy; - buf += copy; + offset += copy; + buf += copy; + } skb_walk_frags(skb, skb_iter) { copy = min_t(int, skb_iter->len, From eb3d38d5adb520435d4e4af32529ccb13ccc9935 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 17:35:10 -0700 Subject: [PATCH 127/181] net/tls: fix copy to fragments in reencrypt Fragments may contain data from other records so we have to account for that when we calculate the destination and max length of copy we can perform. Note that 'offset' is the offset within the message, so it can't be passed as offset within the frag.. Here skb_store_bits() would have realised the call is wrong and simply not copy data. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Signed-off-by: David S. Miller --- net/tls/tls_device.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 96357060addc..14dedb24fa7b 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -597,7 +597,7 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) { struct strp_msg *rxm = strp_msg(skb); - int err = 0, offset = rxm->offset, copy, nsg; + int err = 0, offset = rxm->offset, copy, nsg, data_len, pos; struct sk_buff *skb_iter, *unused; struct scatterlist sg[1]; char *orig_buf, *buf; @@ -628,9 +628,10 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) else err = 0; + data_len = rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE; + if (skb_pagelen(skb) > offset) { - copy = min_t(int, skb_pagelen(skb) - offset, - rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + copy = min_t(int, skb_pagelen(skb) - offset, data_len); if (skb->decrypted) skb_store_bits(skb, offset, buf, copy); @@ -639,16 +640,30 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) buf += copy; } + pos = skb_pagelen(skb); skb_walk_frags(skb, skb_iter) { - copy = min_t(int, skb_iter->len, - rxm->full_len - offset + rxm->offset - - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + int frag_pos; + + /* Practically all frags must belong to msg if reencrypt + * is needed with current strparser and coalescing logic, + * but strparser may "get optimized", so let's be safe. + */ + if (pos + skb_iter->len <= offset) + goto done_with_frag; + if (pos >= data_len + rxm->offset) + break; + + frag_pos = offset - pos; + copy = min_t(int, skb_iter->len - frag_pos, + data_len + rxm->offset - offset); if (skb_iter->decrypted) - skb_store_bits(skb_iter, offset, buf, copy); + skb_store_bits(skb_iter, frag_pos, buf, copy); offset += copy; buf += copy; +done_with_frag: + pos += skb_iter->len; } free_buf: From 21f1b8a6636c4dbde4aa1ec0343f42eaf653ffcc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Apr 2019 12:50:44 +0200 Subject: [PATCH 128/181] udp: fix GRO reception in case of length mismatch Currently, the UDP GRO code path does bad things on some edge conditions - Aggregation can happen even on packet with different lengths. Fix the above by rewriting the 'complete' condition for GRO packets. While at it, note explicitly that we allow merging the first packet per burst below gso_size. Reported-by: Sean Tong Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 64f9715173ac..d8776b2110c1 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -377,13 +377,14 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot - * leading to execessive truesize values + * leading to execessive truesize values. + * On len mismatch merge the first packet shorter than gso_size, + * otherwise complete the GRO packet. */ - if (!skb_gro_receive(p, skb) && + if (uh->len > uh2->len || skb_gro_receive(p, skb) || + uh->len != uh2->len || NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) pp = p; - else if (uh->len != uh2->len) - pp = p; return pp; } From de1887c064b9996ac03120d90d0a909a3f678f98 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Tue, 16 Apr 2019 12:57:21 +0300 Subject: [PATCH 129/181] iwlwifi: mvm: check for length correctness in iwl_mvm_create_skb() We don't check for the validity of the lengths in the packet received from the firmware. If the MPDU length received in the rx descriptor is too short to contain the header length and the crypt length together, we may end up trying to copy a negative number of bytes (headlen - hdrlen < 0) which will underflow and cause us to try to copy a huge amount of data. This causes oopses such as this one: BUG: unable to handle kernel paging request at ffff896be2970000 PGD 5e201067 P4D 5e201067 PUD 5e205067 PMD 16110d063 PTE 8000000162970161 Oops: 0003 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 1824 Comm: irq/134-iwlwifi Not tainted 4.19.33-04308-geea41cf4930f #1 Hardware name: [...] RIP: 0010:memcpy_erms+0x6/0x10 Code: 90 90 90 90 eb 1e 0f 1f 00 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 f3 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 a4 c3 0f 1f 80 00 00 00 00 48 89 f8 48 83 fa 20 72 7e 40 38 fe RSP: 0018:ffffa4630196fc60 EFLAGS: 00010287 RAX: ffff896be2924618 RBX: ffff896bc8ecc600 RCX: 00000000fffb4610 RDX: 00000000fffffff8 RSI: ffff896a835e2a38 RDI: ffff896be2970000 RBP: ffffa4630196fd30 R08: ffff896bc8ecc600 R09: ffff896a83597000 R10: ffff896bd6998400 R11: 000000000200407f R12: ffff896a83597050 R13: 00000000fffffff8 R14: 0000000000000010 R15: ffff896a83597038 FS: 0000000000000000(0000) GS:ffff896be8280000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff896be2970000 CR3: 000000005dc12002 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: iwl_mvm_rx_mpdu_mq+0xb51/0x121b [iwlmvm] iwl_pcie_rx_handle+0x58c/0xa89 [iwlwifi] iwl_pcie_irq_rx_msix_handler+0xd9/0x12a [iwlwifi] irq_thread_fn+0x24/0x49 irq_thread+0xb0/0x122 kthread+0x138/0x140 ret_from_fork+0x1f/0x40 Fix that by checking the lengths for correctness and trigger a warning to show that we have received wrong data. Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index 1e03acf30762..b516fd1867ec 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -169,9 +169,9 @@ static inline int iwl_mvm_check_pn(struct iwl_mvm *mvm, struct sk_buff *skb, } /* iwl_mvm_create_skb Adds the rxb to a new skb */ -static void iwl_mvm_create_skb(struct sk_buff *skb, struct ieee80211_hdr *hdr, - u16 len, u8 crypt_len, - struct iwl_rx_cmd_buffer *rxb) +static int iwl_mvm_create_skb(struct iwl_mvm *mvm, struct sk_buff *skb, + struct ieee80211_hdr *hdr, u16 len, u8 crypt_len, + struct iwl_rx_cmd_buffer *rxb) { struct iwl_rx_packet *pkt = rxb_addr(rxb); struct iwl_rx_mpdu_desc *desc = (void *)pkt->data; @@ -204,6 +204,20 @@ static void iwl_mvm_create_skb(struct sk_buff *skb, struct ieee80211_hdr *hdr, * present before copying packet data. */ hdrlen += crypt_len; + + if (WARN_ONCE(headlen < hdrlen, + "invalid packet lengths (hdrlen=%d, len=%d, crypt_len=%d)\n", + hdrlen, len, crypt_len)) { + /* + * We warn and trace because we want to be able to see + * it in trace-cmd as well. + */ + IWL_DEBUG_RX(mvm, + "invalid packet lengths (hdrlen=%d, len=%d, crypt_len=%d)\n", + hdrlen, len, crypt_len); + return -EINVAL; + } + skb_put_data(skb, hdr, hdrlen); skb_put_data(skb, (u8 *)hdr + hdrlen + pad_len, headlen - hdrlen); @@ -216,6 +230,8 @@ static void iwl_mvm_create_skb(struct sk_buff *skb, struct ieee80211_hdr *hdr, skb_add_rx_frag(skb, 0, rxb_steal_page(rxb), offset, fraglen, rxb->truesize); } + + return 0; } static void iwl_mvm_add_rtap_sniffer_config(struct iwl_mvm *mvm, @@ -1671,7 +1687,11 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi, rx_status->boottime_ns = ktime_get_boot_ns(); } - iwl_mvm_create_skb(skb, hdr, len, crypt_len, rxb); + if (iwl_mvm_create_skb(mvm, skb, hdr, len, crypt_len, rxb)) { + kfree_skb(skb); + goto out; + } + if (!iwl_mvm_reorder(mvm, napi, queue, sta, skb, desc)) iwl_mvm_pass_packet_to_mac80211(mvm, napi, skb, queue, sta, csi); From 5c9adef9789148d382d7d1307c3d6bfaf51d143d Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 21 Apr 2019 17:58:11 +0300 Subject: [PATCH 130/181] iwlwifi: fix driver operation for 5350 We introduced a bug that prevented this old device from working. The driver would simply not be able to complete the INIT flow while spewing this warning: CSR addresses aren't configured WARNING: CPU: 0 PID: 819 at drivers/net/wireless/intel/iwlwifi/pcie/drv.c:917 iwl_pci_probe+0x160/0x1e0 [iwlwifi] Cc: stable@vger.kernel.org # v4.18+ Fixes: a8cbb46f831d ("iwlwifi: allow different csr flags for different device families") Signed-off-by: Emmanuel Grumbach Fixes: c8f1b51e506d ("iwlwifi: allow different csr flags for different device families") Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/cfg/5000.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/5000.c b/drivers/net/wireless/intel/iwlwifi/cfg/5000.c index 575a7022d045..3846064d51a5 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/5000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/5000.c @@ -1,7 +1,7 @@ /****************************************************************************** * * Copyright(c) 2007 - 2014 Intel Corporation. All rights reserved. - * Copyright(c) 2018 Intel Corporation + * Copyright(c) 2018 - 2019 Intel Corporation * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License as @@ -136,6 +136,7 @@ const struct iwl_cfg iwl5350_agn_cfg = { .ht_params = &iwl5000_ht_params, .led_mode = IWL_LED_BLINK, .internal_wimax_coex = true, + .csr = &iwl_csr_v1, }; #define IWL_DEVICE_5150 \ From d156e67d3f58c5d3c7ebe1bec80657db534f32d4 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Thu, 25 Apr 2019 10:03:34 +0300 Subject: [PATCH 131/181] iwlwifi: mvm: fix merge damage in iwl_mvm_vif_dbgfs_register() When I rebased Greg's patch, I accidentally left the old if block that was already there. Remove it. Fixes: 154d4899e411 ("iwlwifi: mvm: properly check debugfs dentry before using it") Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c index 738eddb2e7ac..6925527d8457 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c @@ -780,12 +780,6 @@ void iwl_mvm_vif_dbgfs_register(struct iwl_mvm *mvm, struct ieee80211_vif *vif) return; } - if (!mvmvif->dbgfs_dir) { - IWL_ERR(mvm, "Failed to create debugfs directory under %pd\n", - dbgfs_dir); - return; - } - if (iwlmvm_mod_params.power_scheme != IWL_POWER_SCHEME_CAM && ((vif->type == NL80211_IFTYPE_STATION && !vif->p2p) || (vif->type == NL80211_IFTYPE_STATION && vif->p2p))) From b1da6a51871c6929dced1a7fad81990988b36ed6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 24 Apr 2019 18:39:57 +0200 Subject: [PATCH 132/181] fsnotify: Fix NULL ptr deref in fanotify_get_fsid() fanotify_get_fsid() is reading mark->connector->fsid under srcu. It can happen that it sees mark not fully initialized or mark that is already detached from the object list. In these cases mark->connector can be NULL leading to NULL ptr dereference. Fix the problem by being careful when reading mark->connector and check it for being NULL. Also use WRITE_ONCE when writing the mark just to prevent compiler from doing something stupid. Reported-by: syzbot+15927486a4f1bfcbaf91@syzkaller.appspotmail.com Fixes: 77115225acc6 ("fanotify: cache fsid in fsnotify_mark_connector") Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 14 ++++++++++++-- fs/notify/mark.c | 12 ++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 6b9c27548997..63c6bb1f8c4d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -346,10 +346,16 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) __kernel_fsid_t fsid = {}; fsnotify_foreach_obj_type(type) { + struct fsnotify_mark_connector *conn; + if (!fsnotify_iter_should_report_type(iter_info, type)) continue; - fsid = iter_info->marks[type]->connector->fsid; + conn = READ_ONCE(iter_info->marks[type]->connector); + /* Mark is just getting destroyed or created? */ + if (!conn) + continue; + fsid = conn->fsid; if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; @@ -408,8 +414,12 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { fsid = fanotify_get_fsid(iter_info); + /* Racing with mark destruction or creation? */ + if (!fsid.val[0] && !fsid.val[1]) + return 0; + } event = fanotify_alloc_event(group, inode, mask, data, data_type, &fsid); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d593d4269561..22acb0a79b53 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -239,13 +239,13 @@ static void fsnotify_drop_object(unsigned int type, void *objp) void fsnotify_put_mark(struct fsnotify_mark *mark) { - struct fsnotify_mark_connector *conn; + struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ - if (!mark->connector) { + if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; @@ -255,10 +255,9 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ - if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; - conn = mark->connector; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); @@ -266,7 +265,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) } else { __fsnotify_recalc_mask(conn); } - mark->connector = NULL; + WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); @@ -620,7 +619,7 @@ restart: /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: - mark->connector = conn; + WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); @@ -808,6 +807,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; + WRITE_ONCE(mark->connector, NULL); } /* From 37624b58542fb9f2d9a70e6ea006ef8a5f66c30b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Apr 2019 17:04:13 -0700 Subject: [PATCH 133/181] Linux 5.1-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index abe13538a8c0..2b99679148dc 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Shy Crocodile # *DOCUMENTATION* From 38faed150438be8d6e419137209d25439e6f4c33 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 26 Mar 2019 13:57:28 -0700 Subject: [PATCH 134/181] ath10k: perform crash dump collection in workqueue Commit 25733c4e67df ("ath10k: pci: use mutex for diagnostic window CE polling") introduced a regression where we try to sleep (grab a mutex) in an atomic context: [ 233.602619] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:254 [ 233.602626] in_atomic(): 1, irqs_disabled(): 0, pid: 0, name: swapper/0 [ 233.602636] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 5.1.0-rc2 #4 [ 233.602642] Hardware name: Google Scarlet (DT) [ 233.602647] Call trace: [ 233.602663] dump_backtrace+0x0/0x11c [ 233.602672] show_stack+0x20/0x28 [ 233.602681] dump_stack+0x98/0xbc [ 233.602690] ___might_sleep+0x154/0x16c [ 233.602696] __might_sleep+0x78/0x88 [ 233.602704] mutex_lock+0x2c/0x5c [ 233.602717] ath10k_pci_diag_read_mem+0x68/0x21c [ath10k_pci] [ 233.602725] ath10k_pci_diag_read32+0x48/0x74 [ath10k_pci] [ 233.602733] ath10k_pci_dump_registers+0x5c/0x16c [ath10k_pci] [ 233.602741] ath10k_pci_fw_crashed_dump+0xb8/0x548 [ath10k_pci] [ 233.602749] ath10k_pci_napi_poll+0x60/0x128 [ath10k_pci] [ 233.602757] net_rx_action+0x140/0x388 [ 233.602766] __do_softirq+0x1b0/0x35c [...] ath10k_pci_fw_crashed_dump() is called from NAPI contexts, and firmware memory dumps are retrieved using the diag memory interface. A simple reproduction case is to run this on QCA6174A / WLAN.RM.4.4.1-00132-QCARMSWP-1, which happens to be a way to b0rk the firmware: dd if=/sys/kernel/debug/ieee80211/phy0/ath10k/mem_value bs=4K count=1 of=/dev/null (NB: simulated firmware crashes, via debugfs, don't trigger firmware dumps.) The fix is to move the crash-dump into a workqueue context, and avoid relying on 'data_lock' for most mutual exclusion. We only keep using it here for protecting 'fw_crash_counter', while the rest of the coredump buffers are protected by a new 'dump_mutex'. I've tested the above with simulated firmware crashes (debugfs 'reset' file), real firmware crashes (the 'dd' command above), and a variety of reboot and suspend/resume configurations on QCA6174A. Reported here: http://lkml.kernel.org/linux-wireless/20190325202706.GA68720@google.com Fixes: 25733c4e67df ("ath10k: pci: use mutex for diagnostic window CE polling") Signed-off-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/ce.c | 2 +- drivers/net/wireless/ath/ath10k/core.c | 1 + drivers/net/wireless/ath/ath10k/core.h | 3 +++ drivers/net/wireless/ath/ath10k/coredump.c | 6 +++--- drivers/net/wireless/ath/ath10k/pci.c | 24 +++++++++++++++++----- drivers/net/wireless/ath/ath10k/pci.h | 2 ++ 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/ce.c b/drivers/net/wireless/ath/ath10k/ce.c index 24b983edb357..eca87f7c5b6c 100644 --- a/drivers/net/wireless/ath/ath10k/ce.c +++ b/drivers/net/wireless/ath/ath10k/ce.c @@ -1855,7 +1855,7 @@ void ath10k_ce_dump_registers(struct ath10k *ar, struct ath10k_ce_crash_data ce_data; u32 addr, id; - lockdep_assert_held(&ar->data_lock); + lockdep_assert_held(&ar->dump_mutex); ath10k_err(ar, "Copy Engine register dump:\n"); diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 835b8de92d55..aff585658fc0 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -3119,6 +3119,7 @@ struct ath10k *ath10k_core_create(size_t priv_size, struct device *dev, goto err_free_wq; mutex_init(&ar->conf_mutex); + mutex_init(&ar->dump_mutex); spin_lock_init(&ar->data_lock); INIT_LIST_HEAD(&ar->peers); diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index e08a17b01e03..e35aae5146f1 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -1063,6 +1063,9 @@ struct ath10k { /* prevents concurrent FW reconfiguration */ struct mutex conf_mutex; + /* protects coredump data */ + struct mutex dump_mutex; + /* protects shared structure data */ spinlock_t data_lock; diff --git a/drivers/net/wireless/ath/ath10k/coredump.c b/drivers/net/wireless/ath/ath10k/coredump.c index 33838d9c1cb6..45a355fb62b9 100644 --- a/drivers/net/wireless/ath/ath10k/coredump.c +++ b/drivers/net/wireless/ath/ath10k/coredump.c @@ -1102,7 +1102,7 @@ struct ath10k_fw_crash_data *ath10k_coredump_new(struct ath10k *ar) { struct ath10k_fw_crash_data *crash_data = ar->coredump.fw_crash_data; - lockdep_assert_held(&ar->data_lock); + lockdep_assert_held(&ar->dump_mutex); if (ath10k_coredump_mask == 0) /* coredump disabled */ @@ -1146,7 +1146,7 @@ static struct ath10k_dump_file_data *ath10k_coredump_build(struct ath10k *ar) if (!buf) return NULL; - spin_lock_bh(&ar->data_lock); + mutex_lock(&ar->dump_mutex); dump_data = (struct ath10k_dump_file_data *)(buf); strlcpy(dump_data->df_magic, "ATH10K-FW-DUMP", @@ -1213,7 +1213,7 @@ static struct ath10k_dump_file_data *ath10k_coredump_build(struct ath10k *ar) sofar += sizeof(*dump_tlv) + crash_data->ramdump_buf_len; } - spin_unlock_bh(&ar->data_lock); + mutex_unlock(&ar->dump_mutex); return dump_data; } diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c index 271f92c24d44..2c27f407a851 100644 --- a/drivers/net/wireless/ath/ath10k/pci.c +++ b/drivers/net/wireless/ath/ath10k/pci.c @@ -1441,7 +1441,7 @@ static void ath10k_pci_dump_registers(struct ath10k *ar, __le32 reg_dump_values[REG_DUMP_COUNT_QCA988X] = {}; int i, ret; - lockdep_assert_held(&ar->data_lock); + lockdep_assert_held(&ar->dump_mutex); ret = ath10k_pci_diag_read_hi(ar, ®_dump_values[0], hi_failure_state, @@ -1656,7 +1656,7 @@ static void ath10k_pci_dump_memory(struct ath10k *ar, int ret, i; u8 *buf; - lockdep_assert_held(&ar->data_lock); + lockdep_assert_held(&ar->dump_mutex); if (!crash_data) return; @@ -1734,14 +1734,19 @@ static void ath10k_pci_dump_memory(struct ath10k *ar, } } -static void ath10k_pci_fw_crashed_dump(struct ath10k *ar) +static void ath10k_pci_fw_dump_work(struct work_struct *work) { + struct ath10k_pci *ar_pci = container_of(work, struct ath10k_pci, + dump_work); struct ath10k_fw_crash_data *crash_data; + struct ath10k *ar = ar_pci->ar; char guid[UUID_STRING_LEN + 1]; - spin_lock_bh(&ar->data_lock); + mutex_lock(&ar->dump_mutex); + spin_lock_bh(&ar->data_lock); ar->stats.fw_crash_counter++; + spin_unlock_bh(&ar->data_lock); crash_data = ath10k_coredump_new(ar); @@ -1756,11 +1761,18 @@ static void ath10k_pci_fw_crashed_dump(struct ath10k *ar) ath10k_ce_dump_registers(ar, crash_data); ath10k_pci_dump_memory(ar, crash_data); - spin_unlock_bh(&ar->data_lock); + mutex_unlock(&ar->dump_mutex); queue_work(ar->workqueue, &ar->restart_work); } +static void ath10k_pci_fw_crashed_dump(struct ath10k *ar) +{ + struct ath10k_pci *ar_pci = ath10k_pci_priv(ar); + + queue_work(ar->workqueue, &ar_pci->dump_work); +} + void ath10k_pci_hif_send_complete_check(struct ath10k *ar, u8 pipe, int force) { @@ -3442,6 +3454,8 @@ int ath10k_pci_setup_resource(struct ath10k *ar) spin_lock_init(&ar_pci->ps_lock); mutex_init(&ar_pci->ce_diag_mutex); + INIT_WORK(&ar_pci->dump_work, ath10k_pci_fw_dump_work); + timer_setup(&ar_pci->rx_post_retry, ath10k_pci_rx_replenish_retry, 0); if (QCA_REV_6174(ar) || QCA_REV_9377(ar)) diff --git a/drivers/net/wireless/ath/ath10k/pci.h b/drivers/net/wireless/ath/ath10k/pci.h index 3773c79f322f..4455ed6c5275 100644 --- a/drivers/net/wireless/ath/ath10k/pci.h +++ b/drivers/net/wireless/ath/ath10k/pci.h @@ -121,6 +121,8 @@ struct ath10k_pci { /* For protecting ce_diag */ struct mutex ce_diag_mutex; + struct work_struct dump_work; + struct ath10k_ce ce; struct timer_list rx_post_retry; From 9e80ad37f6788ed52b89a3cfcd593e0aa69b216d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 3 Mar 2019 18:24:33 +0100 Subject: [PATCH 135/181] ath10k: Drop WARN_ON()s that always trigger during system resume ath10k_mac_vif_chan() always returns an error for the given vif during system-wide resume which reliably triggers two WARN_ON()s in ath10k_bss_info_changed() and they are not particularly useful in that code path, so drop them. Tested: QCA6174 hw3.2 PCI with WLAN.RM.2.0-00180-QCARMSWPZ-1 Tested: QCA6174 hw3.2 SDIO with WLAN.RMH.4.4.1-00007-QCARMSWP-1 Fixes: cd93b83ad927 ("ath10k: support for multicast rate control") Fixes: f279294e9ee2 ("ath10k: add support for configuring management packet rate") Cc: stable@vger.kernel.org Reviewed-by: Brian Norris Tested-by: Brian Norris Tested-by: Claire Chang Signed-off-by: Rafael J. Wysocki Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/mac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 41e89db244d2..9c703d287333 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -5774,7 +5774,7 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw, } if (changed & BSS_CHANGED_MCAST_RATE && - !WARN_ON(ath10k_mac_vif_chan(arvif->vif, &def))) { + !ath10k_mac_vif_chan(arvif->vif, &def)) { band = def.chan->band; rateidx = vif->bss_conf.mcast_rate[band] - 1; @@ -5812,7 +5812,7 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw, } if (changed & BSS_CHANGED_BASIC_RATES) { - if (WARN_ON(ath10k_mac_vif_chan(vif, &def))) { + if (ath10k_mac_vif_chan(vif, &def)) { mutex_unlock(&ar->conf_mutex); return; } From dfbd199a7cfe3e3cd8531e1353cdbd7175bfbc5e Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 24 Feb 2019 21:55:28 -0300 Subject: [PATCH 136/181] selinux: use kernel linux/socket.h for genheaders and mdp When compiling genheaders and mdp from a newer host kernel, the following error happens: In file included from scripts/selinux/genheaders/genheaders.c:18: ./security/selinux/include/classmap.h:238:2: error: #error New address family defined, please update secclass_map. #error New address family defined, please update secclass_map. ^~~~~ make[3]: *** [scripts/Makefile.host:107: scripts/selinux/genheaders/genheaders] Error 1 make[2]: *** [scripts/Makefile.build:599: scripts/selinux/genheaders] Error 2 make[1]: *** [scripts/Makefile.build:599: scripts/selinux] Error 2 make[1]: *** Waiting for unfinished jobs.... Instead of relying on the host definition, include linux/socket.h in classmap.h to have PF_MAX. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara Acked-by: Stephen Smalley [PM: manually merge in mdp.c, subject line tweaks] Signed-off-by: Paul Moore --- scripts/selinux/genheaders/genheaders.c | 1 - scripts/selinux/mdp/mdp.c | 1 - security/selinux/include/classmap.h | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/selinux/genheaders/genheaders.c b/scripts/selinux/genheaders/genheaders.c index 1ceedea847dd..544ca126a8a8 100644 --- a/scripts/selinux/genheaders/genheaders.c +++ b/scripts/selinux/genheaders/genheaders.c @@ -9,7 +9,6 @@ #include #include #include -#include struct security_class_mapping { const char *name; diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c index 073fe7537f6c..6d51b74bc679 100644 --- a/scripts/selinux/mdp/mdp.c +++ b/scripts/selinux/mdp/mdp.c @@ -32,7 +32,6 @@ #include #include #include -#include static void usage(char *name) { diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index bd5fe0d3204a..201f7e588a29 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include #define COMMON_FILE_SOCK_PERMS "ioctl", "read", "write", "create", \ "getattr", "setattr", "lock", "relabelfrom", "relabelto", "append", "map" From 6a5c5d26c4c6c3cc486fef0bf04ff9551132611b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Apr 2019 09:48:53 -0700 Subject: [PATCH 137/181] rdma: fix build errors on s390 and MIPS due to bad ZERO_PAGE use The parameter to ZERO_PAGE() was wrong, but since all architectures except for MIPS and s390 ignore it, it wasn't noticed until 0-day reported the build error. Fixes: 67f269b37f9b ("RDMA/ucontext: Fix regression with disassociate") Cc: stable@vger.kernel.org Cc: Andrea Arcangeli Cc: Leon Romanovsky Cc: Jason Gunthorpe Signed-off-by: Linus Torvalds --- drivers/infiniband/core/uverbs_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 7843e89235c3..c489f545baae 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -895,7 +895,7 @@ static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) /* Read only pages can just use the system zero page. */ if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { - vmf->page = ZERO_PAGE(vmf->vm_start); + vmf->page = ZERO_PAGE(vmf->address); get_page(vmf->page); return 0; } From 80871482fd5cb1cb396ea232237a7d9c540854f9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Apr 2019 09:51:29 -0700 Subject: [PATCH 138/181] x86: make ZERO_PAGE() at least parse its argument This doesn't really do anything, but at least we now parse teh ZERO_PAGE() address argument so that we'll catch the most obvious errors in usage next time they'll happen. See commit 6a5c5d26c4c6 ("rdma: fix build errors on s390 and MIPS due to bad ZERO_PAGE use") what happens when we don't have any use of the macro argument at all. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2779ace16d23..50b3e2d963c9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -46,7 +46,7 @@ void ptdump_walk_user_pgd_level_checkwx(void); */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __visible; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; extern struct list_head pgd_list; From 95c169251bf734aa555a1e8043e4d88ec97a04ec Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 25 Apr 2019 12:06:54 -0400 Subject: [PATCH 139/181] ipv6: invert flowlabel sharing check in process and user mode A request for a flowlabel fails in process or user exclusive mode must fail if the caller pid or uid does not match. Invert the test. Previously, the test was unsafe wrt PID recycling, but indeed tested for inequality: fl1->owner != fl->owner Fixes: 4f82f45730c68 ("net ip6 flowlabel: Make owner a union of struct pid* and kuid_t") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index cb54a8a3c273..a05036bc808d 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -633,9 +633,9 @@ recheck: if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || ((fl1->share == IPV6_FL_S_PROCESS) && - (fl1->owner.pid == fl->owner.pid)) || + (fl1->owner.pid != fl->owner.pid)) || ((fl1->share == IPV6_FL_S_USER) && - uid_eq(fl1->owner.uid, fl->owner.uid))) + !uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; From ca2fe2956acef2f87f6c55549874fdd2e92d9824 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 26 Apr 2019 10:10:05 -0700 Subject: [PATCH 140/181] tcp: add sanity tests in tcp_add_backlog() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard and Bruno both reported that my commit added a bug, and Bruno was able to determine the problem came when a segment wih a FIN packet was coalesced to a prior one in tcp backlog queue. It turns out the header prediction in tcp_rcv_established() looks back to TCP headers in the packet, not in the metadata (aka TCP_SKB_CB(skb)->tcp_flags) The fast path in tcp_rcv_established() is not supposed to handle a FIN flag (it does not call tcp_fin()) Therefore we need to make sure to propagate the FIN flag, so that the coalesced packet does not go through the fast path, the same than a GRO packet carrying a FIN flag. While we are at it, make sure we do not coalesce packets with RST or SYN, or if they do not have ACK set. Many thanks to Richard and Bruno for pinpointing the bad commit, and to Richard for providing a first version of the fix. Fixes: 4f693b55c3d2 ("tcp: implement coalescing on backlog queue") Signed-off-by: Eric Dumazet Reported-by: Richard Purdie Reported-by: Bruno Prémont Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2f8039a26b08..a2896944aa37 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1673,7 +1673,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || ((TCP_SKB_CB(tail)->tcp_flags | - TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) || + TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || + !((TCP_SKB_CB(tail)->tcp_flags & + TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || #ifdef CONFIG_TLS_DEVICE @@ -1692,6 +1694,15 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + /* We have to update both TCP_SKB_CB(tail)->tcp_flags and + * thtail->fin, so that the fast path in tcp_rcv_established() + * is not entered if we append a packet with a FIN. + * SYN, RST, URG are not present. + * ACK is set on both packets. + * PSH : we do not really care in TCP stack, + * at least for 'GRO' packets. + */ + thtail->fin |= th->fin; TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; if (TCP_SKB_CB(skb)->has_rxtstamp) { From 1d3fd8a10bedb09006cfc963bfcf051c3021f626 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Sat, 27 Apr 2019 09:14:33 -0400 Subject: [PATCH 141/181] vrf: Use orig netdev to count Ip6InNoRoutes and a fresh route lookup when sending dest unreach When there is no route to an IPv6 dest addr, skb_dst(skb) points to loopback dev in the case of that the IP6CB(skb)->iif is enslaved to a vrf. This causes Ip6InNoRoutes to be incremented on the loopback dev. This also causes the lookup to fail on icmpv6_send() and the dest unreachable to not sent and Ip6OutNoRoutes gets incremented on the loopback dev. To reproduce: * Gateway configuration: ip link add dev vrf_258 type vrf table 258 ip link set dev enp0s9 master vrf_258 ip addr add 66:1/64 dev enp0s9 ip -6 route add unreachable default metric 8192 table 258 sysctl -w net.ipv6.conf.all.forwarding=1 sysctl -w net.ipv6.conf.enp0s9.forwarding=1 * Sender configuration: ip addr add 66::2/64 dev enp0s9 ip -6 route add default via 66::1 and ping 67::1 for example from the sender. Fix this by counting on the original netdev and reset the skb dst to force a fresh lookup. v2: Fix typo of destination address in the repro steps. v3: Simplify the loopback check (per David Ahern) and use reverse Christmas tree format (per David Miller). Signed-off-by: Stephen Suryaputra Reviewed-by: David Ahern Tested-by: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7178e32eb15d..b4899f0de0d0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3668,23 +3668,34 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { - int type; struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(dst->dev); + struct inet6_dev *idev; + int type; + + if (netif_is_l3_master(skb->dev) && + dst->dev == net->loopback_dev) + idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); + else + idev = ip6_dst_idev(dst); + switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { - IP6_INC_STATS(dev_net(dst->dev), - __in6_dev_get_safely(skb->dev), - IPSTATS_MIB_INADDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); break; } /* FALLTHROUGH */ case IPSTATS_MIB_OUTNOROUTES: - IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), - ipstats_mib_noroutes); + IP6_INC_STATS(net, idev, ipstats_mib_noroutes); break; } + + /* Start over by dropping the dst for l3mdev case */ + if (netif_is_l3_master(skb->dev)) + skb_dst_drop(skb); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb(skb); return 0; From 6c0afef5fb0c27758f4d52b2210c61b6bd8b4470 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Apr 2019 16:49:06 -0700 Subject: [PATCH 142/181] ipv6/flowlabel: wait rcu grace period before put_pid() syzbot was able to catch a use-after-free read in pid_nr_ns() [1] ip6fl_seq_show() seems to use RCU protection, dereferencing fl->owner.pid but fl_free() releases fl->owner.pid before rcu grace period is started. [1] BUG: KASAN: use-after-free in pid_nr_ns+0x128/0x140 kernel/pid.c:407 Read of size 4 at addr ffff888094012a04 by task syz-executor.0/18087 CPU: 0 PID: 18087 Comm: syz-executor.0 Not tainted 5.1.0-rc6+ #89 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:187 kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 __asan_report_load4_noabort+0x14/0x20 mm/kasan/generic_report.c:131 pid_nr_ns+0x128/0x140 kernel/pid.c:407 ip6fl_seq_show+0x2f8/0x4f0 net/ipv6/ip6_flowlabel.c:794 seq_read+0xad3/0x1130 fs/seq_file.c:268 proc_reg_read+0x1fe/0x2c0 fs/proc/inode.c:227 do_loop_readv_writev fs/read_write.c:701 [inline] do_loop_readv_writev fs/read_write.c:688 [inline] do_iter_read+0x4a9/0x660 fs/read_write.c:922 vfs_readv+0xf0/0x160 fs/read_write.c:984 kernel_readv fs/splice.c:358 [inline] default_file_splice_read+0x475/0x890 fs/splice.c:413 do_splice_to+0x12a/0x190 fs/splice.c:876 splice_direct_to_actor+0x2d2/0x970 fs/splice.c:953 do_splice_direct+0x1da/0x2a0 fs/splice.c:1062 do_sendfile+0x597/0xd00 fs/read_write.c:1443 __do_sys_sendfile64 fs/read_write.c:1498 [inline] __se_sys_sendfile64 fs/read_write.c:1490 [inline] __x64_sys_sendfile64+0x15a/0x220 fs/read_write.c:1490 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x458da9 Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f300d24bc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 0000000000458da9 RDX: 00000000200000c0 RSI: 0000000000000008 RDI: 0000000000000007 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 000000000000005a R11: 0000000000000246 R12: 00007f300d24c6d4 R13: 00000000004c5fa3 R14: 00000000004da748 R15: 00000000ffffffff Allocated by task 17543: save_stack+0x45/0xd0 mm/kasan/common.c:75 set_track mm/kasan/common.c:87 [inline] __kasan_kmalloc mm/kasan/common.c:497 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:470 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:505 slab_post_alloc_hook mm/slab.h:437 [inline] slab_alloc mm/slab.c:3393 [inline] kmem_cache_alloc+0x11a/0x6f0 mm/slab.c:3555 alloc_pid+0x55/0x8f0 kernel/pid.c:168 copy_process.part.0+0x3b08/0x7980 kernel/fork.c:1932 copy_process kernel/fork.c:1709 [inline] _do_fork+0x257/0xfd0 kernel/fork.c:2226 __do_sys_clone kernel/fork.c:2333 [inline] __se_sys_clone kernel/fork.c:2327 [inline] __x64_sys_clone+0xbf/0x150 kernel/fork.c:2327 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 7789: save_stack+0x45/0xd0 mm/kasan/common.c:75 set_track mm/kasan/common.c:87 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:459 kasan_slab_free+0xe/0x10 mm/kasan/common.c:467 __cache_free mm/slab.c:3499 [inline] kmem_cache_free+0x86/0x260 mm/slab.c:3765 put_pid.part.0+0x111/0x150 kernel/pid.c:111 put_pid+0x20/0x30 kernel/pid.c:105 fl_free+0xbe/0xe0 net/ipv6/ip6_flowlabel.c:102 ip6_fl_gc+0x295/0x3e0 net/ipv6/ip6_flowlabel.c:152 call_timer_fn+0x190/0x720 kernel/time/timer.c:1325 expire_timers kernel/time/timer.c:1362 [inline] __run_timers kernel/time/timer.c:1681 [inline] __run_timers kernel/time/timer.c:1649 [inline] run_timer_softirq+0x652/0x1700 kernel/time/timer.c:1694 __do_softirq+0x266/0x95a kernel/softirq.c:293 The buggy address belongs to the object at ffff888094012a00 which belongs to the cache pid_2 of size 88 The buggy address is located 4 bytes inside of 88-byte region [ffff888094012a00, ffff888094012a58) The buggy address belongs to the page: page:ffffea0002500480 count:1 mapcount:0 mapping:ffff88809a483080 index:0xffff888094012980 flags: 0x1fffc0000000200(slab) raw: 01fffc0000000200 ffffea00018a3508 ffffea0002524a88 ffff88809a483080 raw: ffff888094012980 ffff888094012000 000000010000001b 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888094012900: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc ffff888094012980: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc >ffff888094012a00: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc ^ ffff888094012a80: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc ffff888094012b00: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc Fixes: 4f82f45730c6 ("net ip6 flowlabel: Make owner a union of struct pid * and kuid_t") Signed-off-by: Eric Dumazet Cc: Eric W. Biederman Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a05036bc808d..be5f3d7ceb96 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -94,15 +94,21 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) return fl; } +static void fl_free_rcu(struct rcu_head *head) +{ + struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); + + if (fl->share == IPV6_FL_S_PROCESS) + put_pid(fl->owner.pid); + kfree(fl->opt); + kfree(fl); +} + static void fl_free(struct ip6_flowlabel *fl) { - if (fl) { - if (fl->share == IPV6_FL_S_PROCESS) - put_pid(fl->owner.pid); - kfree(fl->opt); - kfree_rcu(fl, rcu); - } + if (fl) + call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) From b13023421b5179413421333f602850914f6a7ad8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Apr 2019 08:34:08 +0100 Subject: [PATCH 143/181] rxrpc: Fix net namespace cleanup In rxrpc_destroy_all_calls(), there are two phases: (1) make sure the ->calls list is empty, emitting error messages if not, and (2) wait for the RCU cleanup to happen on outstanding calls (ie. ->nr_calls becomes 0). To avoid taking the call_lock, the function prechecks ->calls and if empty, it returns to avoid taking the lock - this is wrong, however: it still needs to go and do the second phase and wait for ->nr_calls to become 0. Without this, the rxrpc_net struct may get deallocated before we get to the RCU cleanup for the last calls. This can lead to: Slab corruption (Not tainted): kmalloc-16k start=ffff88802b178000, len=16384 050: 6b 6b 6b 6b 6b 6b 6b 6b 61 6b 6b 6b 6b 6b 6b 6b kkkkkkkkakkkkkkk Note the "61" at offset 0x58. This corresponds to the ->nr_calls member of struct rxrpc_net (which is >9k in size, and thus allocated out of the 16k slab). Fix this by flipping the condition on the if-statement, putting the locked section inside the if-body and dropping the return from there. The function will then always go on to wait for the RCU cleanup on outstanding calls. Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/call_object.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8aa2937b069f..fe96881a334d 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -604,30 +604,30 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) _enter(""); - if (list_empty(&rxnet->calls)) - return; + if (!list_empty(&rxnet->calls)) { + write_lock(&rxnet->call_lock); - write_lock(&rxnet->call_lock); + while (!list_empty(&rxnet->calls)) { + call = list_entry(rxnet->calls.next, + struct rxrpc_call, link); + _debug("Zapping call %p", call); - while (!list_empty(&rxnet->calls)) { - call = list_entry(rxnet->calls.next, struct rxrpc_call, link); - _debug("Zapping call %p", call); + rxrpc_see_call(call); + list_del_init(&call->link); - rxrpc_see_call(call); - list_del_init(&call->link); + pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", + call, atomic_read(&call->usage), + rxrpc_call_states[call->state], + call->flags, call->events); - pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", - call, atomic_read(&call->usage), - rxrpc_call_states[call->state], - call->flags, call->events); + write_unlock(&rxnet->call_lock); + cond_resched(); + write_lock(&rxnet->call_lock); + } write_unlock(&rxnet->call_lock); - cond_resched(); - write_lock(&rxnet->call_lock); } - write_unlock(&rxnet->call_lock); - atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } From f949a12fd697479f68d99dc65e9bbab68ee49043 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 30 Apr 2019 13:44:19 +0300 Subject: [PATCH 144/181] net: dsa: bcm_sf2: fix buffer overflow doing set_rxnfc The "fs->location" is a u32 that comes from the user in ethtool_set_rxnfc(). We can't pass unclamped values to test_bit() or it results in an out of bounds access beyond the end of the bitmap. Fixes: 7318166cacad ("net: dsa: bcm_sf2: Add support for ethtool::rxnfc") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2_cfp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index e6234d209787..4212bc4a5f31 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -886,6 +886,9 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port, fs->m_ext.data[1])) return -EINVAL; + if (fs->location != RX_CLS_LOC_ANY && fs->location >= CFP_NUM_RULES) + return -EINVAL; + if (fs->location != RX_CLS_LOC_ANY && test_bit(fs->location, priv->cfp.used)) return -EBUSY; @@ -974,6 +977,9 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port, u32 loc) struct cfp_rule *rule; int ret; + if (loc >= CFP_NUM_RULES) + return -EINVAL; + /* Refuse deleting unused rules, and those that are not unique since * that could leave IPv6 rules with one of the chained rule in the * table. From c93ad1337ad06a718890a89cdd85188ff9a5a5cc Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 30 Apr 2019 19:34:08 +0800 Subject: [PATCH 145/181] appletalk: Set error code if register_snap_client failed If register_snap_client fails in atalk_init, error code should be set, otherwise it will triggers NULL pointer dereference while unloading module. Fixes: 9804501fa122 ("appletalk: Fix potential NULL pointer dereference in unregister_snap_client") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 709d2542f729..dbe8b1993be9 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1920,6 +1920,7 @@ static int __init atalk_init(void) ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv); if (!ddp_dl) { pr_crit("Unable to register DDP with SNAP.\n"); + rc = -ENOMEM; goto out_sock; } From a622b40035d16196bf19b2b33b854862595245fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Apr 2019 06:27:58 -0700 Subject: [PATCH 146/181] l2ip: fix possible use-after-free Before taking a refcount on a rcu protected structure, we need to make sure the refcount is not zero. syzbot reported : refcount_t: increment on 0; use-after-free. WARNING: CPU: 1 PID: 23533 at lib/refcount.c:156 refcount_inc_checked lib/refcount.c:156 [inline] WARNING: CPU: 1 PID: 23533 at lib/refcount.c:156 refcount_inc_checked+0x61/0x70 lib/refcount.c:154 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 23533 Comm: syz-executor.2 Not tainted 5.1.0-rc7+ #93 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 panic+0x2cb/0x65c kernel/panic.c:214 __warn.cold+0x20/0x45 kernel/panic.c:571 report_bug+0x263/0x2b0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:179 [inline] fixup_bug arch/x86/kernel/traps.c:174 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973 RIP: 0010:refcount_inc_checked lib/refcount.c:156 [inline] RIP: 0010:refcount_inc_checked+0x61/0x70 lib/refcount.c:154 Code: 1d 98 2b 2a 06 31 ff 89 de e8 db 2c 40 fe 84 db 75 dd e8 92 2b 40 fe 48 c7 c7 20 7a a1 87 c6 05 78 2b 2a 06 01 e8 7d d9 12 fe <0f> 0b eb c1 90 90 90 90 90 90 90 90 90 90 90 55 48 89 e5 41 57 41 RSP: 0018:ffff888069f0fba8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 000000000000f353 RSI: ffffffff815afcb6 RDI: ffffed100d3e1f67 RBP: ffff888069f0fbb8 R08: ffff88809b1845c0 R09: ffffed1015d23ef1 R10: ffffed1015d23ef0 R11: ffff8880ae91f787 R12: ffff8880a8f26968 R13: 0000000000000004 R14: dffffc0000000000 R15: ffff8880a49a6440 l2tp_tunnel_inc_refcount net/l2tp/l2tp_core.h:240 [inline] l2tp_tunnel_get+0x250/0x580 net/l2tp/l2tp_core.c:173 pppol2tp_connect+0xc00/0x1c70 net/l2tp/l2tp_ppp.c:702 __sys_connect+0x266/0x330 net/socket.c:1808 __do_sys_connect net/socket.c:1819 [inline] __se_sys_connect net/socket.c:1816 [inline] __x64_sys_connect+0x73/0xb0 net/socket.c:1816 Fixes: 54652eb12c1b ("l2tp: hold tunnel while looking up sessions in l2tp_netlink") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index aee33d132018..52b5a2797c0c 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -169,8 +169,8 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) rcu_read_lock_bh(); list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (tunnel->tunnel_id == tunnel_id) { - l2tp_tunnel_inc_refcount(tunnel); + if (tunnel->tunnel_id == tunnel_id && + refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; @@ -190,8 +190,8 @@ struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth) rcu_read_lock_bh(); list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (++count > nth) { - l2tp_tunnel_inc_refcount(tunnel); + if (++count > nth && + refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; } From 8449eedaa1da6a51d67190c905b1b54243e095f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Sat, 27 Apr 2019 20:34:19 +0200 Subject: [PATCH 147/181] io_uring: fix handling SQEs requesting NOWAIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all request types set REQ_F_FORCE_NONBLOCK when they needed async punting; reverse logic instead and set REQ_F_NOWAIT if request mustn't be punted. Signed-off-by: Stefan Bühler Merged with my previous patch for this. Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0e9fb2cb1984..d5e23a6dd6aa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -221,7 +221,7 @@ struct io_kiocb { struct list_head list; unsigned int flags; refcount_t refs; -#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ @@ -774,10 +774,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) return ret; - if (force_nonblock) { + + /* don't allow async punt if RWF_NOWAIT was requested */ + if (kiocb->ki_flags & IOCB_NOWAIT) + req->flags |= REQ_F_NOWAIT; + + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; - req->flags |= REQ_F_FORCE_NONBLOCK; - } + if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -1436,8 +1440,7 @@ restart: struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; - /* Ensure we clear previously set forced non-block flag */ - req->flags &= ~REQ_F_FORCE_NONBLOCK; + /* Ensure we clear previously set non-block flag */ req->rw.ki_flags &= ~IOCB_NOWAIT; ret = 0; @@ -1623,7 +1626,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, goto out; ret = __io_submit_sqe(ctx, req, s, true); - if (ret == -EAGAIN) { + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); From 1e84b97b7377bd0198f87b49ad3e396e84bf0458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Wed, 24 Apr 2019 23:54:16 +0200 Subject: [PATCH 148/181] io_uring: fix notes on barriers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The application reading the CQ ring needs a barrier to pair with the smp_store_release in io_commit_cqring, not the barrier after it. Also a write barrier *after* writing something (but not *before* writing anything interesting) doesn't order anything, so an smp_wmb() after writing SQ tail is not needed. Additionally consider reading SQ head and writing CQ tail in the notes. Also add some clarifications how the various other fields in the ring buffers are used. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 110 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d5e23a6dd6aa..7ab93e854eb2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4,15 +4,28 @@ * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. When the application reads the CQ ring - * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() - * the kernel uses after writing the tail. Failure to do so could cause a - * delay in when the application notices that completion events available. - * This isn't a fatal condition. Likewise, the application must use an - * appropriate smp_wmb() both before writing the SQ tail, and after writing - * the SQ tail. The first one orders the sqe writes with the tail write, and - * the latter is paired with the smp_rmb() the kernel will issue before - * reading the SQ tail on submission. + * the application and kernel side. + * + * After the application reads the CQ ring tail, it must use an + * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses + * before writing the tail (using smp_load_acquire to read the tail will + * do). It also needs a smp_mb() before updating CQ head (ordering the + * entry load(s) with the head store), pairing with an implicit barrier + * through a control-dependency in io_get_cqring (smp_store_release to + * store head will do). Failure to do so could lead to reading invalid + * CQ entries. + * + * Likewise, the application must use an appropriate smp_wmb() before + * writing the SQ tail (ordering SQ entry stores with the tail store), + * which pairs with smp_load_acquire in io_get_sqring (smp_store_release + * to store the tail will do). And it needs a barrier ordering the SQ + * head load before writing new SQ entries (smp_load_acquire to read + * head will do). + * + * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application + * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* + * updating the SQ tail; a full memory barrier smp_mb() is needed + * between. * * Also see the examples in the liburing library: * @@ -70,20 +83,108 @@ struct io_uring { u32 tail ____cacheline_aligned_in_smp; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_SQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ struct io_sq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head and the application controls tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ u32 dropped; + /* + * Runtime flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. + */ u32 flags; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ u32 array[]; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_cqring_offsets when calling io_uring_setup. + */ struct io_cq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The application controls head and the kernel tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending thatn there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ u32 overflow; + /* + * Ring buffer of completion events. + * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ struct io_uring_cqe cqes[]; }; From 4f7067c3fb7f2974363a28c597a41949d971af02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Wed, 24 Apr 2019 23:54:17 +0200 Subject: [PATCH 149/181] io_uring: remove unnecessary barrier before wq_has_sleeper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wq_has_sleeper has a full barrier internally. The smp_rmb barrier in io_uring_poll synchronizes with it. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7ab93e854eb2..bb71b7f00bb3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -418,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) /* order cqe stores with ring update */ smp_store_release(&ring->r.tail, ctx->cached_cq_tail); - /* - * Write sider barrier of tail update, app has read side. See - * comment at the top of this file. - */ - smp_wmb(); - if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); @@ -2677,7 +2671,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) __poll_t mask = 0; poll_wait(file, &ctx->cq_wait, wait); - /* See comment at the top of this file */ + /* + * synchronizes with barrier from wq_has_sleeper call in + * io_commit_cqring + */ smp_rmb(); if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != ctx->sq_ring->ring_entries) From 115e12e58dbc055e98c965e3255aed7b20214f95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Wed, 24 Apr 2019 23:54:18 +0200 Subject: [PATCH 150/181] io_uring: remove unnecessary barrier before reading cq head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory operations before reading cq head are unrelated and we don't care about their order. Document that the control dependency in combination with READ_ONCE and WRITE_ONCE forms a barrier we need. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bb71b7f00bb3..3671a654a146 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -431,8 +431,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) unsigned tail; tail = ctx->cached_cq_tail; - /* See comment at the top of the file */ - smp_rmb(); + /* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) return NULL; From 9e4c15a3939448d2ea9b9bf59561183bbe3fdc49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Wed, 24 Apr 2019 23:54:19 +0200 Subject: [PATCH 151/181] io_uring: remove unnecessary barrier after updating SQ head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no operation afterwards to order with. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3671a654a146..d3c57ee233fe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1798,12 +1798,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * write new data to them. */ smp_store_release(&ring->r.head, ctx->cached_sq_head); - - /* - * write side barrier of head update, app has read side. See - * comment at the top of this file - */ - smp_wmb(); } } From 82ab082c0e2f8592c2ff6b2ab99a92d8406c8c2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Wed, 24 Apr 2019 23:54:20 +0200 Subject: [PATCH 152/181] io_uring: remove unnecessary barrier before reading SQ tail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no operation before to order with. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d3c57ee233fe..662f1c070c8c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1831,8 +1831,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) * though the application is the one updating it. */ head = ctx->cached_sq_head; - /* See comment at the top of this file */ - smp_rmb(); /* make sure SQ entry isn't read before tail */ if (head == smp_load_acquire(&ring->r.tail)) return false; From b841f19524a16cd93a39f9306191f85c549a2bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Wed, 24 Apr 2019 23:54:21 +0200 Subject: [PATCH 153/181] io_uring: remove unnecessary barrier after incrementing dropped counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smp_store_release in io_commit_sqring already orders the store to dropped before the update to SQ head. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 662f1c070c8c..2ebc33cc907b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1846,8 +1846,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; ring->dropped++; - /* See comment at the top of this file */ - smp_wmb(); return false; } From 62977281a6384d3904c02272a638cc3ac3bac54d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20B=C3=BChler?= Date: Wed, 24 Apr 2019 23:54:22 +0200 Subject: [PATCH 154/181] io_uring: remove unnecessary barrier after unsetting IORING_SQ_NEED_WAKEUP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no operation to order with afterwards, and removing the flag is not critical in any way. There will always be a "race condition" where the application will trigger IORING_ENTER_SQ_WAKEUP when it isn't actually needed. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2ebc33cc907b..77b247b5d10b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1969,13 +1969,11 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); continue; } finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); } i = 0; From 2c2a2fb1e2a9256714338875bede6b7cbd4b9542 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 30 Apr 2019 11:18:21 +0200 Subject: [PATCH 155/181] Revert "ACPICA: Clear status of GPEs before enabling them" Revert commit c8b1917c8987 ("ACPICA: Clear status of GPEs before enabling them") that causes problems with Thunderbolt controllers to occur if a dock device is connected at init time (the xhci_hcd and thunderbolt modules crash which prevents peripherals connected through them from working). Commit c8b1917c8987 effectively causes commit ecc1165b8b74 ("ACPICA: Dispatch active GPEs at init time") to get undone, so the problem addressed by commit ecc1165b8b74 appears again as a result of it. Fixes: c8b1917c8987 ("ACPICA: Clear status of GPEs before enabling them") Link: https://lore.kernel.org/lkml/s5hy33siofw.wl-tiwai@suse.de/T/#u Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1132943 Reported-by: Michael Hirmke Reported-by: Takashi Iwai Cc: 4.17+ # 4.17+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evgpe.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c index 5e9d7348c16f..62d3aa74277b 100644 --- a/drivers/acpi/acpica/evgpe.c +++ b/drivers/acpi/acpica/evgpe.c @@ -81,12 +81,8 @@ acpi_status acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) ACPI_FUNCTION_TRACE(ev_enable_gpe); - /* Clear the GPE status */ - status = acpi_hw_clear_gpe(gpe_event_info); - if (ACPI_FAILURE(status)) - return_ACPI_STATUS(status); - /* Enable the requested GPE */ + status = acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_ENABLE); return_ACPI_STATUS(status); } From 0e2338749192ce0e52e7174c5352f627632f478a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Apr 2019 12:22:25 -0700 Subject: [PATCH 156/181] ipv6: fix races in ip6_dst_destroy() We had many syzbot reports that seem to be caused by use-after-free of struct fib6_info. ip6_dst_destroy(), fib6_drop_pcpu_from() and rt6_remove_exception() are writers vs rt->from, and use non consistent synchronization among themselves. Switching to xchg() will solve the issues with no possible lockdep issues. BUG: KASAN: user-memory-access in atomic_dec_and_test include/asm-generic/atomic-instrumented.h:747 [inline] BUG: KASAN: user-memory-access in fib6_info_release include/net/ip6_fib.h:294 [inline] BUG: KASAN: user-memory-access in fib6_info_release include/net/ip6_fib.h:292 [inline] BUG: KASAN: user-memory-access in fib6_drop_pcpu_from net/ipv6/ip6_fib.c:927 [inline] BUG: KASAN: user-memory-access in fib6_purge_rt+0x4f6/0x670 net/ipv6/ip6_fib.c:960 Write of size 4 at addr 0000000000ffffb4 by task syz-executor.1/7649 CPU: 0 PID: 7649 Comm: syz-executor.1 Not tainted 5.1.0-rc6+ #183 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 kasan_report.cold+0x5/0x40 mm/kasan/report.c:321 check_memory_region_inline mm/kasan/generic.c:185 [inline] check_memory_region+0x123/0x190 mm/kasan/generic.c:191 kasan_check_write+0x14/0x20 mm/kasan/common.c:108 atomic_dec_and_test include/asm-generic/atomic-instrumented.h:747 [inline] fib6_info_release include/net/ip6_fib.h:294 [inline] fib6_info_release include/net/ip6_fib.h:292 [inline] fib6_drop_pcpu_from net/ipv6/ip6_fib.c:927 [inline] fib6_purge_rt+0x4f6/0x670 net/ipv6/ip6_fib.c:960 fib6_del_route net/ipv6/ip6_fib.c:1813 [inline] fib6_del+0xac2/0x10a0 net/ipv6/ip6_fib.c:1844 fib6_clean_node+0x3a8/0x590 net/ipv6/ip6_fib.c:2006 fib6_walk_continue+0x495/0x900 net/ipv6/ip6_fib.c:1928 fib6_walk+0x9d/0x100 net/ipv6/ip6_fib.c:1976 fib6_clean_tree+0xe0/0x120 net/ipv6/ip6_fib.c:2055 __fib6_clean_all+0x118/0x2a0 net/ipv6/ip6_fib.c:2071 fib6_clean_all+0x2b/0x40 net/ipv6/ip6_fib.c:2082 rt6_sync_down_dev+0x134/0x150 net/ipv6/route.c:4057 rt6_disable_ip+0x27/0x5f0 net/ipv6/route.c:4062 addrconf_ifdown+0xa2/0x1220 net/ipv6/addrconf.c:3705 addrconf_notify+0x19a/0x2260 net/ipv6/addrconf.c:3630 notifier_call_chain+0xc7/0x240 kernel/notifier.c:93 __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x2e/0x40 kernel/notifier.c:401 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1753 call_netdevice_notifiers_extack net/core/dev.c:1765 [inline] call_netdevice_notifiers net/core/dev.c:1779 [inline] dev_close_many+0x33f/0x6f0 net/core/dev.c:1522 rollback_registered_many+0x43b/0xfd0 net/core/dev.c:8177 rollback_registered+0x109/0x1d0 net/core/dev.c:8242 unregister_netdevice_queue net/core/dev.c:9289 [inline] unregister_netdevice_queue+0x1ee/0x2c0 net/core/dev.c:9282 unregister_netdevice include/linux/netdevice.h:2658 [inline] __tun_detach+0xd5b/0x1000 drivers/net/tun.c:727 tun_detach drivers/net/tun.c:744 [inline] tun_chr_close+0xe0/0x180 drivers/net/tun.c:3443 __fput+0x2e5/0x8d0 fs/file_table.c:278 ____fput+0x16/0x20 fs/file_table.c:309 task_work_run+0x14a/0x1c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x90a/0x2fa0 kernel/exit.c:876 do_group_exit+0x135/0x370 kernel/exit.c:980 __do_sys_exit_group kernel/exit.c:991 [inline] __se_sys_exit_group kernel/exit.c:989 [inline] __x64_sys_exit_group+0x44/0x50 kernel/exit.c:989 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x458da9 Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffeafc2a6a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 000000000000001c RCX: 0000000000458da9 RDX: 0000000000412a80 RSI: 0000000000a54ef0 RDI: 0000000000000043 RBP: 00000000004be552 R08: 000000000000000c R09: 000000000004c0d1 R10: 0000000002341940 R11: 0000000000000246 R12: 00000000ffffffff R13: 00007ffeafc2a7f0 R14: 000000000004c065 R15: 00007ffeafc2a800 Fixes: a68886a69180 ("net/ipv6: Make from in rt6_info rcu protected") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: David Ahern Reviewed-by: David Ahern Acked-by: Martin KaFai Lau Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 4 +--- net/ipv6/route.c | 9 ++------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6613d8dbb0e5..91247a6fc67f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -921,9 +921,7 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, if (pcpu_rt) { struct fib6_info *from; - from = rcu_dereference_protected(pcpu_rt->from, - lockdep_is_held(&table->tb6_lock)); - rcu_assign_pointer(pcpu_rt->from, NULL); + from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); fib6_info_release(from); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b4899f0de0d0..2cc166bc978d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -379,11 +379,8 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - rcu_read_lock(); - from = rcu_dereference(rt->from); - rcu_assign_pointer(rt->from, NULL); + from = xchg((__force struct fib6_info **)&rt->from, NULL); fib6_info_release(from); - rcu_read_unlock(); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -1288,9 +1285,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ - from = rcu_dereference_protected(rt6_ex->rt6i->from, - lockdep_is_held(&rt6_exception_lock)); - rcu_assign_pointer(rt6_ex->rt6i->from, NULL); + from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL); fib6_info_release(from); dst_dev_put(&rt6_ex->rt6i->dst); From 5c8b0b54db22c54f2aec991b388f550d3a927f26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 Apr 2019 10:16:07 -0600 Subject: [PATCH 157/181] io_uring: have submission side sqe errors post a cqe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we only post a cqe if we get an error OUTSIDE of submission. For submission, we return the error directly through io_uring_enter(). This is a bit awkward for applications, and it makes more sense to always post a cqe with an error, if the error happens on behalf of an sqe. This changes submission behavior a bit. io_uring_enter() returns -ERROR for an error, and > 0 for number of sqes submitted. Before this change, if you wanted to submit 8 entries and had an error on the 5th entry, io_uring_enter() would return 4 (for number of entries successfully submitted) and rewind the sqring. The application would then have to peek at the sqring and figure out what was wrong with the head sqe, and then skip it itself. With this change, we'll return 5 since we did consume 5 sqes, and the last sqe (with the error) will result in a cqe being posted with the error. This makes the logic easier to handle in the application, and it cleans up the submission part. Suggested-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 77b247b5d10b..0a894d7baceb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1801,14 +1801,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } } -/* - * Undo last io_get_sqring() - */ -static void io_drop_sqring(struct io_ring_ctx *ctx) -{ - ctx->cached_sq_head--; -} - /* * Fetch an sqe, if one is available. Note that s->sqe will point to memory * that is mapped by userspace. This means that care needs to be taken to @@ -2018,7 +2010,7 @@ static int io_sq_thread(void *data) static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; - int i, ret = 0, submit = 0; + int i, submit = 0; if (to_submit > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, to_submit); @@ -2027,6 +2019,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) for (i = 0; i < to_submit; i++) { struct sqe_submit s; + int ret; if (!io_get_sqring(ctx, &s)) break; @@ -2034,21 +2027,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; + submit++; ret = io_submit_sqe(ctx, &s, statep); - if (ret) { - io_drop_sqring(ctx); - break; - } - - submit++; + if (ret) + io_cqring_add_event(ctx, s.sqe->user_data, ret, 0); } io_commit_sqring(ctx); if (statep) io_submit_state_end(statep); - return submit ? submit : ret; + return submit; } static unsigned io_cqring_events(struct io_cq_ring *ring) @@ -2779,24 +2769,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); submitted = io_ring_submit(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - - if (submitted < 0) - goto out_ctx; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; min_complete = min(min_complete, ctx->cq_entries); - /* - * The application could have included the 'to_submit' count - * in how many events it wanted to wait for. If we failed to - * submit the desired count, we may need to adjust the number - * of events to poll/wait for. - */ - if (submitted < to_submit) - min_complete = min_t(unsigned, submitted, min_complete); - if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); ret = io_iopoll_check(ctx, &nr_events, min_complete); From 799381e49b4e7b0177664cdbc54a2de8b41647a8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 28 Apr 2019 18:10:39 -0700 Subject: [PATCH 158/181] Documentation: fix netdev-FAQ.rst markup warning Fix ReST underline warning: ./Documentation/networking/netdev-FAQ.rst:135: WARNING: Title underline too short. Q: I made changes to only a few patches in a patch series should I resend only those changed? -------------------------------------------------------------------------------------------- Fixes: ffa91253739c ("Documentation: networking: Update netdev-FAQ regarding patches") Signed-off-by: Randy Dunlap Cc: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/netdev-FAQ.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 8c7a713cf657..642fa963be3c 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -132,7 +132,7 @@ version that should be applied. If there is any doubt, the maintainer will reply and ask what should be done. Q: I made changes to only a few patches in a patch series should I resend only those changed? --------------------------------------------------------------------------------------------- +--------------------------------------------------------------------------------------------- A: No, please resend the entire patch series and make sure you do number your patches such that it is clear this is the latest and greatest set of patches that can be applied. From 37e9c087c81447cebd4ca022226829e319b0e280 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 29 Apr 2019 07:51:44 +0200 Subject: [PATCH 159/181] stmmac: pci: Fix typo in IOT2000 comment Signed-off-by: Jan Kiszka Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index cc1e887e47b5..26db6aa002d1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -160,7 +160,7 @@ static const struct dmi_system_id quark_pci_dmi[] = { .driver_data = (void *)&galileo_stmmac_dmi_data, }, /* - * There are 2 types of SIMATIC IOT2000: IOT20202 and IOT2040. + * There are 2 types of SIMATIC IOT2000: IOT2020 and IOT2040. * The asset tag "6ES7647-0AA00-0YA2" is only for IOT2020 which * has only one pci network device while other asset tags are * for IOT2040 which has two. From fbd019737d71e405f86549fd738f81e2ff3dd073 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 29 Apr 2019 14:16:19 +0800 Subject: [PATCH 160/181] sctp: avoid running the sctp state machine recursively Ying triggered a call trace when doing an asconf testing: BUG: scheduling while atomic: swapper/12/0/0x10000100 Call Trace: [] dump_stack+0x19/0x1b [] __schedule_bug+0x64/0x72 [] __schedule+0x9ba/0xa00 [] __cond_resched+0x26/0x30 [] _cond_resched+0x3a/0x50 [] kmem_cache_alloc_node+0x38/0x200 [] __alloc_skb+0x5d/0x2d0 [] sctp_packet_transmit+0x610/0xa20 [sctp] [] sctp_outq_flush+0x2ce/0xc00 [sctp] [] sctp_outq_uncork+0x1c/0x20 [sctp] [] sctp_cmd_interpreter.isra.22+0xc8/0x1460 [sctp] [] sctp_do_sm+0xe1/0x350 [sctp] [] sctp_primitive_ASCONF+0x3d/0x50 [sctp] [] sctp_cmd_interpreter.isra.22+0x114/0x1460 [sctp] [] sctp_do_sm+0xe1/0x350 [sctp] [] sctp_assoc_bh_rcv+0xf4/0x1b0 [sctp] [] sctp_inq_push+0x51/0x70 [sctp] [] sctp_rcv+0xa8b/0xbd0 [sctp] As it shows, the first sctp_do_sm() running under atomic context (NET_RX softirq) invoked sctp_primitive_ASCONF() that uses GFP_KERNEL flag later, and this flag is supposed to be used in non-atomic context only. Besides, sctp_do_sm() was called recursively, which is not expected. Vlad tried to fix this recursive call in Commit c0786693404c ("sctp: Fix oops when sending queued ASCONF chunks") by introducing a new command SCTP_CMD_SEND_NEXT_ASCONF. But it didn't work as this command is still used in the first sctp_do_sm() call, and sctp_primitive_ASCONF() will be called in this command again. To avoid calling sctp_do_sm() recursively, we send the next queued ASCONF not by sctp_primitive_ASCONF(), but by sctp_sf_do_prm_asconf() in the 1st sctp_do_sm() directly. Reported-by: Ying Xu Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/command.h | 1 - net/sctp/sm_sideeffect.c | 29 ----------------------------- net/sctp/sm_statefuns.c | 35 +++++++++++++++++++++++++++-------- 3 files changed, 27 insertions(+), 38 deletions(-) diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index 6640f84fe536..6d5beac29bc1 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -105,7 +105,6 @@ enum sctp_verb { SCTP_CMD_T1_RETRAN, /* Mark for retransmission after T1 timeout */ SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag */ SCTP_CMD_SEND_MSG, /* Send the whole use message */ - SCTP_CMD_SEND_NEXT_ASCONF, /* Send the next ASCONF after ACK */ SCTP_CMD_PURGE_ASCONF_QUEUE, /* Purge all asconf queues.*/ SCTP_CMD_SET_ASOC, /* Restore association context */ SCTP_CMD_LAST diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 1d143bc3f73d..4aa03588f87b 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1112,32 +1112,6 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc, } -/* Sent the next ASCONF packet currently stored in the association. - * This happens after the ASCONF_ACK was succeffully processed. - */ -static void sctp_cmd_send_asconf(struct sctp_association *asoc) -{ - struct net *net = sock_net(asoc->base.sk); - - /* Send the next asconf chunk from the addip chunk - * queue. - */ - if (!list_empty(&asoc->addip_chunk_list)) { - struct list_head *entry = asoc->addip_chunk_list.next; - struct sctp_chunk *asconf = list_entry(entry, - struct sctp_chunk, list); - list_del_init(entry); - - /* Hold the chunk until an ASCONF_ACK is received. */ - sctp_chunk_hold(asconf); - if (sctp_primitive_ASCONF(net, asoc, asconf)) - sctp_chunk_free(asconf); - else - asoc->addip_last_asconf = asconf; - } -} - - /* These three macros allow us to pull the debugging code out of the * main flow of sctp_do_sm() to keep attention focused on the real * functionality there. @@ -1783,9 +1757,6 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, } sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; - case SCTP_CMD_SEND_NEXT_ASCONF: - sctp_cmd_send_asconf(asoc); - break; case SCTP_CMD_PURGE_ASCONF_QUEUE: sctp_asconf_queue_teardown(asoc); break; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index c9ae3404b1bb..713a669d2058 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3824,6 +3824,29 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, return SCTP_DISPOSITION_CONSUME; } +static enum sctp_disposition sctp_send_next_asconf( + struct net *net, + const struct sctp_endpoint *ep, + struct sctp_association *asoc, + const union sctp_subtype type, + struct sctp_cmd_seq *commands) +{ + struct sctp_chunk *asconf; + struct list_head *entry; + + if (list_empty(&asoc->addip_chunk_list)) + return SCTP_DISPOSITION_CONSUME; + + entry = asoc->addip_chunk_list.next; + asconf = list_entry(entry, struct sctp_chunk, list); + + list_del_init(entry); + sctp_chunk_hold(asconf); + asoc->addip_last_asconf = asconf; + + return sctp_sf_do_prm_asconf(net, ep, asoc, type, asconf, commands); +} + /* * ADDIP Section 4.3 General rules for address manipulation * When building TLV parameters for the ASCONF Chunk that will add or @@ -3915,14 +3938,10 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); if (!sctp_process_asconf_ack((struct sctp_association *)asoc, - asconf_ack)) { - /* Successfully processed ASCONF_ACK. We can - * release the next asconf if we have one. - */ - sctp_add_cmd_sf(commands, SCTP_CMD_SEND_NEXT_ASCONF, - SCTP_NULL()); - return SCTP_DISPOSITION_CONSUME; - } + asconf_ack)) + return sctp_send_next_asconf(net, ep, + (struct sctp_association *)asoc, + type, commands); abort = sctp_make_abort(asoc, asconf_ack, sizeof(struct sctp_errhdr)); From 975554b03eddc1df73bda3a764a09e18cadd5f1c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 30 Apr 2019 13:34:51 +0100 Subject: [PATCH 161/181] io_uring: fix SQPOLL cpu validation In io_sq_offload_start(), we call cpu_possible() on an unbounded cpu value from userspace. On v5.1-rc7 on arm64 with CONFIG_DEBUG_PER_CPU_MAPS, this results in a splat: WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpu_max_bits_warn include/linux/cpumask.h:121 [inline] There was an attempt to fix this in commit: 917257daa0fea7a0 ("io_uring: only test SQPOLL cpu after we've verified it") ... by adding a check after the cpu value had been limited to NR_CPU_IDS using array_index_nospec(). However, this left an unbound check at the start of the function, for which the warning still fires. Let's fix this correctly by checking that the cpu value is bound by nr_cpu_ids before passing it to cpu_possible(). Note that only nr_cpu_ids of a cpumask are guaranteed to exist at runtime, and nr_cpu_ids can be significantly smaller than NR_CPUs. For example, an arm64 defconfig has NR_CPUS=256, while my test VM has 4 vCPUs. Following the intent from the commit message for 917257daa0fea7a0, the check is moved under the SQ_AFF branch, which is the only branch where the cpu values is consumed. The check is performed before bounding the value with array_index_nospec() so that we don't silently accept bogus cpu values from userspace, where array_index_nospec() would force these values to 0. I suspect we can remove the array_index_nospec() call entirely, but I've conservatively left that in place, updated to use nr_cpu_ids to match the prior check. Tested on arm64 with the Syzkaller reproducer: https://syzkaller.appspot.com/bug?extid=cd714a07c6de2bc34293 https://syzkaller.appspot.com/x/repro.syz?x=15d8b397200000 Full splat from before this patch: WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpu_max_bits_warn include/linux/cpumask.h:121 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpumask_check include/linux/cpumask.h:128 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpumask_test_cpu include/linux/cpumask.h:344 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_sq_offload_start fs/io_uring.c:2244 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_uring_create fs/io_uring.c:2864 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_uring_setup+0x1108/0x15a0 fs/io_uring.c:2916 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 27601 Comm: syz-executor.0 Not tainted 5.1.0-rc7 #3 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x2f0 include/linux/compiler.h:193 show_stack+0x20/0x30 arch/arm64/kernel/traps.c:158 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x110/0x190 lib/dump_stack.c:113 panic+0x384/0x68c kernel/panic.c:214 __warn+0x2bc/0x2c0 kernel/panic.c:571 report_bug+0x228/0x2d8 lib/bug.c:186 bug_handler+0xa0/0x1a0 arch/arm64/kernel/traps.c:956 call_break_hook arch/arm64/kernel/debug-monitors.c:301 [inline] brk_handler+0x1d4/0x388 arch/arm64/kernel/debug-monitors.c:316 do_debug_exception+0x1a0/0x468 arch/arm64/mm/fault.c:831 el1_dbg+0x18/0x8c cpu_max_bits_warn include/linux/cpumask.h:121 [inline] cpumask_check include/linux/cpumask.h:128 [inline] cpumask_test_cpu include/linux/cpumask.h:344 [inline] io_sq_offload_start fs/io_uring.c:2244 [inline] io_uring_create fs/io_uring.c:2864 [inline] io_uring_setup+0x1108/0x15a0 fs/io_uring.c:2916 __do_sys_io_uring_setup fs/io_uring.c:2929 [inline] __se_sys_io_uring_setup fs/io_uring.c:2926 [inline] __arm64_sys_io_uring_setup+0x50/0x70 fs/io_uring.c:2926 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall arch/arm64/kernel/syscall.c:47 [inline] el0_svc_common.constprop.0+0x148/0x2e0 arch/arm64/kernel/syscall.c:83 el0_svc_handler+0xdc/0x100 arch/arm64/kernel/syscall.c:129 el0_svc+0x8/0xc arch/arm64/kernel/entry.S:948 SMP: stopping secondary CPUs Dumping ftrace buffer: (ftrace buffer empty) Kernel Offset: disabled CPU features: 0x002,23000438 Memory Limit: none Rebooting in 1 seconds.. Fixes: 917257daa0fea7a0 ("io_uring: only test SQPOLL cpu after we've verified it") Signed-off-by: Mark Rutland Cc: Jens Axboe Cc: Alexander Viro Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Simplied the logic Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0a894d7baceb..5954047ee96d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2319,10 +2319,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, mmgrab(current->mm); ctx->sqo_mm = current->mm; - ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) - goto err; - if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) @@ -2333,11 +2329,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, ctx->sq_thread_idle = HZ; if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu; + int cpu = array_index_nospec(p->sq_thread_cpu, + nr_cpu_ids); - cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) + if (!cpu_possible(cpu)) goto err; ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, From 52e04ef4c9d459cba3afd86ec335a411b40b7fd2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 30 Apr 2019 17:30:21 +0100 Subject: [PATCH 162/181] io_uring: free allocated io_memory once If io_allocate_scq_urings() fails to allocate an sq_* region, it will call io_mem_free() for any previously allocated regions, but leave dangling pointers to these regions in the ctx. Any regions which have not yet been allocated are left NULL. Note that when returning -EOVERFLOW, the previously allocated sq_ring is not freed, which appears to be an unintentional leak. When io_allocate_scq_urings() fails, io_uring_create() will call io_ring_ctx_wait_and_kill(), which calls io_mem_free() on all the sq_* regions, assuming the pointers are valid and not NULL. This can result in pages being freed multiple times, which has been observed to corrupt the page state, leading to subsequent fun. This can also result in virt_to_page() on NULL, resulting in the use of bogus page addresses, and yet more subsequent fun. The latter can be detected with CONFIG_DEBUG_VIRTUAL on arm64. Adding a cleanup path to io_allocate_scq_urings() complicates the logic, so let's leave it to io_ring_ctx_free() to consistently free these pointers, and simplify the io_allocate_scq_urings() error paths. Full splats from before this patch below. Note that the pointer logged by the DEBUG_VIRTUAL "non-linear address" warning has been hashed, and is actually NULL. [ 26.098129] page:ffff80000e949a00 count:0 mapcount:-128 mapping:0000000000000000 index:0x0 [ 26.102976] flags: 0x63fffc000000() [ 26.104373] raw: 000063fffc000000 ffff80000e86c188 ffff80000ea3df08 0000000000000000 [ 26.108917] raw: 0000000000000000 0000000000000001 00000000ffffff7f 0000000000000000 [ 26.137235] page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0) [ 26.143960] ------------[ cut here ]------------ [ 26.146020] kernel BUG at include/linux/mm.h:547! [ 26.147586] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 26.149163] Modules linked in: [ 26.150287] Process syz-executor.21 (pid: 20204, stack limit = 0x000000000e9cefeb) [ 26.153307] CPU: 2 PID: 20204 Comm: syz-executor.21 Not tainted 5.1.0-rc7-00004-g7d30b2ea43d6 #18 [ 26.156566] Hardware name: linux,dummy-virt (DT) [ 26.158089] pstate: 40400005 (nZcv daif +PAN -UAO) [ 26.159869] pc : io_mem_free+0x9c/0xa8 [ 26.161436] lr : io_mem_free+0x9c/0xa8 [ 26.162720] sp : ffff000013003d60 [ 26.164048] x29: ffff000013003d60 x28: ffff800025048040 [ 26.165804] x27: 0000000000000000 x26: ffff800025048040 [ 26.167352] x25: 00000000000000c0 x24: ffff0000112c2820 [ 26.169682] x23: 0000000000000000 x22: 0000000020000080 [ 26.171899] x21: ffff80002143b418 x20: ffff80002143b400 [ 26.174236] x19: ffff80002143b280 x18: 0000000000000000 [ 26.176607] x17: 0000000000000000 x16: 0000000000000000 [ 26.178997] x15: 0000000000000000 x14: 0000000000000000 [ 26.181508] x13: 00009178a5e077b2 x12: 0000000000000001 [ 26.183863] x11: 0000000000000000 x10: 0000000000000980 [ 26.186437] x9 : ffff000013003a80 x8 : ffff800025048a20 [ 26.189006] x7 : ffff8000250481c0 x6 : ffff80002ffe9118 [ 26.191359] x5 : ffff80002ffe9118 x4 : 0000000000000000 [ 26.193863] x3 : ffff80002ffefe98 x2 : 44c06ddd107d1f00 [ 26.196642] x1 : 0000000000000000 x0 : 000000000000003e [ 26.198892] Call trace: [ 26.199893] io_mem_free+0x9c/0xa8 [ 26.201155] io_ring_ctx_wait_and_kill+0xec/0x180 [ 26.202688] io_uring_setup+0x6c4/0x6f0 [ 26.204091] __arm64_sys_io_uring_setup+0x18/0x20 [ 26.205576] el0_svc_common.constprop.0+0x7c/0xe8 [ 26.207186] el0_svc_handler+0x28/0x78 [ 26.208389] el0_svc+0x8/0xc [ 26.209408] Code: aa0203e0 d0006861 9133a021 97fcdc3c (d4210000) [ 26.211995] ---[ end trace bdb81cd43a21e50d ]--- [ 81.770626] ------------[ cut here ]------------ [ 81.825015] virt_to_phys used for non-linear address: 000000000d42f2c7 ( (null)) [ 81.827860] WARNING: CPU: 1 PID: 30171 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x48/0x68 [ 81.831202] Modules linked in: [ 81.832212] CPU: 1 PID: 30171 Comm: syz-executor.20 Not tainted 5.1.0-rc7-00004-g7d30b2ea43d6 #19 [ 81.835616] Hardware name: linux,dummy-virt (DT) [ 81.836863] pstate: 60400005 (nZCv daif +PAN -UAO) [ 81.838727] pc : __virt_to_phys+0x48/0x68 [ 81.840572] lr : __virt_to_phys+0x48/0x68 [ 81.842264] sp : ffff80002cf67c70 [ 81.843858] x29: ffff80002cf67c70 x28: ffff800014358e18 [ 81.846463] x27: 0000000000000000 x26: 0000000020000080 [ 81.849148] x25: 0000000000000000 x24: ffff80001bb01f40 [ 81.851986] x23: ffff200011db06c8 x22: ffff2000127e3c60 [ 81.854351] x21: ffff800014358cc0 x20: ffff800014358d98 [ 81.856711] x19: 0000000000000000 x18: 0000000000000000 [ 81.859132] x17: 0000000000000000 x16: 0000000000000000 [ 81.861586] x15: 0000000000000000 x14: 0000000000000000 [ 81.863905] x13: 0000000000000000 x12: ffff1000037603e9 [ 81.866226] x11: 1ffff000037603e8 x10: 0000000000000980 [ 81.868776] x9 : ffff80002cf67840 x8 : ffff80001bb02920 [ 81.873272] x7 : ffff1000037603e9 x6 : ffff80001bb01f47 [ 81.875266] x5 : ffff1000037603e9 x4 : dfff200000000000 [ 81.876875] x3 : ffff200010087528 x2 : ffff1000059ecf58 [ 81.878751] x1 : 44c06ddd107d1f00 x0 : 0000000000000000 [ 81.880453] Call trace: [ 81.881164] __virt_to_phys+0x48/0x68 [ 81.882919] io_mem_free+0x18/0x110 [ 81.886585] io_ring_ctx_wait_and_kill+0x13c/0x1f0 [ 81.891212] io_uring_setup+0xa60/0xad0 [ 81.892881] __arm64_sys_io_uring_setup+0x2c/0x38 [ 81.894398] el0_svc_common.constprop.0+0xac/0x150 [ 81.896306] el0_svc_handler+0x34/0x88 [ 81.897744] el0_svc+0x8/0xc [ 81.898715] ---[ end trace b4a703802243cbba ]--- Fixes: 2b188cc1bb857a9d ("Add io_uring IO interface") Signed-off-by: Mark Rutland Cc: Jens Axboe Cc: Alexander Viro Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5954047ee96d..046fc4e1e155 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2396,8 +2396,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) static void io_mem_free(void *ptr) { - struct page *page = virt_to_head_page(ptr); + struct page *page; + if (!ptr) + return; + + page = virt_to_head_page(ptr); if (put_page_testzero(page)) free_compound_page(page); } @@ -2816,17 +2820,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->sq_ring); + if (!ctx->sq_sqes) return -ENOMEM; - } cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) { - io_mem_free(ctx->sq_ring); - io_mem_free(ctx->sq_sqes); + if (!cq_ring) return -ENOMEM; - } ctx->cq_ring = cq_ring; cq_ring->ring_mask = p->cq_entries - 1; From 817869d2519f0cb7be5b3482129dadc806dfb747 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 Apr 2019 14:44:05 -0600 Subject: [PATCH 163/181] io_uring: drop req submit reference always in async punt If we don't end up actually calling submit in io_sq_wq_submit_work(), we still need to drop the submit reference to the request. If we don't, then we can leak the request. This can happen if we race with ring shutdown while flushing the workqueue for requests that require use of the mm_struct. Fixes: e65ef56db494 ("io_uring: use regular request ref counts") Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 046fc4e1e155..18cecb6a0151 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1568,10 +1568,11 @@ restart: break; cond_resched(); } while (1); - - /* drop submission reference */ - io_put_req(req); } + + /* drop submission reference */ + io_put_req(req); + if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); io_put_req(req); From 60a27b906d1a372474669c914c10d6c993858a4a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Apr 2019 18:45:20 +0800 Subject: [PATCH 164/181] block: fix handling for BIO_NO_PAGE_REF Commit 399254aaf489211 ("block: add BIO_NO_PAGE_REF flag") introduces BIO_NO_PAGE_REF, and once this flag is set for one bio, all pages in the bio won't be get/put during IO. However, if one bio is submitted via __blkdev_direct_IO_simple(), even though BIO_NO_PAGE_REF is set, pages still may be put. Fixes this issue by avoiding to put pages if BIO_NO_PAGE_REF is set. Fixes: 399254aaf489211 ("block: add BIO_NO_PAGE_REF flag") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- fs/block_dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 24615c76c1d0..bb28e2ead679 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -264,7 +264,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio_for_each_segment_all(bvec, &bio, i, iter_all) { if (should_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + if (!bio_flagged(&bio, BIO_NO_PAGE_REF)) + put_page(bvec->bv_page); } if (unlikely(bio.bi_status)) From f5eb4d3b92a6a1096ef3480b54782a9409281300 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Apr 2019 18:45:21 +0800 Subject: [PATCH 165/181] iov_iter: fix iov_iter_type Commit 875f1d0769cd ("iov_iter: add ITER_BVEC_FLAG_NO_REF flag") introduces one extra flag of ITER_BVEC_FLAG_NO_REF, and this flag is stored into iter->type. However, iov_iter_type() doesn't consider the new added flag, fix it by masking this flag in iov_iter_type(). Fixes: 875f1d0769cd ("iov_iter: add ITER_BVEC_FLAG_NO_REF flag") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index f184af1999a8..2d0131ad4604 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -60,7 +60,7 @@ struct iov_iter { static inline enum iter_type iov_iter_type(const struct iov_iter *i) { - return i->type & ~(READ | WRITE); + return i->type & ~(READ | WRITE | ITER_BVEC_FLAG_NO_REF); } static inline bool iter_is_iovec(const struct iov_iter *i) From b2cf86e1563e33a14a1c69b3e508d15dc12f804c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 29 Apr 2019 11:46:55 -0400 Subject: [PATCH 166/181] packet: in recvmsg msg_name return at least sizeof sockaddr_ll Packet send checks that msg_name is at least sizeof sockaddr_ll. Packet recv must return at least this length, so that its output can be passed unmodified to packet send. This ceased to be true since adding support for lladdr longer than sll_addr. Since, the return value uses true address length. Always return at least sizeof sockaddr_ll, even if address length is shorter. Zero the padding bytes. Change v1->v2: do not overwrite zeroed padding again. use copy_len. Fixes: 0fb375fb9b93 ("[AF_PACKET]: Allow for > 8 byte hardware addresses.") Suggested-by: David Laight Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9419c5cf4de5..7d361cd53ad5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3344,20 +3344,29 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { + int copy_len; + /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); + copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); + copy_len = msg->msg_namelen; + if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { + memset(msg->msg_name + + offsetof(struct sockaddr_ll, sll_addr), + 0, sizeof(sll->sll_addr)); + msg->msg_namelen = sizeof(struct sockaddr_ll); + } } - memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, - msg->msg_namelen); + memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (pkt_sk(sk)->auxdata) { From 486efdc8f6ce802b27e15921d2353cc740c55451 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 29 Apr 2019 11:53:18 -0400 Subject: [PATCH 167/181] packet: validate msg_namelen in send directly Packet sockets in datagram mode take a destination address. Verify its length before passing to dev_hard_header. Prior to 2.6.14-rc3, the send code ignored sll_halen. This is established behavior. Directly compare msg_namelen to dev->addr_len. Change v1->v2: initialize addr in all paths Fixes: 6b8d95f1795c4 ("packet: validate address length if non-zero") Suggested-by: David Laight Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7d361cd53ad5..9b81813dd16a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2602,8 +2602,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); + unsigned char *addr = NULL; int tp_len, size_max; - unsigned char *addr; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; @@ -2614,7 +2614,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = po->num; - addr = NULL; } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -2624,10 +2623,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) sll_addr))) goto out; proto = saddr->sll_protocol; - addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); - if (addr && dev && saddr->sll_halen < dev->addr_len) - goto out_put; + if (po->sk.sk_socket->type == SOCK_DGRAM) { + if (dev && msg->msg_namelen < dev->addr_len + + offsetof(struct sockaddr_ll, sll_addr)) + goto out_put; + addr = saddr->sll_addr; + } } err = -ENXIO; @@ -2799,7 +2801,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct sk_buff *skb; struct net_device *dev; __be16 proto; - unsigned char *addr; + unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; @@ -2816,7 +2818,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = po->num; - addr = NULL; } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -2824,10 +2825,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; - addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); - if (addr && dev && saddr->sll_halen < dev->addr_len) - goto out_unlock; + if (sock->type == SOCK_DGRAM) { + if (dev && msg->msg_namelen < dev->addr_len + + offsetof(struct sockaddr_ll, sll_addr)) + goto out_unlock; + addr = saddr->sll_addr; + } } err = -ENXIO; From 15d55bae4e3c43cd9f87fd93c73a263e172d34e1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 29 Apr 2019 10:30:09 -0700 Subject: [PATCH 168/181] selftests: fib_rule_tests: Fix icmp proto with ipv6 A recent commit returns an error if icmp is used as the ip-proto for IPv6 fib rules. Update fib_rule_tests to send ipv6-icmp instead of icmp. Fixes: 5e1a99eae8499 ("ipv4: Add ICMPv6 support when parse route ipproto") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_rule_tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index d4cfb6a7a086..5b92c1f4dc04 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -147,8 +147,8 @@ fib_rule6_test() fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then - match="ipproto icmp" - fib_rule6_test_match_n_redirect "$match" "$match" "ipproto icmp match" + match="ipproto ipv6-icmp" + fib_rule6_test_match_n_redirect "$match" "$match" "ipproto ipv6-icmp match" fi } From 2dcb003314032c6efb13a065ffae60d164b2dd35 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Apr 2019 12:19:12 -0700 Subject: [PATCH 169/181] net/tls: avoid NULL pointer deref on nskb->sk in fallback update_chksum() accesses nskb->sk before it has been set by complete_skb(), move the init up. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls_device_fallback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index a3ebd4b02714..c3a5fe624b4e 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -201,13 +201,14 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) skb_put(nskb, skb->len); memcpy(nskb->data, skb->data, headln); - update_chksum(nskb, headln); nskb->destructor = skb->destructor; nskb->sk = sk; skb->destructor = NULL; skb->sk = NULL; + update_chksum(nskb, headln); + delta = nskb->truesize - skb->truesize; if (likely(delta < 0)) WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); From d4ef647510b1200fe1c996ff1cbf5ac47eb930cc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 1 May 2019 16:59:16 +0100 Subject: [PATCH 170/181] io_uring: avoid page allocation warnings In io_sqe_buffer_register() we allocate a number of arrays based on the iov_len from the user-provided iov. While we limit iov_len to SZ_1G, we can still attempt to allocate arrays exceeding MAX_ORDER. On a 64-bit system with 4KiB pages, for an iov where iov_base = 0x10 and iov_len = SZ_1G, we'll calculate that nr_pages = 262145. When we try to allocate a corresponding array of (16-byte) bio_vecs, requiring 4194320 bytes, which is greater than 4MiB. This results in SLUB warning that we're trying to allocate greater than MAX_ORDER, and failing the allocation. Avoid this by using kvmalloc() for allocations dependent on the user-provided iov_len. At the same time, fix a leak of imu->bvec when registration fails. Full splat from before this patch: WARNING: CPU: 1 PID: 2314 at mm/page_alloc.c:4595 __alloc_pages_nodemask+0x7ac/0x2938 mm/page_alloc.c:4595 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 2314 Comm: syz-executor326 Not tainted 5.1.0-rc7-dirty #4 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x2f0 include/linux/compiler.h:193 show_stack+0x20/0x30 arch/arm64/kernel/traps.c:158 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x110/0x190 lib/dump_stack.c:113 panic+0x384/0x68c kernel/panic.c:214 __warn+0x2bc/0x2c0 kernel/panic.c:571 report_bug+0x228/0x2d8 lib/bug.c:186 bug_handler+0xa0/0x1a0 arch/arm64/kernel/traps.c:956 call_break_hook arch/arm64/kernel/debug-monitors.c:301 [inline] brk_handler+0x1d4/0x388 arch/arm64/kernel/debug-monitors.c:316 do_debug_exception+0x1a0/0x468 arch/arm64/mm/fault.c:831 el1_dbg+0x18/0x8c __alloc_pages_nodemask+0x7ac/0x2938 mm/page_alloc.c:4595 alloc_pages_current+0x164/0x278 mm/mempolicy.c:2132 alloc_pages include/linux/gfp.h:509 [inline] kmalloc_order+0x20/0x50 mm/slab_common.c:1231 kmalloc_order_trace+0x30/0x2b0 mm/slab_common.c:1243 kmalloc_large include/linux/slab.h:480 [inline] __kmalloc+0x3dc/0x4f0 mm/slub.c:3791 kmalloc_array include/linux/slab.h:670 [inline] io_sqe_buffer_register fs/io_uring.c:2472 [inline] __io_uring_register fs/io_uring.c:2962 [inline] __do_sys_io_uring_register fs/io_uring.c:3008 [inline] __se_sys_io_uring_register fs/io_uring.c:2990 [inline] __arm64_sys_io_uring_register+0x9e0/0x1bc8 fs/io_uring.c:2990 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall arch/arm64/kernel/syscall.c:47 [inline] el0_svc_common.constprop.0+0x148/0x2e0 arch/arm64/kernel/syscall.c:83 el0_svc_handler+0xdc/0x100 arch/arm64/kernel/syscall.c:129 el0_svc+0x8/0xc arch/arm64/kernel/entry.S:948 SMP: stopping secondary CPUs Dumping ftrace buffer: (ftrace buffer empty) Kernel Offset: disabled CPU features: 0x002,23000438 Memory Limit: none Rebooting in 1 seconds.. Fixes: edafccee56ff3167 ("io_uring: add support for pre-mapped user IO buffers") Signed-off-by: Mark Rutland Cc: Alexander Viro Cc: Jens Axboe Cc: linux-fsdevel@vger.kernel.org Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 18cecb6a0151..84efb8956734 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2443,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); - kfree(imu->bvec); + kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -2535,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, if (!pages || nr_pages > got_pages) { kfree(vmas); kfree(pages); - pages = kmalloc_array(nr_pages, sizeof(struct page *), + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - vmas = kmalloc_array(nr_pages, + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), GFP_KERNEL); if (!pages || !vmas) { @@ -2549,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, got_pages = nr_pages; } - imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { @@ -2588,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, } if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); + kvfree(imu->bvec); goto err; } @@ -2610,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ctx->nr_user_bufs++; } - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); return 0; err: - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); io_sqe_buffer_unregister(ctx); return ret; } From d2f0c961148f65bc73eda72b9fa3a4e80973cb49 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 29 Apr 2019 16:39:30 +0300 Subject: [PATCH 171/181] ipv4: ip_do_fragment: Preserve skb_iif during fragmentation Previously, during fragmentation after forwarding, skb->skb_iif isn't preserved, i.e. 'ip_copy_metadata' does not copy skb_iif from given 'from' skb. As a result, ip_do_fragment's creates fragments with zero skb_iif, leading to inconsistent behavior. Assume for example an eBPF program attached at tc egress (post forwarding) that examines __sk_buff->ingress_ifindex: - the correct iif is observed if forwarding path does not involve fragmentation/refragmentation - a bogus iif is observed if forwarding path involves fragmentation/refragmentatiom Fix, by preserving skb_iif during 'ip_copy_metadata'. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c80188875f39..e8bb2e85c5a4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -519,6 +519,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; + to->skb_iif = from->skb_iif; skb_dst_drop(to); skb_dst_copy(to, from); to->dev = from->dev; From 6f303d60534c46aa1a239f29c321f95c83dda748 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 May 2019 11:05:41 -0700 Subject: [PATCH 172/181] gcc-9: silence 'address-of-packed-member' warning We already did this for clang, but now gcc has that warning too. Yes, yes, the address may be unaligned. And that's kind of the point. Signed-off-by: Linus Torvalds --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2b99679148dc..633d1196bf00 100644 --- a/Makefile +++ b/Makefile @@ -678,6 +678,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,) KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation) KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os @@ -719,7 +720,6 @@ ifdef CONFIG_CC_IS_CLANG KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) -KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) # Quiet clang warning: comparison of unsigned expression < 0 is always false KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) # CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the From cf676908846a06443fa5e6724ca3f5dd7460eca1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 May 2019 11:07:40 -0700 Subject: [PATCH 173/181] gcc-9: don't warn about uninitialized variable I'm not sure what made gcc warn about this code now. The 'ret' variable does end up initialized in all cases, but it's definitely not obvious, so the compiler is quite reasonable to warn about this. So just add initialization to make it all much more obvious both to compilers and to humans. Signed-off-by: Linus Torvalds --- drivers/i2c/i2c-core-base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 38af18645133..c480ca385ffb 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -185,7 +185,7 @@ static int i2c_generic_bus_free(struct i2c_adapter *adap) int i2c_generic_scl_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; - int i = 0, scl = 1, ret; + int i = 0, scl = 1, ret = 0; if (bri->prepare_recovery) bri->prepare_recovery(adap); From 459e3a21535ae3c7a9a123650e54f5c882b8fcbf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 May 2019 11:20:53 -0700 Subject: [PATCH 174/181] gcc-9: properly declare the {pv,hv}clock_page storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pvlock_page and hvclock_page variables are (as the name implies) addresses to pages, created by the linker script. But we declared them as just "extern u8" variables, which _works_, but now that gcc does some more bounds checking, it causes warnings like warning: array subscript 1 is outside array bounds of ‘u8[1]’ when we then access more than one byte from those variables. Fix this by simply making the declaration of the variables match reality, which makes the compiler happy too. Signed-off-by: Linus Torvalds --- arch/x86/entry/vdso/vclock_gettime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 007b3fe9d727..98c7d12b945c 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -29,12 +29,12 @@ extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); #ifdef CONFIG_PARAVIRT_CLOCK -extern u8 pvclock_page +extern u8 pvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif #ifdef CONFIG_HYPERV_TSCPAGE -extern u8 hvclock_page +extern u8 hvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif From f68d7c44e76532e46f292ad941aa3706cb9e6e40 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 30 Apr 2019 10:46:10 +0800 Subject: [PATCH 175/181] selftests: fib_rule_tests: print the result and return 1 if any tests failed Fixes: 65b2b4939a64 ("selftests: net: initial fib rule tests") Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_rule_tests.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 5b92c1f4dc04..4b7e107865bf 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -27,6 +27,7 @@ log_test() nsuccess=$((nsuccess+1)) printf "\n TEST: %-50s [ OK ]\n" "${msg}" else + ret=1 nfail=$((nfail+1)) printf "\n TEST: %-50s [FAIL]\n" "${msg}" if [ "${PAUSE_ON_FAIL}" = "yes" ]; then @@ -245,4 +246,9 @@ setup run_fibrule_tests cleanup +if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} +fi + exit $ret From 7e74e235bb31a1fefc28d5303da0718b88627ea8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 May 2019 12:19:20 -0700 Subject: [PATCH 176/181] gcc-9: don't warn about uninitialized btrfs extent_type variable The 'extent_type' variable does seem to be reliably initialized, but it's _very_ non-obvious, since there's a "goto next" case that jumps over the normal initialization. That will then always trigger the "start >= extent_end" test, which will end up never falling through to the use of that variable. But the code is certainly not obvious, and the compiler warning looks reasonable. Make 'extent_type' an int, and initialize it to an invalid negative value, which seems to be the common pattern in other places. Signed-off-by: Linus Torvalds --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 82fdda8ff5ab..2973608824ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6783,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); - u8 extent_type; + int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; From 15d2aba7c602cd9005b20ff011b670547b3882c4 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 22 Apr 2019 16:43:30 -0600 Subject: [PATCH 177/181] PCI/portdrv: Use shared MSI/MSI-X vector for Bandwidth Management The Interrupt Message Number in the PCIe Capabilities register (PCIe r4.0, sec 7.5.3.2) indicates which MSI/MSI-X vector is shared by interrupts related to the PCIe Capability, including Link Bandwidth Management and Link Autonomous Bandwidth Interrupts (Link Control, 7.5.3.7), Command Completed and Hot-Plug Interrupts (Slot Control, 7.5.3.10), and the PME Interrupt (Root Control, 7.5.3.12). pcie_message_numbers() checked whether we want to enable PME or Hot-Plug interrupts but neglected to check for Link Bandwidth Management, so if we only wanted the Bandwidth Management interrupts, it decided we didn't need any vectors at all. Then pcie_port_enable_irq_vec() tried to reallocate zero vectors, which failed, resulting in fallback to INTx. On some systems, e.g., an X79-based workstation, that INTx seems broken or not handled correctly, so we got spurious IRQ16 interrupts for Bandwidth Management events. Change pcie_message_numbers() so that if we want Link Bandwidth Management interrupts, we use the shared MSI/MSI-X vector from the PCIe Capabilities register. Fixes: e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification") Link: https://lore.kernel.org/lkml/155597243666.19387.1205950870601742062.stgit@gimli.home Signed-off-by: Alex Williamson [bhelgaas: changelog] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/portdrv_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7d04f9d087a6..1b330129089f 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -55,7 +55,8 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask, * 7.8.2, 7.10.10, 7.31.2. */ - if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) { + if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP | + PCIE_PORT_SERVICE_BWNOTIF)) { pcie_capability_read_word(dev, PCI_EXP_FLAGS, ®16); *pme = (reg16 & PCI_EXP_FLAGS_IRQ) >> 9; nvec = *pme + 1; From f3505745c07ff50c22aeca9dde98762d1c8b5898 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Tue, 30 Apr 2019 05:12:57 +0200 Subject: [PATCH 178/181] rds: ib: force endiannes annotation While the endiannes is being handled correctly as indicated by the comment above the offending line - sparse was unhappy with the missing annotation as be64_to_cpu() expects a __be64 argument. To mitigate this annotation all involved variables are changed to a consistent __le64 and the conversion to uint64_t delayed to the call to rds_cong_map_updated(). Signed-off-by: Nicholas Mc Guire Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 70559854837e..8946c89d7392 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -772,7 +772,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, unsigned long frag_off; unsigned long to_copy; unsigned long copied; - uint64_t uncongested = 0; + __le64 uncongested = 0; void *addr; /* catch completely corrupt packets */ @@ -789,7 +789,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, copied = 0; while (copied < RDS_CONG_MAP_BYTES) { - uint64_t *src, *dst; + __le64 *src, *dst; unsigned int k; to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); @@ -824,9 +824,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, } /* the congestion map is in little endian order */ - uncongested = le64_to_cpu(uncongested); - - rds_cong_map_updated(map, uncongested); + rds_cong_map_updated(map, le64_to_cpu(uncongested)); } static void rds_ib_process_recv(struct rds_connection *conn, From 886b7a50100a50f1cbd08a6f8ec5884dfbe082dc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 30 Apr 2019 10:45:12 -0700 Subject: [PATCH 179/181] ipv6: A few fixes on dereferencing rt->from It is a followup after the fix in commit 9c69a1320515 ("route: Avoid crash from dereferencing NULL rt->from") rt6_do_redirect(): 1. NULL checking is needed on rt->from because a parallel fib6_info delete could happen that sets rt->from to NULL. (e.g. rt6_remove_exception() and fib6_drop_pcpu_from()). 2. fib6_info_hold() is not enough. Same reason as (1). Meaning, holding dst->__refcnt cannot ensure rt->from is not NULL or rt->from->fib6_ref is not 0. Instead of using fib6_info_hold_safe() which ip6_rt_cache_alloc() is already doing, this patch chooses to extend the rcu section to keep "from" dereference-able after checking for NULL. inet6_rtm_getroute(): 1. NULL checking is also needed on rt->from for a similar reason. Note that inet6_rtm_getroute() is using RTNL_FLAG_DOIT_UNLOCKED. Fixes: a68886a69180 ("net/ipv6: Make from in rt6_info rcu protected") Signed-off-by: Martin KaFai Lau Acked-by: Wei Wang Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2cc166bc978d..0520aca3354b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3392,11 +3392,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu rcu_read_lock(); from = rcu_dereference(rt->from); - /* This fib6_info_hold() is safe here because we hold reference to rt - * and rt already holds reference to fib6_info. - */ - fib6_info_hold(from); - rcu_read_unlock(); + if (!from) + goto out; nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); if (!nrt) @@ -3408,10 +3405,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - /* No need to remove rt from the exception table if rt is - * a cached route because rt6_insert_exception() will - * takes care of it - */ + /* rt6_insert_exception() will take care of duplicated exceptions */ if (rt6_insert_exception(nrt, from)) { dst_release_immediate(&nrt->dst); goto out; @@ -3424,7 +3418,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: - fib6_info_release(from); + rcu_read_unlock(); neigh_release(neigh); } @@ -5023,16 +5017,20 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_lock(); from = rcu_dereference(rt->from); - - if (fibmatch) - err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); - else - err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, - &fl6.saddr, iif, RTM_NEWROUTE, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - 0); + if (from) { + if (fibmatch) + err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, + iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + else + err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, + &fl6.saddr, iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + } else { + err = -ENETUNREACH; + } rcu_read_unlock(); if (err < 0) { From 4dd2b82d5adfbe0b1587ccad7a8f76d826120f37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 May 2019 18:56:28 -0700 Subject: [PATCH 180/181] udp: fix GRO packet of death syzbot was able to crash host by sending UDP packets with a 0 payload. TCP does not have this issue since we do not aggregate packets without payload. Since dev_gro_receive() sets gso_size based on skb_gro_len(skb) it seems not worth trying to cope with padded packets. BUG: KASAN: slab-out-of-bounds in skb_gro_receive+0xf5f/0x10e0 net/core/skbuff.c:3826 Read of size 16 at addr ffff88808893fff0 by task syz-executor612/7889 CPU: 0 PID: 7889 Comm: syz-executor612 Not tainted 5.1.0-rc7+ #96 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:187 kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 __asan_report_load16_noabort+0x14/0x20 mm/kasan/generic_report.c:133 skb_gro_receive+0xf5f/0x10e0 net/core/skbuff.c:3826 udp_gro_receive_segment net/ipv4/udp_offload.c:382 [inline] call_gro_receive include/linux/netdevice.h:2349 [inline] udp_gro_receive+0xb61/0xfd0 net/ipv4/udp_offload.c:414 udp4_gro_receive+0x763/0xeb0 net/ipv4/udp_offload.c:478 inet_gro_receive+0xe72/0x1110 net/ipv4/af_inet.c:1510 dev_gro_receive+0x1cd0/0x23c0 net/core/dev.c:5581 napi_gro_frags+0x36b/0xd10 net/core/dev.c:5843 tun_get_user+0x2f24/0x3fb0 drivers/net/tun.c:1981 tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2027 call_write_iter include/linux/fs.h:1866 [inline] do_iter_readv_writev+0x5e1/0x8e0 fs/read_write.c:681 do_iter_write fs/read_write.c:957 [inline] do_iter_write+0x184/0x610 fs/read_write.c:938 vfs_writev+0x1b3/0x2f0 fs/read_write.c:1002 do_writev+0x15e/0x370 fs/read_write.c:1037 __do_sys_writev fs/read_write.c:1110 [inline] __se_sys_writev fs/read_write.c:1107 [inline] __x64_sys_writev+0x75/0xb0 fs/read_write.c:1107 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x441cc0 Code: 05 48 3d 01 f0 ff ff 0f 83 9d 09 fc ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 83 3d 51 93 29 00 00 75 14 b8 14 00 00 00 0f 05 <48> 3d 01 f0 ff ff 0f 83 74 09 fc ff c3 48 83 ec 08 e8 ba 2b 00 00 RSP: 002b:00007ffe8c716118 EFLAGS: 00000246 ORIG_RAX: 0000000000000014 RAX: ffffffffffffffda RBX: 00007ffe8c716150 RCX: 0000000000441cc0 RDX: 0000000000000001 RSI: 00007ffe8c716170 RDI: 00000000000000f0 RBP: 0000000000000000 R08: 000000000000ffff R09: 0000000000a64668 R10: 0000000020000040 R11: 0000000000000246 R12: 000000000000c2d9 R13: 0000000000402b50 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 5143: save_stack+0x45/0xd0 mm/kasan/common.c:75 set_track mm/kasan/common.c:87 [inline] __kasan_kmalloc mm/kasan/common.c:497 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:470 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:505 slab_post_alloc_hook mm/slab.h:437 [inline] slab_alloc mm/slab.c:3393 [inline] kmem_cache_alloc+0x11a/0x6f0 mm/slab.c:3555 mm_alloc+0x1d/0xd0 kernel/fork.c:1030 bprm_mm_init fs/exec.c:363 [inline] __do_execve_file.isra.0+0xaa3/0x23f0 fs/exec.c:1791 do_execveat_common fs/exec.c:1865 [inline] do_execve fs/exec.c:1882 [inline] __do_sys_execve fs/exec.c:1958 [inline] __se_sys_execve fs/exec.c:1953 [inline] __x64_sys_execve+0x8f/0xc0 fs/exec.c:1953 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 5351: save_stack+0x45/0xd0 mm/kasan/common.c:75 set_track mm/kasan/common.c:87 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:459 kasan_slab_free+0xe/0x10 mm/kasan/common.c:467 __cache_free mm/slab.c:3499 [inline] kmem_cache_free+0x86/0x260 mm/slab.c:3765 __mmdrop+0x238/0x320 kernel/fork.c:677 mmdrop include/linux/sched/mm.h:49 [inline] finish_task_switch+0x47b/0x780 kernel/sched/core.c:2746 context_switch kernel/sched/core.c:2880 [inline] __schedule+0x81b/0x1cc0 kernel/sched/core.c:3518 preempt_schedule_irq+0xb5/0x140 kernel/sched/core.c:3745 retint_kernel+0x1b/0x2d arch_local_irq_restore arch/x86/include/asm/paravirt.h:767 [inline] kmem_cache_free+0xab/0x260 mm/slab.c:3766 anon_vma_chain_free mm/rmap.c:134 [inline] unlink_anon_vmas+0x2ba/0x870 mm/rmap.c:401 free_pgtables+0x1af/0x2f0 mm/memory.c:394 exit_mmap+0x2d1/0x530 mm/mmap.c:3144 __mmput kernel/fork.c:1046 [inline] mmput+0x15f/0x4c0 kernel/fork.c:1067 exec_mmap fs/exec.c:1046 [inline] flush_old_exec+0x8d9/0x1c20 fs/exec.c:1279 load_elf_binary+0x9bc/0x53f0 fs/binfmt_elf.c:864 search_binary_handler fs/exec.c:1656 [inline] search_binary_handler+0x17f/0x570 fs/exec.c:1634 exec_binprm fs/exec.c:1698 [inline] __do_execve_file.isra.0+0x1394/0x23f0 fs/exec.c:1818 do_execveat_common fs/exec.c:1865 [inline] do_execve fs/exec.c:1882 [inline] __do_sys_execve fs/exec.c:1958 [inline] __se_sys_execve fs/exec.c:1953 [inline] __x64_sys_execve+0x8f/0xc0 fs/exec.c:1953 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff88808893f7c0 which belongs to the cache mm_struct of size 1496 The buggy address is located 600 bytes to the right of 1496-byte region [ffff88808893f7c0, ffff88808893fd98) The buggy address belongs to the page: page:ffffea0002224f80 count:1 mapcount:0 mapping:ffff88821bc40ac0 index:0xffff88808893f7c0 compound_mapcount: 0 flags: 0x1fffc0000010200(slab|head) raw: 01fffc0000010200 ffffea00025b4f08 ffffea00027b9d08 ffff88821bc40ac0 raw: ffff88808893f7c0 ffff88808893e440 0000000100000001 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88808893fe80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88808893ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88808893ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff888088940000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff888088940080: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.") Signed-off-by: Eric Dumazet Cc: Paolo Abeni Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d8776b2110c1..065334b41d57 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -352,6 +352,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p; + unsigned int ulen; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { @@ -359,6 +360,12 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return NULL; } + /* Do not deal with padded or malicious packets, sorry ! */ + ulen = ntohs(uh->len); + if (ulen <= sizeof(*uh) || ulen != skb_gro_len(skb)) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } /* pull encapsulating udp header */ skb_gro_pull(skb, sizeof(struct udphdr)); skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); @@ -377,12 +384,12 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot - * leading to execessive truesize values. + * leading to excessive truesize values. * On len mismatch merge the first packet shorter than gso_size, * otherwise complete the GRO packet. */ - if (uh->len > uh2->len || skb_gro_receive(p, skb) || - uh->len != uh2->len || + if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) || + ulen != ntohs(uh2->len) || NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) pp = p; From 2078e1e7f7e0e21bd0291908f3037c39e666d27b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 May 2019 08:29:42 -0600 Subject: [PATCH 181/181] PCI/LINK: Add Kconfig option (default off) e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification") added dmesg logging whenever a link changes speed or width to a state that is considered degraded. Unfortunately, it cannot differentiate signal integrity-related link changes from those intentionally initiated by an endpoint driver, including drivers that may live in userspace or VMs when making use of vfio-pci. Some GPU drivers actively manage the link state to save power, which generates a stream of messages like this: vfio-pci 0000:07:00.0: 32.000 Gb/s available PCIe bandwidth, limited by 2.5 GT/s x16 link at 0000:00:02.0 (capable of 64.000 Gb/s with 5 GT/s x16 link) Since we can't distinguish the intentional changes from the signal integrity issues, leave the reporting turned off by default. Add a Kconfig option to turn it on if desired. Fixes: e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification") Link: https://lore.kernel.org/linux-pci/20190501142942.26972-1-keith.busch@intel.com Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/Kconfig | 8 ++++++++ drivers/pci/pcie/Makefile | 2 +- drivers/pci/pcie/portdrv.h | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index 5cbdbca904ac..362eb8cfa53b 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -142,3 +142,11 @@ config PCIE_PTM This is only useful if you have devices that support PTM, but it is safe to enable even if you don't. + +config PCIE_BW + bool "PCI Express Bandwidth Change Notification" + depends on PCIEPORTBUS + help + This enables PCI Express Bandwidth Change Notification. If + you know link width or rate changes occur only to correct + unreliable links, you may answer Y. diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index f1d7bc1e5efa..efb9d2e71e9e 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -3,7 +3,6 @@ # Makefile for PCI Express features and port driver pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o -pcieportdrv-y += bw_notification.o obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o @@ -13,3 +12,4 @@ obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o obj-$(CONFIG_PCIE_PME) += pme.o obj-$(CONFIG_PCIE_DPC) += dpc.o obj-$(CONFIG_PCIE_PTM) += ptm.o +obj-$(CONFIG_PCIE_BW) += bw_notification.o diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 1d50dc58ac40..944827a8c7d3 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -49,7 +49,11 @@ int pcie_dpc_init(void); static inline int pcie_dpc_init(void) { return 0; } #endif +#ifdef CONFIG_PCIE_BW int pcie_bandwidth_notification_init(void); +#else +static inline int pcie_bandwidth_notification_init(void) { return 0; } +#endif /* Port Type */ #define PCIE_ANY_PORT (~0)