From 643162258e57180a33e0ef7f08f0d986fbb5b4b9 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 28 Jul 2006 18:12:09 +0900 Subject: [PATCH 01/32] [IPV6] ADDRCONF: Check payload length for IFA_LOCAL attribute in RTM_{ADD,DEL}MSG message Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2316a4315a18..81702b9ba5be 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2853,7 +2853,8 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) pfx = RTA_DATA(rta[IFA_ADDRESS-1]); } if (rta[IFA_LOCAL-1]) { - if (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx))) + if (RTA_PAYLOAD(rta[IFA_LOCAL-1]) < sizeof(*pfx) || + (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx)))) return -EINVAL; pfx = RTA_DATA(rta[IFA_LOCAL-1]); } @@ -2877,7 +2878,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) pfx = RTA_DATA(rta[IFA_ADDRESS-1]); } if (rta[IFA_LOCAL-1]) { - if (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx))) + if (RTA_PAYLOAD(rta[IFA_LOCAL-1]) < sizeof(*pfx) || + (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx)))) return -EINVAL; pfx = RTA_DATA(rta[IFA_LOCAL-1]); } From 0778769d392b5b80410673f53e4f946574ebacf7 Mon Sep 17 00:00:00 2001 From: Noriaki TAKAMIYA Date: Fri, 28 Jul 2006 18:12:10 +0900 Subject: [PATCH 02/32] [IPV6] ADDRCONF: Allow user-space to specify address lifetime Based on MIPL2 kernel patch. Signed-off-by: Noriaki TAKAMIYA Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 46 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 81702b9ba5be..c0641887bdeb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1869,15 +1869,21 @@ err_exit: /* * Manual configuration of address on an interface */ -static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen) +static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, + __u32 prefered_lft, __u32 valid_lft) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; + __u8 ifa_flags = 0; int scope; ASSERT_RTNL(); + /* check the lifetime */ + if (!valid_lft || prefered_lft > valid_lft) + return -EINVAL; + if ((dev = __dev_get_by_index(ifindex)) == NULL) return -ENODEV; @@ -1889,10 +1895,29 @@ static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen) scope = ipv6_addr_scope(pfx); - ifp = ipv6_add_addr(idev, pfx, plen, scope, IFA_F_PERMANENT); + if (valid_lft == INFINITY_LIFE_TIME) + ifa_flags |= IFA_F_PERMANENT; + else if (valid_lft >= 0x7FFFFFFF/HZ) + valid_lft = 0x7FFFFFFF/HZ; + + if (prefered_lft == 0) + ifa_flags |= IFA_F_DEPRECATED; + else if ((prefered_lft >= 0x7FFFFFFF/HZ) && + (prefered_lft != INFINITY_LIFE_TIME)) + prefered_lft = 0x7FFFFFFF/HZ; + + ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags); + if (!IS_ERR(ifp)) { + spin_lock(&ifp->lock); + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = jiffies; + spin_unlock(&ifp->lock); + addrconf_dad_start(ifp, 0); in6_ifa_put(ifp); + addrconf_verify(0); return 0; } @@ -1945,7 +1970,8 @@ int addrconf_add_ifaddr(void __user *arg) return -EFAULT; rtnl_lock(); - err = inet6_addr_add(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen); + err = inet6_addr_add(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); rtnl_unlock(); return err; } @@ -2870,6 +2896,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct rtattr **rta = arg; struct ifaddrmsg *ifm = NLMSG_DATA(nlh); struct in6_addr *pfx; + __u32 valid_lft = INFINITY_LIFE_TIME, prefered_lft = INFINITY_LIFE_TIME; pfx = NULL; if (rta[IFA_ADDRESS-1]) { @@ -2886,7 +2913,18 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (pfx == NULL) return -EINVAL; - return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); + if (rta[IFA_CACHEINFO-1]) { + struct ifa_cacheinfo *ci; + if (RTA_PAYLOAD(rta[IFA_CACHEINFO-1]) < sizeof(*ci)) + return -EINVAL; + ci = RTA_DATA(rta[IFA_CACHEINFO-1]); + valid_lft = ci->ifa_valid; + prefered_lft = ci->ifa_prefered; + } + + return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen, + prefered_lft, valid_lft); + } /* Maximum length of ifa_cacheinfo attributes */ From 8f27ebb9823b7f6b7a67ab325b515f75ba51bf4c Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 28 Jul 2006 18:12:11 +0900 Subject: [PATCH 03/32] [IPV6] ADDRCONF: Do not verify an address with infinity lifetime We also do not try regenarating new temporary address corresponding to an address with infinite preferred lifetime. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c0641887bdeb..93a40a8ade88 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2797,12 +2797,16 @@ restart: ifp->idev->nd_parms->retrans_time / HZ; #endif - if (age >= ifp->valid_lft) { + if (ifp->valid_lft != INFINITY_LIFE_TIME && + age >= ifp->valid_lft) { spin_unlock(&ifp->lock); in6_ifa_hold(ifp); read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); goto restart; + } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { + spin_unlock(&ifp->lock); + continue; } else if (age >= ifp->prefered_lft) { /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */ int deprecate = 0; From 6c223828058bc45f070d35b63d4a819a8df0146d Mon Sep 17 00:00:00 2001 From: Noriaki TAKAMIYA Date: Fri, 28 Jul 2006 18:12:12 +0900 Subject: [PATCH 04/32] [IPV6] ADDRCONF: Support get operation of single address Based on MIPL2 kernel patch. Signed-off-by: Noriaki TAKAMIYA Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 59 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 93a40a8ade88..3ef3fe20283f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3165,6 +3165,62 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) return inet6_dump_addr(skb, cb, type); } +static int inet6_rtm_getaddr(struct sk_buff *in_skb, + struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in6_addr *addr = NULL; + struct net_device *dev = NULL; + struct inet6_ifaddr *ifa; + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + INET6_IFADDR_RTA_SPACE); + int err; + + if (rta[IFA_ADDRESS-1]) { + if (RTA_PAYLOAD(rta[IFA_ADDRESS-1]) < sizeof(*addr)) + return -EINVAL; + addr = RTA_DATA(rta[IFA_ADDRESS-1]); + } + if (rta[IFA_LOCAL-1]) { + if (RTA_PAYLOAD(rta[IFA_LOCAL-1]) < sizeof(*addr) || + (addr && memcmp(addr, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*addr)))) + return -EINVAL; + addr = RTA_DATA(rta[IFA_LOCAL-1]); + } + if (addr == NULL) + return -EINVAL; + + if (ifm->ifa_index) + dev = __dev_get_by_index(ifm->ifa_index); + + if ((ifa = ipv6_get_ifaddr(addr, dev, 1)) == NULL) + return -EADDRNOTAVAIL; + + if ((skb = alloc_skb(size, GFP_KERNEL)) == NULL) { + err = -ENOBUFS; + goto out; + } + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, RTM_NEWADDR, 0); + if (err < 0) { + err = -EMSGSIZE; + goto out_free; + } + + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + in6_ifa_put(ifa); + return err; +out_free: + kfree_skb(skb); + goto out; +} + static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) { struct sk_buff *skb; @@ -3407,7 +3463,8 @@ static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = { [RTM_GETLINK - RTM_BASE] = { .dumpit = inet6_dump_ifinfo, }, [RTM_NEWADDR - RTM_BASE] = { .doit = inet6_rtm_newaddr, }, [RTM_DELADDR - RTM_BASE] = { .doit = inet6_rtm_deladdr, }, - [RTM_GETADDR - RTM_BASE] = { .dumpit = inet6_dump_ifaddr, }, + [RTM_GETADDR - RTM_BASE] = { .doit = inet6_rtm_getaddr, + .dumpit = inet6_dump_ifaddr, }, [RTM_GETMULTICAST - RTM_BASE] = { .dumpit = inet6_dump_ifmcaddr, }, [RTM_GETANYCAST - RTM_BASE] = { .dumpit = inet6_dump_ifacaddr, }, [RTM_NEWROUTE - RTM_BASE] = { .doit = inet6_rtm_newroute, }, From 081bba5b3ace5698eccf2f1a378cd4a9a4c98a85 Mon Sep 17 00:00:00 2001 From: Noriaki TAKAMIYA Date: Fri, 28 Jul 2006 18:12:13 +0900 Subject: [PATCH 05/32] [IPV6] ADDRCONF: NLM_F_REPLACE support for RTM_NEWADDR Based on MIPL2 kernel patch. Signed-off-by: Noriaki YAKAMIYA Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3ef3fe20283f..8ea1e36bf8eb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2894,6 +2894,55 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return inet6_addr_del(ifm->ifa_index, pfx, ifm->ifa_prefixlen); } +static int +inet6_addr_modify(int ifindex, struct in6_addr *pfx, + __u32 prefered_lft, __u32 valid_lft) +{ + struct inet6_ifaddr *ifp = NULL; + struct net_device *dev; + int ifa_flags = 0; + + if ((dev = __dev_get_by_index(ifindex)) == NULL) + return -ENODEV; + + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + + if (!valid_lft || (prefered_lft > valid_lft)) + return -EINVAL; + + ifp = ipv6_get_ifaddr(pfx, dev, 1); + if (ifp == NULL) + return -ENOENT; + + if (valid_lft == INFINITY_LIFE_TIME) + ifa_flags = IFA_F_PERMANENT; + else if (valid_lft >= 0x7FFFFFFF/HZ) + valid_lft = 0x7FFFFFFF/HZ; + + if (prefered_lft == 0) + ifa_flags = IFA_F_DEPRECATED; + else if ((prefered_lft >= 0x7FFFFFFF/HZ) && + (prefered_lft != INFINITY_LIFE_TIME)) + prefered_lft = 0x7FFFFFFF/HZ; + + spin_lock_bh(&ifp->lock); + ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED|IFA_F_PERMANENT)) | ifa_flags; + + ifp->tstamp = jiffies; + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + + spin_unlock_bh(&ifp->lock); + if (!(ifp->flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + in6_ifa_put(ifp); + + addrconf_verify(0); + + return 0; +} + static int inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { @@ -2926,6 +2975,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) prefered_lft = ci->ifa_prefered; } + if (nlh->nlmsg_flags & NLM_F_REPLACE) { + int ret; + ret = inet6_addr_modify(ifm->ifa_index, pfx, + prefered_lft, valid_lft); + if (ret == 0 || !(nlh->nlmsg_flags & NLM_F_CREATE)) + return ret; + } + return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen, prefered_lft, valid_lft); From 679e898a4742d4a4a47430b67fd68a789a73dcfd Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 30 Jul 2006 20:19:11 -0700 Subject: [PATCH 06/32] [XFRM]: Fix protocol field value for outgoing IPv6 GSO packets Signed-off-by: Patrick McHardy Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/xfrm6_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 0eea60ea9ebc..c8c8b44a0f58 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -125,7 +125,7 @@ static int xfrm6_output_finish(struct sk_buff *skb) if (!skb_is_gso(skb)) return xfrm6_output_finish2(skb); - skb->protocol = htons(ETH_P_IP); + skb->protocol = htons(ETH_P_IPV6); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (unlikely(IS_ERR(segs))) From 497c615abad7ee81994dd592194535aea2aad617 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 30 Jul 2006 20:19:33 -0700 Subject: [PATCH 07/32] [IPV6]: Audit all ip6_dst_lookup/ip6_dst_store calls The current users of ip6_dst_lookup can be divided into two classes: 1) The caller holds no locks and is in user-context (UDP). 2) The caller does not want to lookup the dst cache at all. The second class covers everyone except UDP because most people do the cache lookup directly before calling ip6_dst_lookup. This patch adds ip6_sk_dst_lookup for the first class. Similarly ip6_dst_store users can be divded into those that need to take the socket dst lock and those that don't. This patch adds __ip6_dst_store for those (everyone except UDP/datagram) that don't need an extra lock. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/ip6_route.h | 12 +++- include/net/ipv6.h | 3 + net/dccp/ipv6.c | 4 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 120 +++++++++++++++++++++---------- net/ipv6/tcp_ipv6.c | 4 +- net/ipv6/udp.c | 2 +- 8 files changed, 100 insertions(+), 49 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ab29dafb1a6a..96b0e66406ec 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -139,16 +139,22 @@ extern rwlock_t rt6_lock; /* * Store a destination cache entry in a socket */ -static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - struct in6_addr *daddr) +static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst, + struct in6_addr *daddr) { struct ipv6_pinfo *np = inet6_sk(sk); struct rt6_info *rt = (struct rt6_info *) dst; - write_lock(&sk->sk_dst_lock); sk_setup_caps(sk, dst); np->daddr_cache = daddr; np->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; +} + +static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, + struct in6_addr *daddr) +{ + write_lock(&sk->sk_dst_lock); + __ip6_dst_store(sk, dst, daddr); write_unlock(&sk->sk_dst_lock); } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a8fdf7970b37..ece7e8a84ffd 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -468,6 +468,9 @@ extern void ip6_flush_pending_frames(struct sock *sk); extern int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl); +extern int ip6_sk_dst_lookup(struct sock *sk, + struct dst_entry **dst, + struct flowi *fl); /* * skb processing functions diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9f3d4d7cd0bf..610c722ac27f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -230,7 +230,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ipv6_addr_copy(&np->saddr, saddr); inet->rcv_saddr = LOOPBACK4_IPV6; - ip6_dst_store(sk, dst, NULL); + __ip6_dst_store(sk, dst, NULL); icsk->icsk_ext_hdr_len = 0; if (np->opt != NULL) @@ -863,7 +863,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, * comment in that function for the gory details. -acme */ - ip6_dst_store(newsk, dst, NULL); + __ip6_dst_store(newsk, dst, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); newdp6 = (struct dccp6_sock *)newsk; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5a0ba58b86cc..ac85e9c532c2 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -658,7 +658,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return err; } - ip6_dst_store(sk, dst, NULL); + __ip6_dst_store(sk, dst, NULL); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 5c950cc79d80..bf491077b822 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -185,7 +185,7 @@ int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) return err; } - ip6_dst_store(sk, dst, NULL); + __ip6_dst_store(sk, dst, NULL); } skb->dst = dst_clone(dst); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3bc74ce78800..5e74a37695f7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -723,48 +723,51 @@ fail: return err; } -int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +static struct dst_entry *ip6_sk_dst_check(struct sock *sk, + struct dst_entry *dst, + struct flowi *fl) { - int err = 0; + struct ipv6_pinfo *np = inet6_sk(sk); + struct rt6_info *rt = (struct rt6_info *)dst; - *dst = NULL; - if (sk) { - struct ipv6_pinfo *np = inet6_sk(sk); - - *dst = sk_dst_check(sk, np->dst_cookie); - if (*dst) { - struct rt6_info *rt = (struct rt6_info*)*dst; - - /* Yes, checking route validity in not connected - * case is not very simple. Take into account, - * that we do not support routing by source, TOS, - * and MSG_DONTROUTE --ANK (980726) - * - * 1. If route was host route, check that - * cached destination is current. - * If it is network route, we still may - * check its validity using saved pointer - * to the last used address: daddr_cache. - * We do not want to save whole address now, - * (because main consumer of this service - * is tcp, which has not this problem), - * so that the last trick works only on connected - * sockets. - * 2. oif also should be the same. - */ - if (((rt->rt6i_dst.plen != 128 || - !ipv6_addr_equal(&fl->fl6_dst, - &rt->rt6i_dst.addr)) - && (np->daddr_cache == NULL || - !ipv6_addr_equal(&fl->fl6_dst, - np->daddr_cache))) - || (fl->oif && fl->oif != (*dst)->dev->ifindex)) { - dst_release(*dst); - *dst = NULL; - } - } + if (!dst) + goto out; + + /* Yes, checking route validity in not connected + * case is not very simple. Take into account, + * that we do not support routing by source, TOS, + * and MSG_DONTROUTE --ANK (980726) + * + * 1. If route was host route, check that + * cached destination is current. + * If it is network route, we still may + * check its validity using saved pointer + * to the last used address: daddr_cache. + * We do not want to save whole address now, + * (because main consumer of this service + * is tcp, which has not this problem), + * so that the last trick works only on connected + * sockets. + * 2. oif also should be the same. + */ + if (((rt->rt6i_dst.plen != 128 || + !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr)) + && (np->daddr_cache == NULL || + !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache))) + || (fl->oif && fl->oif != dst->dev->ifindex)) { + dst_release(dst); + dst = NULL; } +out: + return dst; +} + +static int ip6_dst_lookup_tail(struct sock *sk, + struct dst_entry **dst, struct flowi *fl) +{ + int err; + if (*dst == NULL) *dst = ip6_route_output(sk, fl); @@ -773,7 +776,6 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) if (ipv6_addr_any(&fl->fl6_src)) { err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); - if (err) goto out_err_release; } @@ -786,8 +788,48 @@ out_err_release: return err; } +/** + * ip6_dst_lookup - perform route lookup on flow + * @sk: socket which provides route info + * @dst: pointer to dst_entry * for result + * @fl: flow to lookup + * + * This function performs a route lookup on the given flow. + * + * It returns zero on success, or a standard errno code on error. + */ +int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +{ + *dst = NULL; + return ip6_dst_lookup_tail(sk, dst, fl); +} EXPORT_SYMBOL_GPL(ip6_dst_lookup); +/** + * ip6_sk_dst_lookup - perform socket cached route lookup on flow + * @sk: socket which provides the dst cache and route info + * @dst: pointer to dst_entry * for result + * @fl: flow to lookup + * + * This function performs a route lookup on the given flow with the + * possibility of using the cached route in the socket if it is valid. + * It will take the socket dst lock when operating on the dst cache. + * As a result, this function can only be used in process context. + * + * It returns zero on success, or a standard errno code on error. + */ +int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +{ + *dst = NULL; + if (sk) { + *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); + *dst = ip6_sk_dst_check(sk, *dst, fl); + } + + return ip6_dst_lookup_tail(sk, dst, fl); +} +EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup); + static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 923989d0520d..b76fd7fba5f5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -270,7 +270,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL); + __ip6_dst_store(sk, dst, NULL); icsk->icsk_ext_hdr_len = 0; if (np->opt) @@ -947,7 +947,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(newsk, dst, NULL); + __ip6_dst_store(newsk, dst, NULL); newtcp6sk = (struct tcp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ccc57f434cd3..3d54f246411e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -782,7 +782,7 @@ do_udp_sendmsg: connected = 0; } - err = ip6_dst_lookup(sk, &dst, fl); + err = ip6_sk_dst_lookup(sk, &dst, fl); if (err) goto out; if (final_p) From 9cd3ecd674cf3194e07435b5b9559c4d432026d5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 30 Jul 2006 20:20:07 -0700 Subject: [PATCH 08/32] [NETFILTER]: include/linux/netfilter_bridge.h: header cleanup Header doesn't use anything from atomic.h. It fixes headers_check warning: include/linux/netfilter_bridge.h requires asm/atomic.h, which does not exist Compile tested on alpha arm i386-up sparc sparc64-up x86_64 alpha-up i386 sparc64 sparc-up x86_64-up Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/netfilter_bridge.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 31f02ba036ce..10c13dc4665b 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -6,7 +6,6 @@ #include #if defined(__KERNEL__) && defined(CONFIG_BRIDGE_NETFILTER) -#include #include #endif From f4d26fb336f3c08066bffbe907d3104be4fb91a8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 30 Jul 2006 20:20:28 -0700 Subject: [PATCH 09/32] [NET]: Fix ___pskb_trim when entire frag_list needs dropping When the trim point is within the head and there is no paged data, ___pskb_trim fails to drop the first element in the frag_list. This patch fixes this by moving the len <= offset case out of the page data loop. This patch also adds a missing kfree_skb on the frag that we just cloned. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 476aa3978504..d236f02c6467 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -846,7 +846,11 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) return err; - for (i = 0; i < nfrags; i++) { + i = 0; + if (offset >= len) + goto drop_pages; + + for (; i < nfrags; i++) { int end = offset + skb_shinfo(skb)->frags[i].size; if (end < len) { @@ -854,9 +858,9 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) continue; } - if (len > offset) - skb_shinfo(skb)->frags[i++].size = len - offset; + skb_shinfo(skb)->frags[i++].size = len - offset; +drop_pages: skb_shinfo(skb)->nr_frags = i; for (; i < nfrags; i++) @@ -864,7 +868,7 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) if (skb_shinfo(skb)->frag_list) skb_drop_fraglist(skb); - break; + goto done; } for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); @@ -879,6 +883,7 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) return -ENOMEM; nfrag->next = frag->next; + kfree_skb(frag); frag = nfrag; *fragp = frag; } @@ -897,6 +902,7 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) break; } +done: if (len > skb_headlen(skb)) { skb->data_len -= skb->len - len; skb->len = len; From 9cac2c35e26cc44978df654306bb92d7cfe7e2de Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 30 Jul 2006 20:20:54 -0700 Subject: [PATCH 10/32] [ATALK]: Make CONFIG_DEV_APPLETALK a tristate. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise we allow building appletalk drivers in-kernel when CONFIG_ATALK is modular. That doesn't work because these drivers use symbols such as "alloc_talkdev" which is exported from code built by CONFIG_ATALK. Noticed by Toralf Förster. Signed-off-by: David S. Miller --- drivers/net/appletalk/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/appletalk/Kconfig b/drivers/net/appletalk/Kconfig index b14e89004c3a..0a0e0cd81a23 100644 --- a/drivers/net/appletalk/Kconfig +++ b/drivers/net/appletalk/Kconfig @@ -29,7 +29,7 @@ config ATALK even politically correct people are allowed to say Y here. config DEV_APPLETALK - bool "Appletalk interfaces support" + tristate "Appletalk interfaces support" depends on ATALK help AppleTalk is the protocol that Apple computers can use to communicate From 118075b3cdc90e0815362365f3fc64d672ace0d6 Mon Sep 17 00:00:00 2001 From: James Morris Date: Sun, 30 Jul 2006 20:21:45 -0700 Subject: [PATCH 11/32] [TCP]: fix memory leak in net/ipv4/tcp_probe.c::tcpprobe_read() Based upon a patch by Jesper Juhl. Signed-off-by: James Morris Acked-by: Stephen Hemminger Acked-by: Jesper Juhl Signed-off-by: David S. Miller --- net/ipv4/tcp_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index d7d517a3a238..b3435324b573 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -114,7 +114,7 @@ static int tcpprobe_open(struct inode * inode, struct file * file) static ssize_t tcpprobe_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { - int error = 0, cnt; + int error = 0, cnt = 0; unsigned char *tbuf; if (!buf || len < 0) From 3687b1dc6fe83a500ba4d3235704594f6a111a2d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 30 Jul 2006 20:35:54 -0700 Subject: [PATCH 12/32] [TCP]: SNMPv2 tcpAttemptFails counter error Refer to RFC2012, tcpAttemptFails is defined as following: tcpAttemptFails OBJECT-TYPE SYNTAX Counter32 MAX-ACCESS read-only STATUS current DESCRIPTION "The number of times TCP connections have made a direct transition to the CLOSED state from either the SYN-SENT state or the SYN-RCVD state, plus the number of times TCP connections have made a direct transition to the LISTEN state from the SYN-RCVD state." ::= { tcp 7 } When I lookup into RFC793, I found that the state change should occured under following condition: 1. SYN-SENT -> CLOSED a) Received ACK,RST segment when SYN-SENT state. 2. SYN-RCVD -> CLOSED b) Received SYN segment when SYN-RCVD state(came from LISTEN). c) Received RST segment when SYN-RCVD state(came from SYN-SENT). d) Received SYN segment when SYN-RCVD state(came from SYN-SENT). 3. SYN-RCVD -> LISTEN e) Received RST segment when SYN-RCVD state(came from LISTEN). In my test, those direct state transition can not be counted to tcpAttemptFails. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +++ net/ipv4/tcp_ipv4.c | 2 -- net/ipv4/tcp_minisocks.c | 4 +++- net/ipv6/tcp_ipv6.c | 2 -- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 0720bddff1e9..7a093d0aa0fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -914,6 +914,9 @@ static inline void tcp_set_state(struct sock *sk, int state) static inline void tcp_done(struct sock *sk) { + if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f6f39e814291..4b04c3edd4a9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -438,7 +438,6 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) It can f.e. if SYNs crossed. */ if (!sock_owned_by_user(sk)) { - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); sk->sk_err = err; sk->sk_error_report(sk); @@ -874,7 +873,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) drop_and_free: reqsk_free(req); drop: - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); return 0; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0ccb7cb22b15..624e2b2c7f53 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -589,8 +589,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* RFC793: "second check the RST bit" and * "fourth, check the SYN bit" */ - if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) + if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; + } /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b76fd7fba5f5..b843a650be71 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -427,7 +427,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case TCP_SYN_RECV: /* Cannot happen. It can, it SYNs are crossed. --ANK */ if (!sock_owned_by_user(sk)) { - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ @@ -831,7 +830,6 @@ drop: if (req) reqsk_free(req); - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); return 0; /* don't send reset */ } From 792d1932e319ff8ba01361e7d151b1794c55c31f Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Jul 2006 20:43:26 -0700 Subject: [PATCH 13/32] [NET]: Network Event Notifier Mechanism. This patch uses notifier blocks to implement a network event notifier mechanism. Clients register their callback function by calling register_netevent_notifier() like this: static struct notifier_block nb = { .notifier_call = my_callback_func }; ... register_netevent_notifier(&nb); Signed-off-by: Tom Tucker Signed-off-by: Steve Wise Signed-off-by: David S. Miller --- include/net/netevent.h | 33 ++++++++++++++++++++ net/core/netevent.c | 69 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 include/net/netevent.h create mode 100644 net/core/netevent.c diff --git a/include/net/netevent.h b/include/net/netevent.h new file mode 100644 index 000000000000..e5d216241423 --- /dev/null +++ b/include/net/netevent.h @@ -0,0 +1,33 @@ +#ifndef _NET_EVENT_H +#define _NET_EVENT_H + +/* + * Generic netevent notifiers + * + * Authors: + * Tom Tucker + * Steve Wise + * + * Changes: + */ +#ifdef __KERNEL__ + +#include + +struct netevent_redirect { + struct dst_entry *old; + struct dst_entry *new; +}; + +enum netevent_notif_type { + NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ + NETEVENT_PMTU_UPDATE, /* arg is struct dst_entry ptr */ + NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ +}; + +extern int register_netevent_notifier(struct notifier_block *nb); +extern int unregister_netevent_notifier(struct notifier_block *nb); +extern int call_netevent_notifiers(unsigned long val, void *v); + +#endif +#endif diff --git a/net/core/netevent.c b/net/core/netevent.c new file mode 100644 index 000000000000..35d02c38554e --- /dev/null +++ b/net/core/netevent.c @@ -0,0 +1,69 @@ +/* + * Network event notifiers + * + * Authors: + * Tom Tucker + * Steve Wise + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + */ + +#include +#include + +static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); + +/** + * register_netevent_notifier - register a netevent notifier block + * @nb: notifier + * + * Register a notifier to be called when a netevent occurs. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + */ +int register_netevent_notifier(struct notifier_block *nb) +{ + int err; + + err = atomic_notifier_chain_register(&netevent_notif_chain, nb); + return err; +} + +/** + * netevent_unregister_notifier - unregister a netevent notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_neigh_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + */ + +int unregister_netevent_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&netevent_notif_chain, nb); +} + +/** + * call_netevent_notifiers - call all netevent notifier blocks + * @val: value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + * Call all neighbour notifier blocks. Parameters and return value + * are as for notifier_call_chain(). + */ + +int call_netevent_notifiers(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&netevent_notif_chain, val, v); +} + +EXPORT_SYMBOL_GPL(register_netevent_notifier); +EXPORT_SYMBOL_GPL(unregister_netevent_notifier); +EXPORT_SYMBOL_GPL(call_netevent_notifiers); From 8d71740c56a9058acc4378504a356d543ff1308b Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Jul 2006 20:43:36 -0700 Subject: [PATCH 14/32] [NET]: Core net changes to generate netevents Generate netevents for: - neighbour changes - routing redirects - pmtu changes Signed-off-by: Tom Tucker Signed-off-by: Steve Wise Signed-off-by: David S. Miller --- net/core/Makefile | 2 +- net/core/neighbour.c | 14 ++++++++------ net/ipv4/route.c | 8 ++++++++ net/ipv6/route.c | 7 +++++++ 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/net/core/Makefile b/net/core/Makefile index e9bd2467d5a9..2645ba428d48 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -7,7 +7,7 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o ethtool.o dev_mcast.o dst.o \ +obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o obj-$(CONFIG_XFRM) += flow.o diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7ad681f5e712..5130d2efdbbe 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -754,6 +755,7 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_STALE; neigh->updated = jiffies; neigh_suspect(neigh); + notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, @@ -762,6 +764,7 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_REACHABLE; neigh->updated = jiffies; neigh_connect(neigh); + notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { NEIGH_PRINTK2("neigh %p is probed.\n", neigh); @@ -819,6 +822,8 @@ static void neigh_timer_handler(unsigned long arg) out: write_unlock(&neigh->lock); } + if (notify) + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); #ifdef CONFIG_ARPD if (notify && neigh->parms->app_probes) @@ -926,9 +931,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, { u8 old; int err; -#ifdef CONFIG_ARPD int notify = 0; -#endif struct net_device *dev; int update_isrouter = 0; @@ -948,9 +951,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh_suspect(neigh); neigh->nud_state = new; err = 0; -#ifdef CONFIG_ARPD notify = old & NUD_VALID; -#endif goto out; } @@ -1022,9 +1023,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (neigh->parms->base_reachable_time << 1); -#ifdef CONFIG_ARPD notify = 1; -#endif } if (new == old) goto out; @@ -1056,6 +1055,9 @@ out: (neigh->flags & ~NTF_ROUTER); } write_unlock_bh(&neigh->lock); + + if (notify) + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); #ifdef CONFIG_ARPD if (notify && neigh->parms->app_probes) neigh_app_notify(neigh); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2dc6dbb28467..19bd49d69d9f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -104,6 +104,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL #include #endif @@ -1125,6 +1126,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, struct rtable *rth, **rthp; u32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; + struct netevent_redirect netevent; if (!in_dev) return; @@ -1216,6 +1218,11 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, rt_drop(rt); goto do_next; } + + netevent.old = &rth->u.dst; + netevent.new = &rt->u.dst; + call_netevent_notifiers(NETEVENT_REDIRECT, + &netevent); rt_del(hash, rth); if (!rt_intern_hash(hash, rt, &rt)) @@ -1452,6 +1459,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) } dst->metrics[RTAX_MTU-1] = mtu; dst_set_expires(dst, ip_rt_mtu_expires); + call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 87c39c978cd0..4b163711f3a8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -53,6 +53,7 @@ #include #include #include +#include #include @@ -742,6 +743,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; } dst->metrics[RTAX_MTU-1] = mtu; + call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); } } @@ -1155,6 +1157,7 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, struct rt6_info *rt, *nrt = NULL; int strict; struct fib6_node *fn; + struct netevent_redirect netevent; /* * Get the "current" route for this destination and @@ -1252,6 +1255,10 @@ restart: if (ip6_ins_rt(nrt, NULL, NULL, NULL)) goto out; + netevent.old = &rt->u.dst; + netevent.new = &nrt->u.dst; + call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); + if (rt->rt6i_flags&RTF_CACHE) { ip6_del_rt(rt, NULL, NULL, NULL); return; From e795d092507d571d66f2ec98d3efdc7dd284bf80 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Jul 2006 20:44:19 -0700 Subject: [PATCH 15/32] [NET] infiniband: Cleanup ib_addr module to use the netevents Signed-off-by: Tom Tucker Signed-off-by: Steve Wise Signed-off-by: David S. Miller --- drivers/infiniband/core/addr.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index d294bbc42f09..1205e8027829 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -35,6 +35,7 @@ #include #include #include +#include #include MODULE_AUTHOR("Sean Hefty"); @@ -326,25 +327,22 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) } EXPORT_SYMBOL(rdma_addr_cancel); -static int addr_arp_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pkt, struct net_device *orig_dev) +static int netevent_callback(struct notifier_block *self, unsigned long event, + void *ctx) { - struct arphdr *arp_hdr; + if (event == NETEVENT_NEIGH_UPDATE) { + struct neighbour *neigh = ctx; - arp_hdr = (struct arphdr *) skb->nh.raw; - - if (arp_hdr->ar_op == htons(ARPOP_REQUEST) || - arp_hdr->ar_op == htons(ARPOP_REPLY)) - set_timeout(jiffies); - - kfree_skb(skb); + if (neigh->dev->type == ARPHRD_INFINIBAND && + (neigh->nud_state & NUD_VALID)) { + set_timeout(jiffies); + } + } return 0; } -static struct packet_type addr_arp = { - .type = __constant_htons(ETH_P_ARP), - .func = addr_arp_recv, - .af_packet_priv = (void*) 1, +static struct notifier_block nb = { + .notifier_call = netevent_callback }; static int addr_init(void) @@ -353,13 +351,13 @@ static int addr_init(void) if (!addr_wq) return -ENOMEM; - dev_add_pack(&addr_arp); + register_netevent_notifier(&nb); return 0; } static void addr_cleanup(void) { - dev_remove_pack(&addr_arp); + unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } From a280b89982f48e9a32c6410a37419b12ca88af6b Mon Sep 17 00:00:00 2001 From: James Morris Date: Sun, 30 Jul 2006 20:46:38 -0700 Subject: [PATCH 16/32] [SECURITY] secmark: nul-terminate secdata The patch below fixes a problem in the iptables SECMARK target, where the user-supplied 'selctx' string may not be nul-terminated. From initial analysis, it seems that the strlen() called from selinux_string_to_sid() could run until it arbitrarily finds a zero, and possibly cause a kernel oops before then. The impact of this appears limited because the operation requires CAP_NET_ADMIN, which is essentially always root. Also, the module is not yet in wide use. Signed-off-by: James Morris Signed-off-by: Stephen Smalley Signed-off-by: David S. Miller --- net/netfilter/xt_SECMARK.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index c2ce9c4011cc..de9537ad9a7c 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -57,6 +57,8 @@ static int checkentry_selinux(struct xt_secmark_target_info *info) { int err; struct xt_secmark_target_selinux_info *sel = &info->u.sel; + + sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0'; err = selinux_string_to_sid(sel->selctx, &sel->selsid); if (err) { From 52499afe40387524e9f46ef9ce4695efccdd2ed9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 31 Jul 2006 22:32:09 -0700 Subject: [PATCH 17/32] [TCP]: Process linger2 timeout consistently. Based upon guidance from Alexey Kuznetsov. When linger2 is active, we check to see if the fin_wait2 timeout is longer than the timewait. If it is, we schedule the keepalive timer for the difference between the timewait timeout and the fin_wait2 timeout. When this orphan socket is seen by tcp_keepalive_timer() it will try to transform this fin_wait2 socket into a fin_wait2 mini-socket, again if linger2 is active. Not all paths were setting this initial keepalive timer correctly. The tcp input path was doing it correctly, but tcp_close() wasn't, potentially making the socket linger longer than it really needs to. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f6a2d9223d07..7b621e44b124 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1659,7 +1659,8 @@ adjudge_to_death: const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); + inet_csk_reset_keepalive_timer(sk, + tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; From 8af2745645243b5e5b031504a643bf2158571dc7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Jul 2006 22:35:23 -0700 Subject: [PATCH 18/32] [NET]: Add netdev_alloc_skb(). Add a dev_alloc_skb variant that takes a struct net_device * paramater. For now that paramater is unused, but I'll use it to allocate the skb from node-local memory in a follow-up patch. Also there have been some other plans mentioned on the list that can use it. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/skbuff.h | 22 ++++++++++++++++++++++ net/core/skbuff.c | 24 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4307e764ef0a..b91fbfb7d54c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1104,6 +1104,28 @@ static inline struct sk_buff *dev_alloc_skb(unsigned int length) return __dev_alloc_skb(length, GFP_ATOMIC); } +extern struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask); + +/** + * netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. Although this function + * allocates memory it can be called from an interrupt. + */ +static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev, + unsigned int length) +{ + return __netdev_alloc_skb(dev, length, GFP_ATOMIC); +} + /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d236f02c6467..71487b915d67 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -256,6 +256,29 @@ nodata: goto out; } +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + skb = alloc_skb(length + NET_SKB_PAD, gfp_mask); + if (likely(skb)) + skb_reserve(skb, NET_SKB_PAD); + return skb; +} static void skb_drop_list(struct sk_buff **listp) { @@ -2048,6 +2071,7 @@ EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(kfree_skb); EXPORT_SYMBOL(__pskb_pull_tail); EXPORT_SYMBOL(__alloc_skb); +EXPORT_SYMBOL(__netdev_alloc_skb); EXPORT_SYMBOL(pskb_copy); EXPORT_SYMBOL(pskb_expand_head); EXPORT_SYMBOL(skb_checksum); From a20e9c6291f27cac4a9ab450d124794c012f87d4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 31 Jul 2006 22:38:16 -0700 Subject: [PATCH 19/32] [TG3]: Convert to netdev_alloc_skb Signed-off-by: David S. Miller --- drivers/net/tg3.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 1b8138f641e3..6f97962dd06b 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -68,8 +68,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.63" -#define DRV_MODULE_RELDATE "July 25, 2006" +#define DRV_MODULE_VERSION "3.64" +#define DRV_MODULE_RELDATE "July 31, 2006" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 @@ -3097,7 +3097,7 @@ static int tg3_alloc_rx_skb(struct tg3 *tp, u32 opaque_key, * Callers depend upon this behavior and assume that * we leave everything unchanged if we fail. */ - skb = dev_alloc_skb(skb_size); + skb = netdev_alloc_skb(tp->dev, skb_size); if (skb == NULL) return -ENOMEM; @@ -3270,7 +3270,7 @@ static int tg3_rx(struct tg3 *tp, int budget) tg3_recycle_rx(tp, opaque_key, desc_idx, *post_ptr); - copy_skb = dev_alloc_skb(len + 2); + copy_skb = netdev_alloc_skb(tp->dev, len + 2); if (copy_skb == NULL) goto drop_it_no_recycle; @@ -8618,7 +8618,7 @@ static int tg3_run_loopback(struct tg3 *tp, int loopback_mode) err = -EIO; tx_len = 1514; - skb = dev_alloc_skb(tx_len); + skb = netdev_alloc_skb(tp->dev, tx_len); if (!skb) return -ENOMEM; From 87f5032e0ca149bd03f0e2b46071b0c4a2312e82 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 31 Jul 2006 22:39:40 -0700 Subject: [PATCH 20/32] [E1000]: Convert to netdev_alloc_skb Signed-off-by: David S. Miller --- drivers/net/e1000/e1000_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index da62db897426..627f224d78bc 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -3127,7 +3127,7 @@ e1000_change_mtu(struct net_device *netdev, int new_mtu) break; } - /* NOTE: dev_alloc_skb reserves 16 bytes, and typically NET_IP_ALIGN + /* NOTE: netdev_alloc_skb reserves 16 bytes, and typically NET_IP_ALIGN * means we reserve 2 more, this pushes us to allocate from the next * larger slab size * i.e. RXBUFFER_2048 --> size-4096 slab */ @@ -3708,7 +3708,7 @@ e1000_clean_rx_irq(struct e1000_adapter *adapter, #define E1000_CB_LENGTH 256 if (length < E1000_CB_LENGTH) { struct sk_buff *new_skb = - dev_alloc_skb(length + NET_IP_ALIGN); + netdev_alloc_skb(netdev, length + NET_IP_ALIGN); if (new_skb) { skb_reserve(new_skb, NET_IP_ALIGN); new_skb->dev = netdev; @@ -3979,7 +3979,7 @@ e1000_alloc_rx_buffers(struct e1000_adapter *adapter, while (cleaned_count--) { if (!(skb = buffer_info->skb)) - skb = dev_alloc_skb(bufsz); + skb = netdev_alloc_skb(netdev, bufsz); else { skb_trim(skb, 0); goto map_skb; @@ -3997,7 +3997,7 @@ e1000_alloc_rx_buffers(struct e1000_adapter *adapter, DPRINTK(RX_ERR, ERR, "skb align check failed: %u bytes " "at %p\n", bufsz, skb->data); /* Try again, without freeing the previous */ - skb = dev_alloc_skb(bufsz); + skb = netdev_alloc_skb(netdev, bufsz); /* Failed allocation, critical failure */ if (!skb) { dev_kfree_skb(oldskb); @@ -4121,7 +4121,8 @@ e1000_alloc_rx_buffers_ps(struct e1000_adapter *adapter, rx_desc->read.buffer_addr[j+1] = ~0; } - skb = dev_alloc_skb(adapter->rx_ps_bsize0 + NET_IP_ALIGN); + skb = netdev_alloc_skb(netdev, + adapter->rx_ps_bsize0 + NET_IP_ALIGN); if (unlikely(!skb)) { adapter->alloc_rx_buff_failed++; From b10866fd7dd9ae9b8dd03646d28702a76d624474 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 31 Jul 2006 23:46:18 -0700 Subject: [PATCH 21/32] [NETFILTER]: SIP helper: expect RTP streams in both directions Since we don't know in which direction the first packet will arrive, we need to create one expectation for each direction, which is currently prevented by max_expected beeing set to 1. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_sip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c index fc87ce0da40d..4f222d6be009 100644 --- a/net/ipv4/netfilter/ip_conntrack_sip.c +++ b/net/ipv4/netfilter/ip_conntrack_sip.c @@ -442,7 +442,7 @@ static int __init init(void) sip[i].tuple.src.u.udp.port = htons(ports[i]); sip[i].mask.src.u.udp.port = 0xFFFF; sip[i].mask.dst.protonum = 0xFF; - sip[i].max_expected = 1; + sip[i].max_expected = 2; sip[i].timeout = 3 * 60; /* 3 minutes */ sip[i].me = THIS_MODULE; sip[i].help = sip_help; From 3ab720881b6e36bd5190a3a11cee8d8d067c4ad7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 31 Jul 2006 23:47:31 -0700 Subject: [PATCH 22/32] [NETFILTER]: xt_hashlimit/xt_string: missing string validation The hashlimit table name and the textsearch algorithm need to be terminated, the textsearch pattern length must not exceed the maximum size. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_hashlimit.c | 3 +++ net/netfilter/xt_string.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 92980ab8ce48..6b662449e825 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -508,6 +508,9 @@ hashlimit_checkentry(const char *tablename, if (!r->cfg.expire) return 0; + if (r->name[sizeof(r->name) - 1] != '\0') + return 0; + /* This is the best we've got: We cannot release and re-grab lock, * since checkentry() is called before ip_tables.c grabs ipt_mutex. * We also cannot grab the hashtable spinlock, since htable_create will diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 0ebb6ac2c8c7..d8e3891b5f8b 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -55,7 +55,10 @@ static int checkentry(const char *tablename, /* Damn, can't handle this case properly with iptables... */ if (conf->from_offset > conf->to_offset) return 0; - + if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0') + return 0; + if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) + return 0; ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, GFP_KERNEL, TS_AUTOLOAD); if (IS_ERR(ts_conf)) From b60dfc6c20bd5f19de0083362ce377c89b1e5a24 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 1 Aug 2006 00:00:12 -0700 Subject: [PATCH 23/32] [NET]: Kill the WARN_ON() calls for checksum fixups. We have a more complete solution in the works, involving the seperation of CHECKSUM_HW on input vs. output, and having netfilter properly do incremental checksums. But that is a very involved patch and is thus 2.6.19 material. What we have now is infinitely better than the past, wherein all TSO packets were dropped due to corrupt checksums as soon at the NAT module was loaded. At least now, the checksums do get fixed up, it just isn't the cleanest nor most optimal solution. Signed-off-by: David S. Miller --- net/core/dev.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 4d2b5167d7f5..5b630cece707 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1166,11 +1166,6 @@ int skb_checksum_help(struct sk_buff *skb, int inward) goto out_set_summed; if (unlikely(skb_shinfo(skb)->gso_size)) { - static int warned; - - WARN_ON(!warned); - warned = 1; - /* Let GSO fix up the checksum. */ goto out_set_summed; } @@ -1220,11 +1215,6 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) __skb_pull(skb, skb->mac_len); if (unlikely(skb->ip_summed != CHECKSUM_HW)) { - static int warned; - - WARN_ON(!warned); - warned = 1; - if (skb_header_cloned(skb) && (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) return ERR_PTR(err); From 32c524d1c48b62be49fa1b1dd93fed10792debc0 Mon Sep 17 00:00:00 2001 From: Wei Dong Date: Wed, 2 Aug 2006 13:39:57 -0700 Subject: [PATCH 24/32] [IPV6]: SNMPv2 "ipv6IfStatsInHdrErrors" counter error When I tested Linux kernel 2.6.17.7 about statistics "ipv6IfStatsInHdrErrors", found that this counter couldn't increase correctly. The criteria is RFC2465: ipv6IfStatsInHdrErrors OBJECT-TYPE SYNTAX Counter3 MAX-ACCESS read-only STATUS current DESCRIPTION "The number of input datagrams discarded due to errors in their IPv6 headers, including version number mismatch, other format errors, hop count exceeded, errors discovered in processing their IPv6 options, etc." ::= { ipv6IfStatsEntry 2 } When I send TTL=0 and TTL=1 a packet to a router which need to be forwarded, router just sends an ICMPv6 message to tell the sender that TIME_EXCEED and HOPLIMITS, but no increments for this counter(in the function ip6_forward). Signed-off-by: Wei Dong Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5e74a37695f7..70c9234b70e7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -356,6 +356,7 @@ int ip6_forward(struct sk_buff *skb) skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0, skb->dev); + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; From dafee490858f79e144c5e6cdd84ceb9efa20a3f1 Mon Sep 17 00:00:00 2001 From: Wei Dong Date: Wed, 2 Aug 2006 13:41:21 -0700 Subject: [PATCH 25/32] [IPV6]: SNMPv2 "ipv6IfStatsOutFragCreates" counter error When I tested linux kernel 2.6.71.7 about statistics "ipv6IfStatsOutFragCreates", and found that it couldn't increase correctly. The criteria is RFC 2465: ipv6IfStatsOutFragCreates OBJECT-TYPE SYNTAX Counter32 MAX-ACCESS read-only STATUS current DESCRIPTION "The number of output datagram fragments that have been generated as a result of fragmentation at this output interface." ::= { ipv6IfStatsEntry 15 } I think there are two issues in Linux kernel. 1st: RFC2465 specifies the counter is "The number of output datagram fragments...". I think increasing this counter after output a fragment successfully is better. And it should not be increased even though a fragment is created but failed to output. 2nd: If we send a big ICMP/ICMPv6 echo request to a host, and receive ICMP/ICMPv6 echo reply consisted of some fragments. As we know that in Linux kernel first fragmentation occurs in ICMP layer(maybe saying transport layer is better), but this is not the "real" fragmentation,just do some "pre-fragment" -- allocate space for date, and form a frag_list, etc. The "real" fragmentation happens in IP layer -- set offset and MF flag and so on. So I think in "fast path" for ip_fragment/ip6_fragment, if we send a fragment which "pre-fragment" by upper layer we should also increase "ipv6IfStatsOutFragCreates". Signed-off-by: Wei Dong Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 7 ++++--- net/ipv6/ip6_output.c | 8 +++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7c9f9a6421b8..9bf307a29783 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -526,6 +526,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); + if (!err) + IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -649,9 +651,6 @@ slow_path: /* * Put this fragment into the sending queue. */ - - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); - iph->tot_len = htons(len + hlen); ip_send_check(iph); @@ -659,6 +658,8 @@ slow_path: err = output(skb2); if (err) goto fail; + + IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); IP_INC_STATS(IPSTATS_MIB_FRAGOKS); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 70c9234b70e7..69451af6abe7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -596,6 +596,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } err = output(skb); + if(!err) + IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES); + if (err || !frag) break; @@ -707,12 +710,11 @@ slow_path: /* * Put this fragment into the sending queue. */ - - IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES); - err = output(frag); if (err) goto fail; + + IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); IP6_INC_STATS(IPSTATS_MIB_FRAGOKS); From 76f10ad0e67cbc6ded2ee143e5188e0b7ff9fb15 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 2 Aug 2006 14:06:55 -0700 Subject: [PATCH 26/32] [NET]: Remove lockdep_set_class() call from skb_queue_head_init(). The skb_queue_head_init() function is used both in drivers for private use and in the core networking code. The usage models are vastly set of functions that is only softirq safe; while the driver usage tends to be more limited to a few hardirq safe accessor functions. Rather than annotating all 133+ driver usages, for now just split this lock into a per queue class. This change is obviously safe and probably should make 2.6.18. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b91fbfb7d54c..b22d4a6083bd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -606,10 +606,17 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) extern struct lock_class_key skb_queue_lock_key; +/* + * This function creates a split out lock class for each invocation; + * this is needed for now since a whole lot of users of the skb-queue + * infrastructure in drivers have different locking usage (in hardirq) + * than the networking core (in softirq only). In the long run either the + * network layer or drivers should need annotation to consolidate the + * main types of usage into 3 classes. + */ static inline void skb_queue_head_init(struct sk_buff_head *list) { spin_lock_init(&list->lock); - lockdep_set_class(&list->lock, &skb_queue_lock_key); list->prev = list->next = (struct sk_buff *)list; list->qlen = 0; } From 2b7e24b66d31d677d76b49918e711eb360c978b6 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 Aug 2006 14:07:58 -0700 Subject: [PATCH 27/32] [NET]: skb_queue_lock_key() is no longer used. Signed-off-by: Adrian Bunk Acked-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- net/core/skbuff.c | 7 ------- 2 files changed, 9 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b22d4a6083bd..19c96d498e20 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -604,8 +604,6 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) return list_->qlen; } -extern struct lock_class_key skb_queue_lock_key; - /* * This function creates a split out lock class for each invocation; * this is needed for now since a whole lot of users of the skb-queue diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 71487b915d67..022d8894c11d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,13 +70,6 @@ static kmem_cache_t *skbuff_head_cache __read_mostly; static kmem_cache_t *skbuff_fclone_cache __read_mostly; -/* - * lockdep: lock class key used by skb_queue_head_init(): - */ -struct lock_class_key skb_queue_lock_key; - -EXPORT_SYMBOL(skb_queue_lock_key); - /* * Keep out-of-line to prevent kernel bloat. * __builtin_return_address is not used because it is not always From dc49c1f94e3469d94b952e8f5160dd4ccd791d79 Mon Sep 17 00:00:00 2001 From: Catherine Zhang Date: Wed, 2 Aug 2006 14:12:06 -0700 Subject: [PATCH 28/32] [AF_UNIX]: Kernel memory leak fix for af_unix datagram getpeersec patch From: Catherine Zhang This patch implements a cleaner fix for the memory leak problem of the original unix datagram getpeersec patch. Instead of creating a security context each time a unix datagram is sent, we only create the security context when the receiver requests it. This new design requires modification of the current unix_getsecpeer_dgram LSM hook and addition of two new hooks, namely, secid_to_secctx and release_secctx. The former retrieves the security context and the latter releases it. A hook is required for releasing the security context because it is up to the security module to decide how that's done. In the case of Selinux, it's a simple kfree operation. Acked-by: Stephen Smalley Signed-off-by: David S. Miller --- include/linux/security.h | 41 ++++++++++++++++++++++++++++++++++------ include/net/af_unix.h | 6 ++---- include/net/scm.h | 29 ++++++++++++++++++++++++---- net/ipv4/ip_sockglue.c | 9 +++++++-- net/unix/af_unix.c | 17 +++++------------ security/dummy.c | 14 ++++++++++++-- security/selinux/hooks.c | 38 +++++++++++++++++++++++-------------- 7 files changed, 110 insertions(+), 44 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index f75303831d09..aa5b8043dc38 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1109,6 +1109,16 @@ struct swap_info_struct; * @name contains the name of the security module being unstacked. * @ops contains a pointer to the struct security_operations of the module to unstack. * + * @secid_to_secctx: + * Convert secid to security context. + * @secid contains the security ID. + * @secdata contains the pointer that stores the converted security context. + * + * @release_secctx: + * Release the security context. + * @secdata contains the security context. + * @seclen contains the length of the security context. + * * This is the main security structure. */ struct security_operations { @@ -1289,6 +1299,8 @@ struct security_operations { int (*getprocattr)(struct task_struct *p, char *name, void *value, size_t size); int (*setprocattr)(struct task_struct *p, char *name, void *value, size_t size); + int (*secid_to_secctx)(u32 secid, char **secdata, u32 *seclen); + void (*release_secctx)(char *secdata, u32 seclen); #ifdef CONFIG_SECURITY_NETWORK int (*unix_stream_connect) (struct socket * sock, @@ -1317,7 +1329,7 @@ struct security_operations { int (*socket_shutdown) (struct socket * sock, int how); int (*socket_sock_rcv_skb) (struct sock * sk, struct sk_buff * skb); int (*socket_getpeersec_stream) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len); - int (*socket_getpeersec_dgram) (struct sk_buff *skb, char **secdata, u32 *seclen); + int (*socket_getpeersec_dgram) (struct socket *sock, struct sk_buff *skb, u32 *secid); int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority); void (*sk_free_security) (struct sock *sk); unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir); @@ -2059,6 +2071,16 @@ static inline int security_netlink_recv(struct sk_buff * skb, int cap) return security_ops->netlink_recv(skb, cap); } +static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) +{ + return security_ops->secid_to_secctx(secid, secdata, seclen); +} + +static inline void security_release_secctx(char *secdata, u32 seclen) +{ + return security_ops->release_secctx(secdata, seclen); +} + /* prototypes */ extern int security_init (void); extern int register_security (struct security_operations *ops); @@ -2725,6 +2747,15 @@ static inline void securityfs_remove(struct dentry *dentry) { } +static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) +{ + return -EOPNOTSUPP; +} + +static inline void security_release_secctx(char *secdata, u32 seclen) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_SECURITY */ #ifdef CONFIG_SECURITY_NETWORK @@ -2840,10 +2871,9 @@ static inline int security_socket_getpeersec_stream(struct socket *sock, char __ return security_ops->socket_getpeersec_stream(sock, optval, optlen, len); } -static inline int security_socket_getpeersec_dgram(struct sk_buff *skb, char **secdata, - u32 *seclen) +static inline int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { - return security_ops->socket_getpeersec_dgram(skb, secdata, seclen); + return security_ops->socket_getpeersec_dgram(sock, skb, secid); } static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority) @@ -2968,8 +2998,7 @@ static inline int security_socket_getpeersec_stream(struct socket *sock, char __ return -ENOPROTOOPT; } -static inline int security_socket_getpeersec_dgram(struct sk_buff *skb, char **secdata, - u32 *seclen) +static inline int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { return -ENOPROTOOPT; } diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 2fec827c8801..c0398f5a8cb9 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -54,15 +54,13 @@ struct unix_skb_parms { struct ucred creds; /* Skb credentials */ struct scm_fp_list *fp; /* Passed files */ #ifdef CONFIG_SECURITY_NETWORK - char *secdata; /* Security context */ - u32 seclen; /* Security length */ + u32 secid; /* Security ID */ #endif }; #define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb)) #define UNIXCREDS(skb) (&UNIXCB((skb)).creds) -#define UNIXSECDATA(skb) (&UNIXCB((skb)).secdata) -#define UNIXSECLEN(skb) (&UNIXCB((skb)).seclen) +#define UNIXSID(skb) (&UNIXCB((skb)).secid) #define unix_state_rlock(s) spin_lock(&unix_sk(s)->lock) #define unix_state_runlock(s) spin_unlock(&unix_sk(s)->lock) diff --git a/include/net/scm.h b/include/net/scm.h index 02daa097cdcd..5637d5e22d5f 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -3,6 +3,7 @@ #include #include +#include /* Well, we should have at least one descriptor open * to accept passed FDs 8) @@ -20,8 +21,7 @@ struct scm_cookie struct ucred creds; /* Skb credentials */ struct scm_fp_list *fp; /* Passed files */ #ifdef CONFIG_SECURITY_NETWORK - char *secdata; /* Security context */ - u32 seclen; /* Security length */ + u32 secid; /* Passed security ID */ #endif unsigned long seq; /* Connection seqno */ }; @@ -32,6 +32,16 @@ extern int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie extern void __scm_destroy(struct scm_cookie *scm); extern struct scm_fp_list * scm_fp_dup(struct scm_fp_list *fpl); +#ifdef CONFIG_SECURITY_NETWORK +static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) +{ + security_socket_getpeersec_dgram(sock, NULL, &scm->secid); +} +#else +static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) +{ } +#endif /* CONFIG_SECURITY_NETWORK */ + static __inline__ void scm_destroy(struct scm_cookie *scm) { if (scm && scm->fp) @@ -47,6 +57,7 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, scm->creds.pid = p->tgid; scm->fp = NULL; scm->seq = 0; + unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; return __scm_send(sock, msg, scm); @@ -55,8 +66,18 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, #ifdef CONFIG_SECURITY_NETWORK static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { - if (test_bit(SOCK_PASSSEC, &sock->flags) && scm->secdata != NULL) - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, scm->seclen, scm->secdata); + char *secdata; + u32 seclen; + int err; + + if (test_bit(SOCK_PASSSEC, &sock->flags)) { + err = security_secid_to_secctx(scm->secid, &secdata, &seclen); + + if (!err) { + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata); + security_release_secctx(secdata, seclen); + } + } } #else static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 84f43a3c9098..2d05c4133d3e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -112,14 +112,19 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) { char *secdata; - u32 seclen; + u32 seclen, secid; int err; - err = security_socket_getpeersec_dgram(skb, &secdata, &seclen); + err = security_socket_getpeersec_dgram(NULL, skb, &secid); + if (err) + return; + + err = security_secid_to_secctx(secid, &secdata, &seclen); if (err) return; put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata); + security_release_secctx(secdata, seclen); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6f2909279268..de6ec519272e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -128,23 +128,17 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0); #define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE) #ifdef CONFIG_SECURITY_NETWORK -static void unix_get_peersec_dgram(struct sk_buff *skb) +static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - int err; - - err = security_socket_getpeersec_dgram(skb, UNIXSECDATA(skb), - UNIXSECLEN(skb)); - if (err) - *(UNIXSECDATA(skb)) = NULL; + memcpy(UNIXSID(skb), &scm->secid, sizeof(u32)); } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - scm->secdata = *UNIXSECDATA(skb); - scm->seclen = *UNIXSECLEN(skb); + scm->secid = *UNIXSID(skb); } #else -static inline void unix_get_peersec_dgram(struct sk_buff *skb) +static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) @@ -1322,8 +1316,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); if (siocb->scm->fp) unix_attach_fds(siocb->scm, skb); - - unix_get_peersec_dgram(skb); + unix_get_secdata(siocb->scm, skb); skb->h.raw = skb->data; err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); diff --git a/security/dummy.c b/security/dummy.c index bbbfda70e131..58c6d399c844 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -791,8 +791,7 @@ static int dummy_socket_getpeersec_stream(struct socket *sock, char __user *optv return -ENOPROTOOPT; } -static int dummy_socket_getpeersec_dgram(struct sk_buff *skb, char **secdata, - u32 *seclen) +static int dummy_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { return -ENOPROTOOPT; } @@ -876,6 +875,15 @@ static int dummy_setprocattr(struct task_struct *p, char *name, void *value, siz return -EINVAL; } +static int dummy_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) +{ + return -EOPNOTSUPP; +} + +static void dummy_release_secctx(char *secdata, u32 seclen) +{ +} + #ifdef CONFIG_KEYS static inline int dummy_key_alloc(struct key *key, struct task_struct *ctx, unsigned long flags) @@ -1028,6 +1036,8 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, d_instantiate); set_to_dummy_if_null(ops, getprocattr); set_to_dummy_if_null(ops, setprocattr); + set_to_dummy_if_null(ops, secid_to_secctx); + set_to_dummy_if_null(ops, release_secctx); #ifdef CONFIG_SECURITY_NETWORK set_to_dummy_if_null(ops, unix_stream_connect); set_to_dummy_if_null(ops, unix_may_send); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a91c961ba38b..5d1b8c733199 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3524,25 +3524,21 @@ out: return err; } -static int selinux_socket_getpeersec_dgram(struct sk_buff *skb, char **secdata, u32 *seclen) +static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { + u32 peer_secid = SECSID_NULL; int err = 0; - u32 peer_sid; - if (skb->sk->sk_family == PF_UNIX) - selinux_get_inode_sid(SOCK_INODE(skb->sk->sk_socket), - &peer_sid); - else - peer_sid = selinux_socket_getpeer_dgram(skb); + if (sock && (sock->sk->sk_family == PF_UNIX)) + selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); + else if (skb) + peer_secid = selinux_socket_getpeer_dgram(skb); - if (peer_sid == SECSID_NULL) - return -EINVAL; + if (peer_secid == SECSID_NULL) + err = -EINVAL; + *secid = peer_secid; - err = security_sid_to_context(peer_sid, secdata, seclen); - if (err) - return err; - - return 0; + return err; } static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) @@ -4407,6 +4403,17 @@ static int selinux_setprocattr(struct task_struct *p, return size; } +static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) +{ + return security_sid_to_context(secid, secdata, seclen); +} + +static void selinux_release_secctx(char *secdata, u32 seclen) +{ + if (secdata) + kfree(secdata); +} + #ifdef CONFIG_KEYS static int selinux_key_alloc(struct key *k, struct task_struct *tsk, @@ -4587,6 +4594,9 @@ static struct security_operations selinux_ops = { .getprocattr = selinux_getprocattr, .setprocattr = selinux_setprocattr, + .secid_to_secctx = selinux_secid_to_secctx, + .release_secctx = selinux_release_secctx, + .unix_stream_connect = selinux_socket_unix_stream_connect, .unix_may_send = selinux_socket_unix_may_send, From 9bbf28a1ff7b9d4e7df57829c25638721984277b Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Wed, 2 Aug 2006 14:14:44 -0700 Subject: [PATCH 29/32] [DECNET]: Fix for routing bug This patch fixes a bug in the DECnet routing code where we were selecting a loopback device in preference to an outward facing device even when the destination was known non-local. This patch should fix the problem. Signed-off-by: Patrick Caulfield Signed-off-by: Steven Whitehouse Signed-off-by: David S. Miller --- net/decnet/dn_route.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 1355614ec11b..743e9fcf7c5a 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -925,8 +925,13 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old for(dev_out = dev_base; dev_out; dev_out = dev_out->next) { if (!dev_out->dn_ptr) continue; - if (dn_dev_islocal(dev_out, oldflp->fld_src)) - break; + if (!dn_dev_islocal(dev_out, oldflp->fld_src)) + continue; + if ((dev_out->flags & IFF_LOOPBACK) && + oldflp->fld_dst && + !dn_dev_islocal(dev_out, oldflp->fld_dst)) + continue; + break; } read_unlock(&dev_base_lock); if (dev_out == NULL) From e6eb307d48c81d688804f8b39a0a3ddde3cd3458 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Wed, 2 Aug 2006 14:21:19 -0700 Subject: [PATCH 30/32] [I/OAT]: Remove CPU hotplug lock from net_dma_rebalance Remove the lock_cpu_hotplug()/unlock_cpu_hotplug() calls from net_dma_rebalance The lock_cpu_hotplug()/unlock_cpu_hotplug() sequence in net_dma_rebalance is both incorrect (as pointed out by David Miller) because lock_cpu_hotplug() may sleep while the net_dma_event_lock spinlock is held, and unnecessary (as pointed out by Andrew Morton) as spin_lock() disables preemption which protects from CPU hotplug events. Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/core/dev.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 5b630cece707..f25d7ecaf035 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3419,12 +3419,9 @@ static void net_dma_rebalance(void) unsigned int cpu, i, n; struct dma_chan *chan; - lock_cpu_hotplug(); - if (net_dma_count == 0) { for_each_online_cpu(cpu) rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL); - unlock_cpu_hotplug(); return; } @@ -3444,8 +3441,6 @@ static void net_dma_rebalance(void) i++; } rcu_read_unlock(); - - unlock_cpu_hotplug(); } /** From 95ce568812822931991a24147987c5c75c0ac5b0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 2 Aug 2006 14:37:06 -0700 Subject: [PATCH 31/32] [SECURITY]: Fix build with CONFIG_SECURITY disabled. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit include/linux/security.h: In function ‘security_release_secctx’: include/linux/security.h:2757: warning: ‘return’ with a value, in function returning void Signed-off-by: David S. Miller --- include/linux/security.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/security.h b/include/linux/security.h index aa5b8043dc38..6bc2aad494ff 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2754,7 +2754,6 @@ static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *secle static inline void security_release_secctx(char *secdata, u32 seclen) { - return -EOPNOTSUPP; } #endif /* CONFIG_SECURITY */ From 29bbd72d6ee1dbf2d9f00d022f8e999aa528fb3a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 2 Aug 2006 15:02:31 -0700 Subject: [PATCH 32/32] [NET]: Fix more per-cpu typos Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/net/netdma.h | 2 +- net/core/dev.c | 4 ++-- net/ipv4/tcp.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/netdma.h b/include/net/netdma.h index ceae5ee85c04..7f53cd1d8b1e 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -29,7 +29,7 @@ static inline struct dma_chan *get_softnet_dma(void) { struct dma_chan *chan; rcu_read_lock(); - chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma)); + chan = rcu_dereference(__get_cpu_var(softnet_data).net_dma); if (chan) dma_chan_get(chan); rcu_read_unlock(); diff --git a/net/core/dev.c b/net/core/dev.c index f25d7ecaf035..d95e2626d944 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3421,7 +3421,7 @@ static void net_dma_rebalance(void) if (net_dma_count == 0) { for_each_online_cpu(cpu) - rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL); + rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL); return; } @@ -3434,7 +3434,7 @@ static void net_dma_rebalance(void) + (i < (num_online_cpus() % net_dma_count) ? 1 : 0)); while(n) { - per_cpu(softnet_data.net_dma, cpu) = chan; + per_cpu(softnet_data, cpu).net_dma = chan; cpu = next_cpu(cpu, cpu_online_map); n--; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7b621e44b124..934396bb1376 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1132,7 +1132,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, tp->ucopy.dma_chan = NULL; preempt_disable(); if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) { + !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) { preempt_enable_no_resched(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else