diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index c6c0595d267b..997ef25189fd 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -563,7 +563,7 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) static int vrf_rt6_create(struct net_device *dev) { - int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE; + int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct fib6_table *rt6i_table; @@ -583,8 +583,6 @@ static int vrf_rt6_create(struct net_device *dev) if (!rt6) goto out; - dst_hold(&rt6->dst); - rt6->rt6i_table = rt6i_table; rt6->dst.output = vrf_output6; @@ -597,8 +595,6 @@ static int vrf_rt6_create(struct net_device *dev) goto out; } - dst_hold(&rt6_local->dst); - rt6_local->rt6i_idev = in6_dev_get(dev); rt6_local->rt6i_flags = RTF_UP | RTF_NONEXTHOP | RTF_LOCAL; rt6_local->rt6i_table = rt6i_table; diff --git a/include/net/dst.h b/include/net/dst.h index 1969008783d8..f73611ec4017 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -51,13 +51,11 @@ struct dst_entry { #define DST_HOST 0x0001 #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 -#define DST_NOHASH 0x0008 -#define DST_NOCACHE 0x0010 -#define DST_NOCOUNT 0x0020 -#define DST_FAKE_RTABLE 0x0040 -#define DST_XFRM_TUNNEL 0x0080 -#define DST_XFRM_QUEUE 0x0100 -#define DST_METADATA 0x0200 +#define DST_NOCOUNT 0x0008 +#define DST_FAKE_RTABLE 0x0010 +#define DST_XFRM_TUNNEL 0x0020 +#define DST_XFRM_QUEUE 0x0040 +#define DST_METADATA 0x0080 short error; @@ -253,7 +251,7 @@ static inline void dst_hold(struct dst_entry *dst) * __pad_to_align_refcnt declaration in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); - atomic_inc(&dst->__refcnt); + WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); } static inline void dst_use(struct dst_entry *dst, unsigned long time) @@ -278,6 +276,8 @@ static inline struct dst_entry *dst_clone(struct dst_entry *dst) void dst_release(struct dst_entry *dst); +void dst_release_immediate(struct dst_entry *dst); + static inline void refdst_drop(unsigned long refdst) { if (!(refdst & SKB_DST_NOREF)) @@ -334,10 +334,7 @@ static inline void skb_dst_force(struct sk_buff *skb) */ static inline bool dst_hold_safe(struct dst_entry *dst) { - if (dst->flags & DST_NOCACHE) - return atomic_inc_not_zero(&dst->__refcnt); - dst_hold(dst); - return true; + return atomic_inc_not_zero(&dst->__refcnt); } /** @@ -423,26 +420,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); -void __dst_free(struct dst_entry *dst); struct dst_entry *dst_destroy(struct dst_entry *dst); - -static inline void dst_free(struct dst_entry *dst) -{ - if (dst->obsolete > 0) - return; - if (!atomic_read(&dst->__refcnt)) { - dst = dst_destroy(dst); - if (!dst) - return; - } - __dst_free(dst); -} - -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} +void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) { @@ -505,8 +484,6 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) return dst; } -void dst_subsys_init(void); - /* Flags for xfrm_lookup flags argument. */ enum { XFRM_LOOKUP_ICMP = 1 << 0, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index aa50e2e6fa2a..1a88008cc6f5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -170,7 +170,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) static inline u32 rt6_get_cookie(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_PCPU || - (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) rt = (struct rt6_info *)(rt->dst.from); return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f3da9dd2a8db..0fbf73dd531a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -116,7 +116,6 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, int flags); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); -int icmp6_dst_gc(void); void fib6_force_start_gc(struct net *net); diff --git a/include/net/route.h b/include/net/route.h index 08e689f23365..cb0a76d9dde1 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -190,7 +190,9 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, rcu_read_lock(); err = ip_route_input_noref(skb, dst, src, tos, devin); if (!err) - skb_dst_force(skb); + skb_dst_force_safe(skb); + if (!skb_dst(skb)) + err = -EINVAL; rcu_read_unlock(); return err; diff --git a/net/core/dev.c b/net/core/dev.c index b8d6dd9e8b5c..5d1830b8d2cf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8681,7 +8681,6 @@ static int __init net_dev_init(void) rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); - dst_subsys_init(); rc = 0; out: return rc; diff --git a/net/core/dst.c b/net/core/dst.c index 13ba4a090c41..f851adb9ec9b 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -42,108 +42,6 @@ * to dirty as few cache lines as possible in __dst_free(). * As this is not a very strong hint, we dont force an alignment on SMP. */ -static struct { - spinlock_t lock; - struct dst_entry *list; - unsigned long timer_inc; - unsigned long timer_expires; -} dst_garbage = { - .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), - .timer_inc = DST_GC_MAX, -}; -static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry *dst); - -static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); - -static DEFINE_MUTEX(dst_gc_mutex); -/* - * long lived entries are maintained in this list, guarded by dst_gc_mutex - */ -static struct dst_entry *dst_busy_list; - -static void dst_gc_task(struct work_struct *work) -{ - int delayed = 0; - int work_performed = 0; - unsigned long expires = ~0L; - struct dst_entry *dst, *next, head; - struct dst_entry *last = &head; - - mutex_lock(&dst_gc_mutex); - next = dst_busy_list; - -loop: - while ((dst = next) != NULL) { - next = dst->next; - prefetch(&next->next); - cond_resched(); - if (likely(atomic_read(&dst->__refcnt))) { - last->next = dst; - last = dst; - delayed++; - continue; - } - work_performed++; - - dst = dst_destroy(dst); - if (dst) { - /* NOHASH and still referenced. Unless it is already - * on gc list, invalidate it and add to gc list. - * - * Note: this is temporary. Actually, NOHASH dst's - * must be obsoleted when parent is obsoleted. - * But we do not have state "obsoleted, but - * referenced by parent", so it is right. - */ - if (dst->obsolete > 0) - continue; - - ___dst_free(dst); - dst->next = next; - next = dst; - } - } - - spin_lock_bh(&dst_garbage.lock); - next = dst_garbage.list; - if (next) { - dst_garbage.list = NULL; - spin_unlock_bh(&dst_garbage.lock); - goto loop; - } - last->next = NULL; - dst_busy_list = head.next; - if (!dst_busy_list) - dst_garbage.timer_inc = DST_GC_MAX; - else { - /* - * if we freed less than 1/10 of delayed entries, - * we can sleep longer. - */ - if (work_performed <= delayed/10) { - dst_garbage.timer_expires += dst_garbage.timer_inc; - if (dst_garbage.timer_expires > DST_GC_MAX) - dst_garbage.timer_expires = DST_GC_MAX; - dst_garbage.timer_inc += DST_GC_INC; - } else { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - } - expires = dst_garbage.timer_expires; - /* - * if the next desired timer is more than 4 seconds in the - * future then round the timer to whole seconds - */ - if (expires > 4*HZ) - expires = round_jiffies_relative(expires); - schedule_delayed_work(&dst_gc_work, expires); - } - - spin_unlock_bh(&dst_garbage.lock); - mutex_unlock(&dst_gc_mutex); -} - int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); @@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, } EXPORT_SYMBOL(dst_alloc); -static void ___dst_free(struct dst_entry *dst) -{ - /* The first case (dev==NULL) is required, when - protocol module is unloaded. - */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { - dst->input = dst_discard; - dst->output = dst_discard_out; - } - dst->obsolete = DST_OBSOLETE_DEAD; -} - -void __dst_free(struct dst_entry *dst) -{ - spin_lock_bh(&dst_garbage.lock); - ___dst_free(dst); - dst->next = dst_garbage.list; - dst_garbage.list = dst; - if (dst_garbage.timer_inc > DST_GC_INC) { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, - dst_garbage.timer_expires); - } - spin_unlock_bh(&dst_garbage.lock); -} -EXPORT_SYMBOL(__dst_free); - struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child; smp_rmb(); -again: child = dst->child; if (!(dst->flags & DST_NOCOUNT)) @@ -269,20 +138,8 @@ again: kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; - if (dst) { - int nohash = dst->flags & DST_NOHASH; - - if (atomic_dec_and_test(&dst->__refcnt)) { - /* We were real parent of this dst, so kill child. */ - if (nohash) - goto again; - } else { - /* Child is still referenced, return it for freeing. */ - if (nohash) - return dst; - /* Child is still in his hash table */ - } - } + if (dst) + dst_release_immediate(dst); return NULL; } EXPORT_SYMBOL(dst_destroy); @@ -292,26 +149,62 @@ static void dst_destroy_rcu(struct rcu_head *head) struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst = dst_destroy(dst); - if (dst) - __dst_free(dst); } +/* Operations to mark dst as DEAD and clean up the net device referenced + * by dst: + * 1. put the dst under loopback interface and discard all tx/rx packets + * on this route. + * 2. release the net_device + * This function should be called when removing routes from the fib tree + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to + * make the next dst_ops->check() fail. + */ +void dst_dev_put(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + + dst->obsolete = DST_OBSOLETE_DEAD; + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, true); + dst->input = dst_discard; + dst->output = dst_discard_out; + dst->dev = dev_net(dst->dev)->loopback_dev; + dev_hold(dst->dev); + dev_put(dev); +} +EXPORT_SYMBOL(dst_dev_put); + void dst_release(struct dst_entry *dst) { if (dst) { int newrefcnt; - unsigned short nocache = dst->flags & DST_NOCACHE; newrefcnt = atomic_dec_return(&dst->__refcnt); if (unlikely(newrefcnt < 0)) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); - if (!newrefcnt && unlikely(nocache)) + if (!newrefcnt) call_rcu(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); +void dst_release_immediate(struct dst_entry *dst) +{ + if (dst) { + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + if (unlikely(newrefcnt < 0)) + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", + __func__, dst, newrefcnt); + if (!newrefcnt) + dst_destroy(dst); + } +} +EXPORT_SYMBOL(dst_release_immediate); + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); @@ -377,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) dst = &md_dst->dst; dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, - DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + DST_METADATA | DST_NOCOUNT); dst->input = dst_md_discard; dst->output = dst_md_discard_out; @@ -423,86 +316,3 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); - -/* Dirty hack. We did it in 2.2 (in __dst_free), - * we have _very_ good reasons not to repeat - * this mistake in 2.3, but we have no choice - * now. _It_ _is_ _explicit_ _deliberate_ - * _race_ _condition_. - * - * Commented and originally written by Alexey. - */ -static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) -{ - if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, unregister); - - if (dev != dst->dev) - return; - - if (!unregister) { - dst->input = dst_discard; - dst->output = dst_discard_out; - } else { - dst->dev = dev_net(dst->dev)->loopback_dev; - dev_hold(dst->dev); - dev_put(dev); - } -} - -static int dst_dev_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dst_entry *dst, *last = NULL; - - switch (event) { - case NETDEV_UNREGISTER_FINAL: - case NETDEV_DOWN: - mutex_lock(&dst_gc_mutex); - for (dst = dst_busy_list; dst; dst = dst->next) { - last = dst; - dst_ifdown(dst, dev, event != NETDEV_DOWN); - } - - spin_lock_bh(&dst_garbage.lock); - dst = dst_garbage.list; - dst_garbage.list = NULL; - /* The code in dst_ifdown places a hold on the loopback device. - * If the gc entry processing is set to expire after a lengthy - * interval, this hold can cause netdev_wait_allrefs() to hang - * out and wait for a long time -- until the the loopback - * interface is released. If we're really unlucky, it'll emit - * pr_emerg messages to console too. Reset the interval here, - * so dst cleanups occur in a more timely fashion. - */ - if (dst_garbage.timer_inc > DST_GC_INC) { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, - dst_garbage.timer_expires); - } - spin_unlock_bh(&dst_garbage.lock); - - if (last) - last->next = dst; - else - dst_busy_list = dst; - for (; dst; dst = dst->next) - dst_ifdown(dst, dev, event != NETDEV_DOWN); - mutex_unlock(&dst_gc_mutex); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block dst_dev_notifier = { - .notifier_call = dst_dev_event, - .priority = -10, /* must be called after other network notifiers */ -}; - -void __init dst_subsys_init(void) -{ - register_netdevice_notifier(&dst_dev_notifier); -} diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 6f95612b4d32..5d17d843ac86 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -183,11 +183,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } -static inline void dnrt_free(struct dn_route *rt) -{ - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - static void dn_dst_check_expire(unsigned long dummy) { int i; @@ -202,14 +197,15 @@ static void dn_dst_check_expire(unsigned long dummy) spin_lock(&dn_rt_hash_table[i].lock); while ((rt = rcu_dereference_protected(*rtp, lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { - if (atomic_read(&rt->dst.__refcnt) || - (now - rt->dst.lastuse) < expire) { + if (atomic_read(&rt->dst.__refcnt) > 1 || + (now - rt->dst.lastuse) < expire) { rtp = &rt->dst.dn_next; continue; } *rtp = rt->dst.dn_next; rt->dst.dn_next = NULL; - dnrt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } spin_unlock(&dn_rt_hash_table[i].lock); @@ -235,14 +231,15 @@ static int dn_dst_gc(struct dst_ops *ops) while ((rt = rcu_dereference_protected(*rtp, lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { - if (atomic_read(&rt->dst.__refcnt) || - (now - rt->dst.lastuse) < expire) { + if (atomic_read(&rt->dst.__refcnt) > 1 || + (now - rt->dst.lastuse) < expire) { rtp = &rt->dst.dn_next; continue; } *rtp = rt->dst.dn_next; rt->dst.dn_next = NULL; - dnrt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); break; } spin_unlock_bh(&dn_rt_hash_table[i].lock); @@ -344,7 +341,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou dst_use(&rth->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); *rp = rth; return 0; } @@ -374,7 +371,8 @@ static void dn_run_flush(unsigned long dummy) for(; rt; rt = next) { next = rcu_dereference_raw(rt->dst.dn_next); RCU_INIT_POINTER(rt->dst.dn_next, NULL); - dnrt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } nothing_to_declare: @@ -1181,7 +1179,7 @@ make_route: if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST); + rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST); if (rt == NULL) goto e_nobufs; @@ -1215,6 +1213,7 @@ make_route: goto e_neighbour; hash = dn_hash(rt->fld.saddr, rt->fld.daddr); + /* dn_insert_route() increments dst->__refcnt */ dn_insert_route(rt, hash, (struct dn_route **)pprt); done: @@ -1237,7 +1236,7 @@ e_nobufs: err = -ENOBUFS; goto done; e_neighbour: - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); goto e_nobufs; } @@ -1445,7 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb) } make_route: - rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST); + rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST); if (rt == NULL) goto e_nobufs; @@ -1491,6 +1490,7 @@ make_route: goto e_neighbour; hash = dn_hash(rt->fld.saddr, rt->fld.daddr); + /* dn_insert_route() increments dst->__refcnt */ dn_insert_route(rt, hash, &rt); skb_dst_set(skb, &rt->dst); @@ -1514,7 +1514,7 @@ e_nobufs: goto done; e_neighbour: - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); goto done; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2157dc08c407..ff47ea1408fe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -152,7 +152,8 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) * free_fib_info_rcu() */ - dst_free(&rt->dst); + dst_dev_put(&rt->dst); + dst_release_immediate(&rt->dst); } static void free_nh_exceptions(struct fib_nh *nh) @@ -194,8 +195,10 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) struct rtable *rt; rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); - if (rt) - dst_free(&rt->dst); + if (rt) { + dst_dev_put(&rt->dst); + dst_release_immediate(&rt->dst); + } } free_percpu(rtp); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9b38cf18144e..c816cd53f7fc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -589,11 +589,6 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, build_sk_flow_key(fl4, sk); } -static inline void rt_free(struct rtable *rt) -{ - call_rcu(&rt->dst.rcu_head, dst_rcu_free); -} - static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) @@ -603,12 +598,14 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); - rt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); - rt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } } @@ -1302,7 +1299,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, - __be32 daddr) + __be32 daddr, const bool do_cache) { bool ret = false; @@ -1331,10 +1328,13 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, if (!rt->rt_gateway) rt->rt_gateway = daddr; - if (!(rt->dst.flags & DST_NOCACHE)) { + if (do_cache) { + dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); - if (orig) - rt_free(orig); + if (orig) { + dst_dev_put(&orig->dst); + dst_release(&orig->dst); + } ret = true; } @@ -1357,12 +1357,20 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) } orig = *p; + /* hold dst before doing cmpxchg() to avoid race condition + * on this dst + */ + dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { - if (orig) - rt_free(orig); - } else + if (orig) { + dst_dev_put(&orig->dst); + dst_release(&orig->dst); + } + } else { + dst_release(&rt->dst); ret = false; + } return ret; } @@ -1433,7 +1441,8 @@ static bool rt_cache_valid(const struct rtable *rt) static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, - struct fib_info *fi, u16 type, u32 itag) + struct fib_info *fi, u16 type, u32 itag, + const bool do_cache) { bool cached = false; @@ -1454,8 +1463,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #endif rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) - cached = rt_bind_exception(rt, fnhe, daddr); - else if (!(rt->dst.flags & DST_NOCACHE)) + cached = rt_bind_exception(rt, fnhe, daddr, do_cache); + else if (do_cache) cached = rt_cache_route(nh, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or @@ -1463,7 +1472,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ - rt->dst.flags |= DST_NOCACHE; if (!rt->rt_gateway) rt->rt_gateway = daddr; rt_add_uncached_list(rt); @@ -1486,7 +1494,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (will_cache ? 0 : DST_HOST) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); @@ -1730,7 +1738,8 @@ rt_cache: rth->dst.input = ip_forward; - rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, + do_cache); set_lwt_redirect(rth); skb_dst_set(skb, &rth->dst); out: @@ -2018,10 +2027,8 @@ local_input: rth->dst.input = lwtunnel_input; } - if (unlikely(!rt_cache_route(nh, rth))) { - rth->dst.flags |= DST_NOCACHE; + if (unlikely(!rt_cache_route(nh, rth))) rt_add_uncached_list(rth); - } } skb_dst_set(skb, &rth->dst); err = 0; @@ -2217,10 +2224,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth = rcu_dereference(*prth); rt_cache: - if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); + if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; - } } add: @@ -2254,7 +2259,7 @@ add: #endif } - rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); set_lwt_redirect(rth); return rth; @@ -2504,7 +2509,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->input = dst_discard; new->output = dst_discard_out; - new->dev = ort->dst.dev; + new->dev = net->loopback_dev; if (new->dev) dev_hold(new->dev); @@ -2519,7 +2524,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - dst_free(new); } dst_release(dst_orig); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2bc638c48b86..f3450f092d71 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1977,9 +1977,10 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; - dst_hold(dst); - old = xchg(&sk->sk_rx_dst, dst); - dst_release(old); + if (dst_hold_safe(dst)) { + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); + } } /* @@ -2303,13 +2304,11 @@ void udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { - /* DST_NOCACHE can not be used without taking a reference */ - if (dst->flags & DST_NOCACHE) { - if (likely(atomic_inc_not_zero(&dst->__refcnt))) - skb_dst_set(skb, dst); - } else { - skb_dst_set_noref(skb, dst); - } + /* set noref for now. + * any place which wants to hold dst has to call + * dst_hold_safe() + */ + skb_dst_set_noref(skb, dst); } } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0aa36b093013..2a6397714d70 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5576,8 +5576,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ip6_del_rt(rt); } if (ifp->rt) { - dst_hold(&ifp->rt->dst); - ip6_del_rt(ifp->rt); + if (dst_hold_safe(&ifp->rt->dst)) + ip6_del_rt(ifp->rt); } rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index deea901746c8..4f3e4657f2d6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -153,11 +153,6 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } -static void rt6_rcu_free(struct rt6_info *rt) -{ - call_rcu(&rt->dst.rcu_head, dst_rcu_free); -} - static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -172,7 +167,8 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - rt6_rcu_free(pcpu_rt); + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); *ppcpu_rt = NULL; } } @@ -185,7 +181,8 @@ static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { rt6_free_pcpu(rt); - rt6_rcu_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } } @@ -978,8 +975,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); - if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && - !atomic_read(&rt->dst.__refcnt))) + if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; if (info->nlh) { @@ -1076,7 +1072,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fib6_start_gc(info->nl_net, rt); if (!(rt->rt6i_flags & RTF_CACHE)) fib6_prune_clones(info->nl_net, pn); - rt->dst.flags &= ~DST_NOCACHE; } out: @@ -1101,8 +1096,10 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - if (!(rt->dst.flags & DST_NOCACHE)) - dst_free(&rt->dst); + /* Always release dst as dst->__refcnt is guaranteed + * to be taken before entering this function + */ + dst_release_immediate(&rt->dst); } return err; @@ -1113,8 +1110,10 @@ out: st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); - if (!(rt->dst.flags & DST_NOCACHE)) - dst_free(&rt->dst); + /* Always release dst as dst->__refcnt is guaranteed + * to be taken before entering this function + */ + dst_release_immediate(&rt->dst); return err; #endif } @@ -1783,7 +1782,7 @@ static int fib6_age(struct rt6_info *rt, void *arg) } gc_args->more++; } else if (rt->rt6i_flags & RTF_CACHE) { - if (atomic_read(&rt->dst.__refcnt) == 0 && + if (atomic_read(&rt->dst.__refcnt) == 1 && time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; @@ -1821,8 +1820,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; - - gc_args.more = icmp6_dst_gc(); + gc_args.more = 0; fib6_clean_all(net, fib6_age, &gc_args); now = jiffies; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8b8efb0e55bf..5baa6fab4b97 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -698,8 +698,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); - dst_hold(&rt->dst); - for (;;) { /* Prepare header of the next frame, * before previous one went down. */ @@ -742,7 +740,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGOKS); - ip6_rt_put(rt); return 0; } @@ -750,7 +747,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); - ip6_rt_put(rt); return err; slow_path_clean: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 18fe6e2b88d5..2e4490076061 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -128,7 +128,6 @@ static void rt6_uncached_list_add(struct rt6_info *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); - rt->dst.flags |= DST_NOCACHE; rt->rt6i_uncached_list = ul; spin_lock_bh(&ul->lock); @@ -354,7 +353,7 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 0, DST_OBSOLETE_FORCE_CHK, flags); + 1, DST_OBSOLETE_FORCE_CHK, flags); if (rt) rt6_info_init(rt); @@ -381,7 +380,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, *p = NULL; } } else { - dst_destroy((struct dst_entry *)rt); + dst_release_immediate(&rt->dst); return NULL; } } @@ -932,9 +931,9 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. - It takes new route entry, the addition fails by any reason the - route is freed. In any case, if caller does not hold it, it may - be destroyed. + * It takes new route entry, the addition fails by any reason the + * route is released. + * Caller must hold dst before calling it. */ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, @@ -957,6 +956,8 @@ int ip6_ins_rt(struct rt6_info *rt) struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; struct mx6_config mxc = { .mx = NULL, }; + /* Hold dst to account for the reference from the fib6 tree */ + dst_hold(&rt->dst); return __ip6_ins_rt(rt, &info, &mxc, NULL); } @@ -1049,7 +1050,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) prev = cmpxchg(p, NULL, pcpu_rt); if (prev) { /* If someone did it before us, return prev instead */ - dst_destroy(&pcpu_rt->dst); + dst_release_immediate(&pcpu_rt->dst); pcpu_rt = prev; } } else { @@ -1059,7 +1060,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) * since rt is going away anyway. The next * dst_check() will trigger a re-lookup. */ - dst_destroy(&pcpu_rt->dst); + dst_release_immediate(&pcpu_rt->dst); pcpu_rt = rt; } dst_hold(&pcpu_rt->dst); @@ -1129,12 +1130,15 @@ redo_rt6_select: uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); - if (uncached_rt) + if (uncached_rt) { + /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() + * No need for another dst_hold() + */ rt6_uncached_list_add(uncached_rt); - else + } else { uncached_rt = net->ipv6.ip6_null_entry; - - dst_hold(&uncached_rt->dst); + dst_hold(&uncached_rt->dst); + } trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); return uncached_rt; @@ -1245,9 +1249,11 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; + struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; - rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); + rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, + DST_OBSOLETE_NONE, 0); if (rt) { rt6_info_init(rt); @@ -1257,10 +1263,8 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new->output = dst_discard_out; dst_copy_metrics(new, &ort->dst); - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); + rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; rt->rt6i_metric = 0; @@ -1269,8 +1273,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif - - dst_free(new); } dst_release(dst_orig); @@ -1323,7 +1325,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); if (rt->rt6i_flags & RTF_PCPU || - (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); @@ -1356,8 +1358,8 @@ static void ip6_link_failure(struct sk_buff *skb) rt = (struct rt6_info *) skb_dst(skb); if (rt) { if (rt->rt6i_flags & RTF_CACHE) { - dst_hold(&rt->dst); - ip6_del_rt(rt); + if (dst_hold_safe(&rt->dst)) + ip6_del_rt(rt); } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; } @@ -1421,6 +1423,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, * invalidate the sk->sk_dst_cache. */ ip6_ins_rt(nrt6); + /* Release the reference taken in + * ip6_rt_cache_alloc() + */ + dst_release(&nrt6->dst); } } } @@ -1649,9 +1655,6 @@ out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } -static struct dst_entry *icmp6_dst_gc_list; -static DEFINE_SPINLOCK(icmp6_dst_lock); - struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { @@ -1672,19 +1675,16 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->dst.flags |= DST_HOST; rt->dst.output = ip6_output; - atomic_set(&rt->dst.__refcnt, 1); rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); - spin_lock_bh(&icmp6_dst_lock); - rt->dst.next = icmp6_dst_gc_list; - icmp6_dst_gc_list = &rt->dst; - spin_unlock_bh(&icmp6_dst_lock); - - fib6_force_start_gc(net); + /* Add this dst into uncached_list so that rt6_ifdown() can + * do proper release of the net_device + */ + rt6_uncached_list_add(rt); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); @@ -1692,48 +1692,6 @@ out: return dst; } -int icmp6_dst_gc(void) -{ - struct dst_entry *dst, **pprev; - int more = 0; - - spin_lock_bh(&icmp6_dst_lock); - pprev = &icmp6_dst_gc_list; - - while ((dst = *pprev) != NULL) { - if (!atomic_read(&dst->__refcnt)) { - *pprev = dst->next; - dst_free(dst); - } else { - pprev = &dst->next; - ++more; - } - } - - spin_unlock_bh(&icmp6_dst_lock); - - return more; -} - -static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg), - void *arg) -{ - struct dst_entry *dst, **pprev; - - spin_lock_bh(&icmp6_dst_lock); - pprev = &icmp6_dst_gc_list; - while ((dst = *pprev) != NULL) { - struct rt6_info *rt = (struct rt6_info *) dst; - if (func(rt, arg)) { - *pprev = dst->next; - dst_free(dst); - } else { - pprev = &dst->next; - } - } - spin_unlock_bh(&icmp6_dst_lock); -} - static int ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); @@ -2130,7 +2088,7 @@ out: if (idev) in6_dev_put(idev); if (rt) - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); return ERR_PTR(err); } @@ -2160,7 +2118,7 @@ int ip6_route_add(struct fib6_config *cfg, return err; out: if (rt) - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); return err; } @@ -2171,8 +2129,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry || - rt->dst.flags & DST_NOCACHE) { + if (rt == net->ipv6.ip6_null_entry) { err = -ENOENT; goto out; } @@ -2397,7 +2354,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; if (ip6_ins_rt(nrt)) - goto out; + goto out_release; netevent.old = &rt->dst; netevent.new = &nrt->dst; @@ -2410,6 +2367,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu ip6_del_rt(rt); } +out_release: + /* Release the reference taken in + * ip6_rt_cache_alloc() + */ + dst_release(&nrt->dst); + out: neigh_release(neigh); } @@ -2757,9 +2720,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; rt->rt6i_table = fib6_get_table(net, tb_id); - rt->dst.flags |= DST_NOCACHE; - - atomic_set(&rt->dst.__refcnt, 1); return rt; } @@ -2847,7 +2807,6 @@ void rt6_ifdown(struct net *net, struct net_device *dev) }; fib6_clean_all(net, fib6_ifdown, &adn); - icmp6_clean_all(fib6_ifdown, &adn); if (dev) rt6_uncached_list_flush_dev(net, dev); } @@ -3185,7 +3144,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { - dst_free(&rt->dst); + dst_release_immediate(&rt->dst); goto cleanup; } @@ -3249,7 +3208,7 @@ add_errout: cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) - dst_free(&nh->rt6_info->dst); + dst_release_immediate(&nh->rt6_info->dst); kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2e9b52bded2d..2b33847bf931 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -920,12 +920,11 @@ static void udp_v6_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); if (dst) { - if (dst->flags & DST_NOCACHE) { - if (likely(atomic_inc_not_zero(&dst->__refcnt))) - skb_dst_set(skb, dst); - } else { - skb_dst_set_noref(skb, dst); - } + /* set noref for now. + * any place which wants to hold dst has to call + * dst_hold_safe() + */ + skb_dst_set_noref(skb, dst); } } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ed4e52d95172..af8e38f47b5b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1590,7 +1590,9 @@ static void xfrm_bundle_flo_delete(struct flow_cache_object *flo) struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); struct dst_entry *dst = &xdst->u.dst; - dst_free(dst); + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ + dst->obsolete = DST_OBSOLETE_DEAD; + dst_release_immediate(dst); } static const struct flow_cache_ops xfrm_bundle_fc_ops = { @@ -1620,7 +1622,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) default: BUG(); } - xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0); + xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { struct dst_entry *dst = &xdst->u.dst; @@ -1723,10 +1725,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, if (!dst_prev) dst0 = dst1; - else { - dst_prev->child = dst_clone(dst1); - dst1->flags |= DST_NOHASH; - } + else + /* Ref count is taken during xfrm_alloc_dst() + * No need to do dst_clone() on dst1 + */ + dst_prev->child = dst1; xdst->route = dst; dst_copy_metrics(dst1, dst); @@ -1792,7 +1795,7 @@ put_states: xfrm_state_put(xfrm[i]); free_dst: if (dst0) - dst_free(dst0); + dst_release_immediate(dst0); dst0 = ERR_PTR(err); goto out; } @@ -2073,7 +2076,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, pol_dead |= pols[i]->walk.dead; } if (pol_dead) { - dst_free(&xdst->u.dst); + /* Mark DST_OBSOLETE_DEAD to fail the next + * xfrm_dst_check() + */ + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + dst_release_immediate(&xdst->u.dst); xdst = NULL; num_pols = 0; num_xfrms = 0; @@ -2120,11 +2127,12 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (xdst) { /* The policies were stolen for newly generated bundle */ xdst->num_pols = 0; - dst_free(&xdst->u.dst); + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + dst_release_immediate(&xdst->u.dst); } - /* Flow cache does not have reference, it dst_free()'s, - * but we do need to return one reference for original caller */ + /* We do need to return one reference for original caller */ dst_hold(&new_xdst->u.dst); return &new_xdst->flo; @@ -2147,9 +2155,11 @@ make_dummy_bundle: inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: - if (xdst != NULL) - dst_free(&xdst->u.dst); - else + if (xdst != NULL) { + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + dst_release_immediate(&xdst->u.dst); + } else xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } @@ -2221,7 +2231,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, } dst_hold(&xdst->u.dst); - xdst->u.dst.flags |= DST_NOCACHE; route = xdst->route; } } @@ -2636,10 +2645,12 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) * notice. That's what we are validating here via the * stale_bundle() check. * - * When a policy's bundle is pruned, we dst_free() the XFRM - * dst which causes it's ->obsolete field to be set to - * DST_OBSOLETE_DEAD. If an XFRM dst has been pruned like - * this, we want to force a new route lookup. + * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will + * be marked on it. + * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will + * be marked on it. + * Both will force stable_bundle() to fail on any xdst bundle with + * this dst linked in it. */ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst;