Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'net-remove-dst-garbage-collector-logic'

Wei Wang says:

====================
remove dst garbage collector logic

The current mechanism of dst release is a bit complicated. It is because
the users of dst get divided into 2 situations:
1. Most users take the reference count when using a dst and release the
reference count when done.
2. Exceptional users like IPv4/IPv6/decnet/xfrm routing code do not take a
reference count when referencing a dst, due to some historic reasons.

Due to those exceptional use cases in 2, reference count being 0 is not
adequate evidence to indicate that no user is using this dst. So users in 1
can't free the dst simply based on reference count being 0 because users in
2 might still hold reference to it.
Instead, a dst garbage list is needed to hold the dst entries that already
get removed by the users in 2 but are still held by users in 1. And a periodic
garbage collector task is run to check all the dst entries in the list to see
if the users in 1 have released the reference to those dst entries.
If so, the dst is now ready to be freed.

This logic introduces unnecessary complications in the dst code which makes it
hard to understand and to debug.

In order to get rid of the whole dst garbage collector (gc) and make the dst
code more unified and simplified, we can make the users in 2 also take reference
count on the dst and release it properly when done.
This way, dst can be safely freed once the refcount drops to 0 and no gc
thread is needed anymore.

This patch series' target is to completely get rid of dst gc logic and free
dst based on reference count only.
Patch 1-3 are preparation patches to do some cleanup/improvement on the existing
code to make later work easier.
Patch 4-21 are real implementations.
In these patches, a temporary flag DST_NOGC is used to help transition
those exceptional users one by one. Once every component is transitioned,
this temporary flag is removed.
By the end of this patch series, all dst are refcounted when being used
and released when done. And dst will be freed when its refcount drops to 0.
No dst gc task is running anymore.

Note: This patch series depends on the decnet fix that was sent right before:
"decnet: always not take dst->__refcnt when inserting dst into hash table"

v2:
add curly braces in udp_v4/6_early_demux() in patch 02
add EXPORT_SYMBOL() for dst_dev_put() in patch 05
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+218 -466
+1 -5
drivers/net/vrf.c
··· 563 563 564 564 static int vrf_rt6_create(struct net_device *dev) 565 565 { 566 - int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE; 566 + int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM; 567 567 struct net_vrf *vrf = netdev_priv(dev); 568 568 struct net *net = dev_net(dev); 569 569 struct fib6_table *rt6i_table; ··· 583 583 if (!rt6) 584 584 goto out; 585 585 586 - dst_hold(&rt6->dst); 587 - 588 586 rt6->rt6i_table = rt6i_table; 589 587 rt6->dst.output = vrf_output6; 590 588 ··· 594 596 dst_release(&rt6->dst); 595 597 goto out; 596 598 } 597 - 598 - dst_hold(&rt6_local->dst); 599 599 600 600 rt6_local->rt6i_idev = in6_dev_get(dev); 601 601 rt6_local->rt6i_flags = RTF_UP | RTF_NONEXTHOP | RTF_LOCAL;
+10 -33
include/net/dst.h
··· 51 51 #define DST_HOST 0x0001 52 52 #define DST_NOXFRM 0x0002 53 53 #define DST_NOPOLICY 0x0004 54 - #define DST_NOHASH 0x0008 55 - #define DST_NOCACHE 0x0010 56 - #define DST_NOCOUNT 0x0020 57 - #define DST_FAKE_RTABLE 0x0040 58 - #define DST_XFRM_TUNNEL 0x0080 59 - #define DST_XFRM_QUEUE 0x0100 60 - #define DST_METADATA 0x0200 54 + #define DST_NOCOUNT 0x0008 55 + #define DST_FAKE_RTABLE 0x0010 56 + #define DST_XFRM_TUNNEL 0x0020 57 + #define DST_XFRM_QUEUE 0x0040 58 + #define DST_METADATA 0x0080 61 59 62 60 short error; 63 61 ··· 251 253 * __pad_to_align_refcnt declaration in struct dst_entry 252 254 */ 253 255 BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); 254 - atomic_inc(&dst->__refcnt); 256 + WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); 255 257 } 256 258 257 259 static inline void dst_use(struct dst_entry *dst, unsigned long time) ··· 275 277 } 276 278 277 279 void dst_release(struct dst_entry *dst); 280 + 281 + void dst_release_immediate(struct dst_entry *dst); 278 282 279 283 static inline void refdst_drop(unsigned long refdst) 280 284 { ··· 334 334 */ 335 335 static inline bool dst_hold_safe(struct dst_entry *dst) 336 336 { 337 - if (dst->flags & DST_NOCACHE) 338 - return atomic_inc_not_zero(&dst->__refcnt); 339 - dst_hold(dst); 340 - return true; 337 + return atomic_inc_not_zero(&dst->__refcnt); 341 338 } 342 339 343 340 /** ··· 420 423 void dst_init(struct dst_entry *dst, struct dst_ops *ops, 421 424 struct net_device *dev, int initial_ref, int initial_obsolete, 422 425 unsigned short flags); 423 - void __dst_free(struct dst_entry *dst); 424 426 struct dst_entry *dst_destroy(struct dst_entry *dst); 425 - 426 - static inline void dst_free(struct dst_entry *dst) 427 - { 428 - if (dst->obsolete > 0) 429 - return; 430 - if (!atomic_read(&dst->__refcnt)) { 431 - dst = dst_destroy(dst); 432 - if (!dst) 433 - return; 434 - } 435 - __dst_free(dst); 436 - } 437 - 438 - static inline void dst_rcu_free(struct rcu_head *head) 439 - { 440 - struct 
dst_entry *dst = container_of(head, struct dst_entry, rcu_head); 441 - dst_free(dst); 442 - } 427 + void dst_dev_put(struct dst_entry *dst); 443 428 444 429 static inline void dst_confirm(struct dst_entry *dst) 445 430 { ··· 483 504 dst = dst->ops->check(dst, cookie); 484 505 return dst; 485 506 } 486 - 487 - void dst_subsys_init(void); 488 507 489 508 /* Flags for xfrm_lookup flags argument. */ 490 509 enum {
+1 -1
include/net/ip6_fib.h
··· 170 170 static inline u32 rt6_get_cookie(const struct rt6_info *rt) 171 171 { 172 172 if (rt->rt6i_flags & RTF_PCPU || 173 - (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) 173 + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) 174 174 rt = (struct rt6_info *)(rt->dst.from); 175 175 176 176 return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
-1
include/net/ip6_route.h
··· 116 116 const struct in6_addr *saddr, int oif, int flags); 117 117 118 118 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); 119 - int icmp6_dst_gc(void); 120 119 121 120 void fib6_force_start_gc(struct net *net); 122 121
+3 -1
include/net/route.h
··· 190 190 rcu_read_lock(); 191 191 err = ip_route_input_noref(skb, dst, src, tos, devin); 192 192 if (!err) 193 - skb_dst_force(skb); 193 + skb_dst_force_safe(skb); 194 + if (!skb_dst(skb)) 195 + err = -EINVAL; 194 196 rcu_read_unlock(); 195 197 196 198 return err;
-1
net/core/dev.c
··· 8681 8681 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", 8682 8682 NULL, dev_cpu_dead); 8683 8683 WARN_ON(rc < 0); 8684 - dst_subsys_init(); 8685 8684 rc = 0; 8686 8685 out: 8687 8686 return rc;
+43 -233
net/core/dst.c
··· 42 42 * to dirty as few cache lines as possible in __dst_free(). 43 43 * As this is not a very strong hint, we dont force an alignment on SMP. 44 44 */ 45 - static struct { 46 - spinlock_t lock; 47 - struct dst_entry *list; 48 - unsigned long timer_inc; 49 - unsigned long timer_expires; 50 - } dst_garbage = { 51 - .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), 52 - .timer_inc = DST_GC_MAX, 53 - }; 54 - static void dst_gc_task(struct work_struct *work); 55 - static void ___dst_free(struct dst_entry *dst); 56 - 57 - static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); 58 - 59 - static DEFINE_MUTEX(dst_gc_mutex); 60 - /* 61 - * long lived entries are maintained in this list, guarded by dst_gc_mutex 62 - */ 63 - static struct dst_entry *dst_busy_list; 64 - 65 - static void dst_gc_task(struct work_struct *work) 66 - { 67 - int delayed = 0; 68 - int work_performed = 0; 69 - unsigned long expires = ~0L; 70 - struct dst_entry *dst, *next, head; 71 - struct dst_entry *last = &head; 72 - 73 - mutex_lock(&dst_gc_mutex); 74 - next = dst_busy_list; 75 - 76 - loop: 77 - while ((dst = next) != NULL) { 78 - next = dst->next; 79 - prefetch(&next->next); 80 - cond_resched(); 81 - if (likely(atomic_read(&dst->__refcnt))) { 82 - last->next = dst; 83 - last = dst; 84 - delayed++; 85 - continue; 86 - } 87 - work_performed++; 88 - 89 - dst = dst_destroy(dst); 90 - if (dst) { 91 - /* NOHASH and still referenced. Unless it is already 92 - * on gc list, invalidate it and add to gc list. 93 - * 94 - * Note: this is temporary. Actually, NOHASH dst's 95 - * must be obsoleted when parent is obsoleted. 96 - * But we do not have state "obsoleted, but 97 - * referenced by parent", so it is right. 
98 - */ 99 - if (dst->obsolete > 0) 100 - continue; 101 - 102 - ___dst_free(dst); 103 - dst->next = next; 104 - next = dst; 105 - } 106 - } 107 - 108 - spin_lock_bh(&dst_garbage.lock); 109 - next = dst_garbage.list; 110 - if (next) { 111 - dst_garbage.list = NULL; 112 - spin_unlock_bh(&dst_garbage.lock); 113 - goto loop; 114 - } 115 - last->next = NULL; 116 - dst_busy_list = head.next; 117 - if (!dst_busy_list) 118 - dst_garbage.timer_inc = DST_GC_MAX; 119 - else { 120 - /* 121 - * if we freed less than 1/10 of delayed entries, 122 - * we can sleep longer. 123 - */ 124 - if (work_performed <= delayed/10) { 125 - dst_garbage.timer_expires += dst_garbage.timer_inc; 126 - if (dst_garbage.timer_expires > DST_GC_MAX) 127 - dst_garbage.timer_expires = DST_GC_MAX; 128 - dst_garbage.timer_inc += DST_GC_INC; 129 - } else { 130 - dst_garbage.timer_inc = DST_GC_INC; 131 - dst_garbage.timer_expires = DST_GC_MIN; 132 - } 133 - expires = dst_garbage.timer_expires; 134 - /* 135 - * if the next desired timer is more than 4 seconds in the 136 - * future then round the timer to whole seconds 137 - */ 138 - if (expires > 4*HZ) 139 - expires = round_jiffies_relative(expires); 140 - schedule_delayed_work(&dst_gc_work, expires); 141 - } 142 - 143 - spin_unlock_bh(&dst_garbage.lock); 144 - mutex_unlock(&dst_gc_mutex); 145 - } 146 - 147 45 int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 148 46 { 149 47 kfree_skb(skb); ··· 114 216 } 115 217 EXPORT_SYMBOL(dst_alloc); 116 218 117 - static void ___dst_free(struct dst_entry *dst) 118 - { 119 - /* The first case (dev==NULL) is required, when 120 - protocol module is unloaded. 
121 - */ 122 - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { 123 - dst->input = dst_discard; 124 - dst->output = dst_discard_out; 125 - } 126 - dst->obsolete = DST_OBSOLETE_DEAD; 127 - } 128 - 129 - void __dst_free(struct dst_entry *dst) 130 - { 131 - spin_lock_bh(&dst_garbage.lock); 132 - ___dst_free(dst); 133 - dst->next = dst_garbage.list; 134 - dst_garbage.list = dst; 135 - if (dst_garbage.timer_inc > DST_GC_INC) { 136 - dst_garbage.timer_inc = DST_GC_INC; 137 - dst_garbage.timer_expires = DST_GC_MIN; 138 - mod_delayed_work(system_wq, &dst_gc_work, 139 - dst_garbage.timer_expires); 140 - } 141 - spin_unlock_bh(&dst_garbage.lock); 142 - } 143 - EXPORT_SYMBOL(__dst_free); 144 - 145 219 struct dst_entry *dst_destroy(struct dst_entry * dst) 146 220 { 147 221 struct dst_entry *child; 148 222 149 223 smp_rmb(); 150 224 151 - again: 152 225 child = dst->child; 153 226 154 227 if (!(dst->flags & DST_NOCOUNT)) ··· 138 269 kmem_cache_free(dst->ops->kmem_cachep, dst); 139 270 140 271 dst = child; 141 - if (dst) { 142 - int nohash = dst->flags & DST_NOHASH; 143 - 144 - if (atomic_dec_and_test(&dst->__refcnt)) { 145 - /* We were real parent of this dst, so kill child. */ 146 - if (nohash) 147 - goto again; 148 - } else { 149 - /* Child is still referenced, return it for freeing. */ 150 - if (nohash) 151 - return dst; 152 - /* Child is still in his hash table */ 153 - } 154 - } 272 + if (dst) 273 + dst_release_immediate(dst); 155 274 return NULL; 156 275 } 157 276 EXPORT_SYMBOL(dst_destroy); ··· 149 292 struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); 150 293 151 294 dst = dst_destroy(dst); 152 - if (dst) 153 - __dst_free(dst); 154 295 } 296 + 297 + /* Operations to mark dst as DEAD and clean up the net device referenced 298 + * by dst: 299 + * 1. put the dst under loopback interface and discard all tx/rx packets 300 + * on this route. 301 + * 2. 
release the net_device 302 + * This function should be called when removing routes from the fib tree 303 + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to 304 + * make the next dst_ops->check() fail. 305 + */ 306 + void dst_dev_put(struct dst_entry *dst) 307 + { 308 + struct net_device *dev = dst->dev; 309 + 310 + dst->obsolete = DST_OBSOLETE_DEAD; 311 + if (dst->ops->ifdown) 312 + dst->ops->ifdown(dst, dev, true); 313 + dst->input = dst_discard; 314 + dst->output = dst_discard_out; 315 + dst->dev = dev_net(dst->dev)->loopback_dev; 316 + dev_hold(dst->dev); 317 + dev_put(dev); 318 + } 319 + EXPORT_SYMBOL(dst_dev_put); 155 320 156 321 void dst_release(struct dst_entry *dst) 157 322 { 158 323 if (dst) { 159 324 int newrefcnt; 160 - unsigned short nocache = dst->flags & DST_NOCACHE; 161 325 162 326 newrefcnt = atomic_dec_return(&dst->__refcnt); 163 327 if (unlikely(newrefcnt < 0)) 164 328 net_warn_ratelimited("%s: dst:%p refcnt:%d\n", 165 329 __func__, dst, newrefcnt); 166 - if (!newrefcnt && unlikely(nocache)) 330 + if (!newrefcnt) 167 331 call_rcu(&dst->rcu_head, dst_destroy_rcu); 168 332 } 169 333 } 170 334 EXPORT_SYMBOL(dst_release); 335 + 336 + void dst_release_immediate(struct dst_entry *dst) 337 + { 338 + if (dst) { 339 + int newrefcnt; 340 + 341 + newrefcnt = atomic_dec_return(&dst->__refcnt); 342 + if (unlikely(newrefcnt < 0)) 343 + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", 344 + __func__, dst, newrefcnt); 345 + if (!newrefcnt) 346 + dst_destroy(dst); 347 + } 348 + } 349 + EXPORT_SYMBOL(dst_release_immediate); 171 350 172 351 u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) 173 352 { ··· 270 377 271 378 dst = &md_dst->dst; 272 379 dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, 273 - DST_METADATA | DST_NOCACHE | DST_NOCOUNT); 380 + DST_METADATA | DST_NOCOUNT); 274 381 275 382 dst->input = dst_md_discard; 276 383 dst->output = dst_md_discard_out; ··· 316 423 return md_dst; 317 424 } 318 425 
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); 319 - 320 - /* Dirty hack. We did it in 2.2 (in __dst_free), 321 - * we have _very_ good reasons not to repeat 322 - * this mistake in 2.3, but we have no choice 323 - * now. _It_ _is_ _explicit_ _deliberate_ 324 - * _race_ _condition_. 325 - * 326 - * Commented and originally written by Alexey. 327 - */ 328 - static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, 329 - int unregister) 330 - { 331 - if (dst->ops->ifdown) 332 - dst->ops->ifdown(dst, dev, unregister); 333 - 334 - if (dev != dst->dev) 335 - return; 336 - 337 - if (!unregister) { 338 - dst->input = dst_discard; 339 - dst->output = dst_discard_out; 340 - } else { 341 - dst->dev = dev_net(dst->dev)->loopback_dev; 342 - dev_hold(dst->dev); 343 - dev_put(dev); 344 - } 345 - } 346 - 347 - static int dst_dev_event(struct notifier_block *this, unsigned long event, 348 - void *ptr) 349 - { 350 - struct net_device *dev = netdev_notifier_info_to_dev(ptr); 351 - struct dst_entry *dst, *last = NULL; 352 - 353 - switch (event) { 354 - case NETDEV_UNREGISTER_FINAL: 355 - case NETDEV_DOWN: 356 - mutex_lock(&dst_gc_mutex); 357 - for (dst = dst_busy_list; dst; dst = dst->next) { 358 - last = dst; 359 - dst_ifdown(dst, dev, event != NETDEV_DOWN); 360 - } 361 - 362 - spin_lock_bh(&dst_garbage.lock); 363 - dst = dst_garbage.list; 364 - dst_garbage.list = NULL; 365 - /* The code in dst_ifdown places a hold on the loopback device. 366 - * If the gc entry processing is set to expire after a lengthy 367 - * interval, this hold can cause netdev_wait_allrefs() to hang 368 - * out and wait for a long time -- until the the loopback 369 - * interface is released. If we're really unlucky, it'll emit 370 - * pr_emerg messages to console too. Reset the interval here, 371 - * so dst cleanups occur in a more timely fashion. 
372 - */ 373 - if (dst_garbage.timer_inc > DST_GC_INC) { 374 - dst_garbage.timer_inc = DST_GC_INC; 375 - dst_garbage.timer_expires = DST_GC_MIN; 376 - mod_delayed_work(system_wq, &dst_gc_work, 377 - dst_garbage.timer_expires); 378 - } 379 - spin_unlock_bh(&dst_garbage.lock); 380 - 381 - if (last) 382 - last->next = dst; 383 - else 384 - dst_busy_list = dst; 385 - for (; dst; dst = dst->next) 386 - dst_ifdown(dst, dev, event != NETDEV_DOWN); 387 - mutex_unlock(&dst_gc_mutex); 388 - break; 389 - } 390 - return NOTIFY_DONE; 391 - } 392 - 393 - static struct notifier_block dst_dev_notifier = { 394 - .notifier_call = dst_dev_event, 395 - .priority = -10, /* must be called after other network notifiers */ 396 - }; 397 - 398 - void __init dst_subsys_init(void) 399 - { 400 - register_netdevice_notifier(&dst_dev_notifier); 401 - }
+17 -17
net/decnet/dn_route.c
··· 183 183 return dn_rt_hash_mask & (unsigned int)tmp; 184 184 } 185 185 186 - static inline void dnrt_free(struct dn_route *rt) 187 - { 188 - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); 189 - } 190 - 191 186 static void dn_dst_check_expire(unsigned long dummy) 192 187 { 193 188 int i; ··· 197 202 spin_lock(&dn_rt_hash_table[i].lock); 198 203 while ((rt = rcu_dereference_protected(*rtp, 199 204 lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { 200 - if (atomic_read(&rt->dst.__refcnt) || 201 - (now - rt->dst.lastuse) < expire) { 205 + if (atomic_read(&rt->dst.__refcnt) > 1 || 206 + (now - rt->dst.lastuse) < expire) { 202 207 rtp = &rt->dst.dn_next; 203 208 continue; 204 209 } 205 210 *rtp = rt->dst.dn_next; 206 211 rt->dst.dn_next = NULL; 207 - dnrt_free(rt); 212 + dst_dev_put(&rt->dst); 213 + dst_release(&rt->dst); 208 214 } 209 215 spin_unlock(&dn_rt_hash_table[i].lock); 210 216 ··· 231 235 232 236 while ((rt = rcu_dereference_protected(*rtp, 233 237 lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { 234 - if (atomic_read(&rt->dst.__refcnt) || 235 - (now - rt->dst.lastuse) < expire) { 238 + if (atomic_read(&rt->dst.__refcnt) > 1 || 239 + (now - rt->dst.lastuse) < expire) { 236 240 rtp = &rt->dst.dn_next; 237 241 continue; 238 242 } 239 243 *rtp = rt->dst.dn_next; 240 244 rt->dst.dn_next = NULL; 241 - dnrt_free(rt); 245 + dst_dev_put(&rt->dst); 246 + dst_release(&rt->dst); 242 247 break; 243 248 } 244 249 spin_unlock_bh(&dn_rt_hash_table[i].lock); ··· 341 344 dst_use(&rth->dst, now); 342 345 spin_unlock_bh(&dn_rt_hash_table[hash].lock); 343 346 344 - dst_free(&rt->dst); 347 + dst_release_immediate(&rt->dst); 345 348 *rp = rth; 346 349 return 0; 347 350 } ··· 371 374 for(; rt; rt = next) { 372 375 next = rcu_dereference_raw(rt->dst.dn_next); 373 376 RCU_INIT_POINTER(rt->dst.dn_next, NULL); 374 - dnrt_free(rt); 377 + dst_dev_put(&rt->dst); 378 + dst_release(&rt->dst); 375 379 } 376 380 377 381 nothing_to_declare: ··· 1179 1181 if (dev_out->flags & 
IFF_LOOPBACK) 1180 1182 flags |= RTCF_LOCAL; 1181 1183 1182 - rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST); 1184 + rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST); 1183 1185 if (rt == NULL) 1184 1186 goto e_nobufs; 1185 1187 ··· 1213 1215 goto e_neighbour; 1214 1216 1215 1217 hash = dn_hash(rt->fld.saddr, rt->fld.daddr); 1218 + /* dn_insert_route() increments dst->__refcnt */ 1216 1219 dn_insert_route(rt, hash, (struct dn_route **)pprt); 1217 1220 1218 1221 done: ··· 1236 1237 err = -ENOBUFS; 1237 1238 goto done; 1238 1239 e_neighbour: 1239 - dst_free(&rt->dst); 1240 + dst_release_immediate(&rt->dst); 1240 1241 goto e_nobufs; 1241 1242 } 1242 1243 ··· 1444 1445 } 1445 1446 1446 1447 make_route: 1447 - rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST); 1448 + rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST); 1448 1449 if (rt == NULL) 1449 1450 goto e_nobufs; 1450 1451 ··· 1490 1491 goto e_neighbour; 1491 1492 1492 1493 hash = dn_hash(rt->fld.saddr, rt->fld.daddr); 1494 + /* dn_insert_route() increments dst->__refcnt */ 1493 1495 dn_insert_route(rt, hash, &rt); 1494 1496 skb_dst_set(skb, &rt->dst); 1495 1497 ··· 1514 1514 goto done; 1515 1515 1516 1516 e_neighbour: 1517 - dst_free(&rt->dst); 1517 + dst_release_immediate(&rt->dst); 1518 1518 goto done; 1519 1519 } 1520 1520
+6 -3
net/ipv4/fib_semantics.c
··· 152 152 * free_fib_info_rcu() 153 153 */ 154 154 155 - dst_free(&rt->dst); 155 + dst_dev_put(&rt->dst); 156 + dst_release_immediate(&rt->dst); 156 157 } 157 158 158 159 static void free_nh_exceptions(struct fib_nh *nh) ··· 195 194 struct rtable *rt; 196 195 197 196 rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); 198 - if (rt) 199 - dst_free(&rt->dst); 197 + if (rt) { 198 + dst_dev_put(&rt->dst); 199 + dst_release_immediate(&rt->dst); 200 + } 200 201 } 201 202 free_percpu(rtp); 202 203 }
+33 -29
net/ipv4/route.c
··· 589 589 build_sk_flow_key(fl4, sk); 590 590 } 591 591 592 - static inline void rt_free(struct rtable *rt) 593 - { 594 - call_rcu(&rt->dst.rcu_head, dst_rcu_free); 595 - } 596 - 597 592 static DEFINE_SPINLOCK(fnhe_lock); 598 593 599 594 static void fnhe_flush_routes(struct fib_nh_exception *fnhe) ··· 598 603 rt = rcu_dereference(fnhe->fnhe_rth_input); 599 604 if (rt) { 600 605 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); 601 - rt_free(rt); 606 + dst_dev_put(&rt->dst); 607 + dst_release(&rt->dst); 602 608 } 603 609 rt = rcu_dereference(fnhe->fnhe_rth_output); 604 610 if (rt) { 605 611 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); 606 - rt_free(rt); 612 + dst_dev_put(&rt->dst); 613 + dst_release(&rt->dst); 607 614 } 608 615 } 609 616 ··· 1299 1302 } 1300 1303 1301 1304 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, 1302 - __be32 daddr) 1305 + __be32 daddr, const bool do_cache) 1303 1306 { 1304 1307 bool ret = false; 1305 1308 ··· 1328 1331 if (!rt->rt_gateway) 1329 1332 rt->rt_gateway = daddr; 1330 1333 1331 - if (!(rt->dst.flags & DST_NOCACHE)) { 1334 + if (do_cache) { 1335 + dst_hold(&rt->dst); 1332 1336 rcu_assign_pointer(*porig, rt); 1333 - if (orig) 1334 - rt_free(orig); 1337 + if (orig) { 1338 + dst_dev_put(&orig->dst); 1339 + dst_release(&orig->dst); 1340 + } 1335 1341 ret = true; 1336 1342 } 1337 1343 ··· 1357 1357 } 1358 1358 orig = *p; 1359 1359 1360 + /* hold dst before doing cmpxchg() to avoid race condition 1361 + * on this dst 1362 + */ 1363 + dst_hold(&rt->dst); 1360 1364 prev = cmpxchg(p, orig, rt); 1361 1365 if (prev == orig) { 1362 - if (orig) 1363 - rt_free(orig); 1364 - } else 1366 + if (orig) { 1367 + dst_dev_put(&orig->dst); 1368 + dst_release(&orig->dst); 1369 + } 1370 + } else { 1371 + dst_release(&rt->dst); 1365 1372 ret = false; 1373 + } 1366 1374 1367 1375 return ret; 1368 1376 } ··· 1441 1433 static void rt_set_nexthop(struct rtable *rt, __be32 daddr, 1442 1434 const struct fib_result *res, 1443 1435 
struct fib_nh_exception *fnhe, 1444 - struct fib_info *fi, u16 type, u32 itag) 1436 + struct fib_info *fi, u16 type, u32 itag, 1437 + const bool do_cache) 1445 1438 { 1446 1439 bool cached = false; 1447 1440 ··· 1463 1454 #endif 1464 1455 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); 1465 1456 if (unlikely(fnhe)) 1466 - cached = rt_bind_exception(rt, fnhe, daddr); 1467 - else if (!(rt->dst.flags & DST_NOCACHE)) 1457 + cached = rt_bind_exception(rt, fnhe, daddr, do_cache); 1458 + else if (do_cache) 1468 1459 cached = rt_cache_route(nh, rt); 1469 1460 if (unlikely(!cached)) { 1470 1461 /* Routes we intend to cache in nexthop exception or ··· 1472 1463 * However, if we are unsuccessful at storing this 1473 1464 * route into the cache we really need to set it. 1474 1465 */ 1475 - rt->dst.flags |= DST_NOCACHE; 1476 1466 if (!rt->rt_gateway) 1477 1467 rt->rt_gateway = daddr; 1478 1468 rt_add_uncached_list(rt); ··· 1494 1486 struct rtable *rt; 1495 1487 1496 1488 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, 1497 - (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | 1489 + (will_cache ? 0 : DST_HOST) | 1498 1490 (nopolicy ? DST_NOPOLICY : 0) | 1499 1491 (noxfrm ? 
DST_NOXFRM : 0)); 1500 1492 ··· 1738 1730 1739 1731 rth->dst.input = ip_forward; 1740 1732 1741 - rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); 1733 + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, 1734 + do_cache); 1742 1735 set_lwt_redirect(rth); 1743 1736 skb_dst_set(skb, &rth->dst); 1744 1737 out: ··· 2027 2018 rth->dst.input = lwtunnel_input; 2028 2019 } 2029 2020 2030 - if (unlikely(!rt_cache_route(nh, rth))) { 2031 - rth->dst.flags |= DST_NOCACHE; 2021 + if (unlikely(!rt_cache_route(nh, rth))) 2032 2022 rt_add_uncached_list(rth); 2033 - } 2034 2023 } 2035 2024 skb_dst_set(skb, &rth->dst); 2036 2025 err = 0; ··· 2224 2217 rth = rcu_dereference(*prth); 2225 2218 2226 2219 rt_cache: 2227 - if (rt_cache_valid(rth)) { 2228 - dst_hold(&rth->dst); 2220 + if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) 2229 2221 return rth; 2230 - } 2231 2222 } 2232 2223 2233 2224 add: ··· 2259 2254 #endif 2260 2255 } 2261 2256 2262 - rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); 2257 + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); 2263 2258 set_lwt_redirect(rth); 2264 2259 2265 2260 return rth; ··· 2509 2504 new->input = dst_discard; 2510 2505 new->output = dst_discard_out; 2511 2506 2512 - new->dev = ort->dst.dev; 2507 + new->dev = net->loopback_dev; 2513 2508 if (new->dev) 2514 2509 dev_hold(new->dev); 2515 2510 ··· 2524 2519 rt->rt_uses_gateway = ort->rt_uses_gateway; 2525 2520 2526 2521 INIT_LIST_HEAD(&rt->rt_uncached); 2527 - dst_free(new); 2528 2522 } 2529 2523 2530 2524 dst_release(dst_orig);
+9 -10
net/ipv4/udp.c
··· 1977 1977 { 1978 1978 struct dst_entry *old; 1979 1979 1980 - dst_hold(dst); 1981 - old = xchg(&sk->sk_rx_dst, dst); 1982 - dst_release(old); 1980 + if (dst_hold_safe(dst)) { 1981 + old = xchg(&sk->sk_rx_dst, dst); 1982 + dst_release(old); 1983 + } 1983 1984 } 1984 1985 1985 1986 /* ··· 2304 2303 if (dst) 2305 2304 dst = dst_check(dst, 0); 2306 2305 if (dst) { 2307 - /* DST_NOCACHE can not be used without taking a reference */ 2308 - if (dst->flags & DST_NOCACHE) { 2309 - if (likely(atomic_inc_not_zero(&dst->__refcnt))) 2310 - skb_dst_set(skb, dst); 2311 - } else { 2312 - skb_dst_set_noref(skb, dst); 2313 - } 2306 + /* set noref for now. 2307 + * any place which wants to hold dst has to call 2308 + * dst_hold_safe() 2309 + */ 2310 + skb_dst_set_noref(skb, dst); 2314 2311 } 2315 2312 } 2316 2313
+2 -2
net/ipv6/addrconf.c
··· 5576 5576 ip6_del_rt(rt); 5577 5577 } 5578 5578 if (ifp->rt) { 5579 - dst_hold(&ifp->rt->dst); 5580 - ip6_del_rt(ifp->rt); 5579 + if (dst_hold_safe(&ifp->rt->dst)) 5580 + ip6_del_rt(ifp->rt); 5581 5581 } 5582 5582 rt_genid_bump_ipv6(net); 5583 5583 break;
+15 -17
net/ipv6/ip6_fib.c
··· 153 153 kmem_cache_free(fib6_node_kmem, fn); 154 154 } 155 155 156 - static void rt6_rcu_free(struct rt6_info *rt) 157 - { 158 - call_rcu(&rt->dst.rcu_head, dst_rcu_free); 159 - } 160 - 161 156 static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) 162 157 { 163 158 int cpu; ··· 167 172 ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); 168 173 pcpu_rt = *ppcpu_rt; 169 174 if (pcpu_rt) { 170 - rt6_rcu_free(pcpu_rt); 175 + dst_dev_put(&pcpu_rt->dst); 176 + dst_release(&pcpu_rt->dst); 171 177 *ppcpu_rt = NULL; 172 178 } 173 179 } ··· 181 185 { 182 186 if (atomic_dec_and_test(&rt->rt6i_ref)) { 183 187 rt6_free_pcpu(rt); 184 - rt6_rcu_free(rt); 188 + dst_dev_put(&rt->dst); 189 + dst_release(&rt->dst); 185 190 } 186 191 } 187 192 ··· 975 978 int replace_required = 0; 976 979 int sernum = fib6_new_sernum(info->nl_net); 977 980 978 - if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && 979 - !atomic_read(&rt->dst.__refcnt))) 981 + if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) 980 982 return -EINVAL; 981 983 982 984 if (info->nlh) { ··· 1072 1076 fib6_start_gc(info->nl_net, rt); 1073 1077 if (!(rt->rt6i_flags & RTF_CACHE)) 1074 1078 fib6_prune_clones(info->nl_net, pn); 1075 - rt->dst.flags &= ~DST_NOCACHE; 1076 1079 } 1077 1080 1078 1081 out: ··· 1096 1101 atomic_inc(&pn->leaf->rt6i_ref); 1097 1102 } 1098 1103 #endif 1099 - if (!(rt->dst.flags & DST_NOCACHE)) 1100 - dst_free(&rt->dst); 1104 + /* Always release dst as dst->__refcnt is guaranteed 1105 + * to be taken before entering this function 1106 + */ 1107 + dst_release_immediate(&rt->dst); 1101 1108 } 1102 1109 return err; 1103 1110 ··· 1110 1113 st_failure: 1111 1114 if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) 1112 1115 fib6_repair_tree(info->nl_net, fn); 1113 - if (!(rt->dst.flags & DST_NOCACHE)) 1114 - dst_free(&rt->dst); 1116 + /* Always release dst as dst->__refcnt is guaranteed 1117 + * to be taken before entering this function 1118 + */ 1119 + dst_release_immediate(&rt->dst); 1115 1120 return 
err; 1116 1121 #endif 1117 1122 } ··· 1782 1783 } 1783 1784 gc_args->more++; 1784 1785 } else if (rt->rt6i_flags & RTF_CACHE) { 1785 - if (atomic_read(&rt->dst.__refcnt) == 0 && 1786 + if (atomic_read(&rt->dst.__refcnt) == 1 && 1786 1787 time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { 1787 1788 RT6_TRACE("aging clone %p\n", rt); 1788 1789 return -1; ··· 1820 1821 } 1821 1822 gc_args.timeout = expires ? (int)expires : 1822 1823 net->ipv6.sysctl.ip6_rt_gc_interval; 1823 - 1824 - gc_args.more = icmp6_dst_gc(); 1824 + gc_args.more = 0; 1825 1825 1826 1826 fib6_clean_all(net, fib6_age, &gc_args); 1827 1827 now = jiffies;
-4
net/ipv6/ip6_output.c
··· 698 698 ipv6_hdr(skb)->payload_len = htons(first_len - 699 699 sizeof(struct ipv6hdr)); 700 700 701 - dst_hold(&rt->dst); 702 - 703 701 for (;;) { 704 702 /* Prepare header of the next frame, 705 703 * before previous one went down. */ ··· 740 742 if (err == 0) { 741 743 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 742 744 IPSTATS_MIB_FRAGOKS); 743 - ip6_rt_put(rt); 744 745 return 0; 745 746 } 746 747 ··· 747 750 748 751 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 749 752 IPSTATS_MIB_FRAGFAILS); 750 - ip6_rt_put(rt); 751 753 return err; 752 754 753 755 slow_path_clean:
+43 -84
net/ipv6/route.c
··· 128 128 { 129 129 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); 130 130 131 - rt->dst.flags |= DST_NOCACHE; 132 131 rt->rt6i_uncached_list = ul; 133 132 134 133 spin_lock_bh(&ul->lock); ··· 353 354 int flags) 354 355 { 355 356 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 356 - 0, DST_OBSOLETE_FORCE_CHK, flags); 357 + 1, DST_OBSOLETE_FORCE_CHK, flags); 357 358 358 359 if (rt) 359 360 rt6_info_init(rt); ··· 380 381 *p = NULL; 381 382 } 382 383 } else { 383 - dst_destroy((struct dst_entry *)rt); 384 + dst_release_immediate(&rt->dst); 384 385 return NULL; 385 386 } 386 387 } ··· 931 932 EXPORT_SYMBOL(rt6_lookup); 932 933 933 934 /* ip6_ins_rt is called with FREE table->tb6_lock. 934 - It takes new route entry, the addition fails by any reason the 935 - route is freed. In any case, if caller does not hold it, it may 936 - be destroyed. 935 + * It takes new route entry, the addition fails by any reason the 936 + * route is released. 937 + * Caller must hold dst before calling it. 937 938 */ 938 939 939 940 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, ··· 956 957 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; 957 958 struct mx6_config mxc = { .mx = NULL, }; 958 959 960 + /* Hold dst to account for the reference from the fib6 tree */ 961 + dst_hold(&rt->dst); 959 962 return __ip6_ins_rt(rt, &info, &mxc, NULL); 960 963 } 961 964 ··· 1050 1049 prev = cmpxchg(p, NULL, pcpu_rt); 1051 1050 if (prev) { 1052 1051 /* If someone did it before us, return prev instead */ 1053 - dst_destroy(&pcpu_rt->dst); 1052 + dst_release_immediate(&pcpu_rt->dst); 1054 1053 pcpu_rt = prev; 1055 1054 } 1056 1055 } else { ··· 1060 1059 * since rt is going away anyway. The next 1061 1060 * dst_check() will trigger a re-lookup. 
1062 1061 */ 1063 - dst_destroy(&pcpu_rt->dst); 1062 + dst_release_immediate(&pcpu_rt->dst); 1064 1063 pcpu_rt = rt; 1065 1064 } 1066 1065 dst_hold(&pcpu_rt->dst); ··· 1130 1129 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); 1131 1130 dst_release(&rt->dst); 1132 1131 1133 - if (uncached_rt) 1132 + if (uncached_rt) { 1133 + /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 1134 + * No need for another dst_hold() 1135 + */ 1134 1136 rt6_uncached_list_add(uncached_rt); 1135 - else 1137 + } else { 1136 1138 uncached_rt = net->ipv6.ip6_null_entry; 1137 - 1138 - dst_hold(&uncached_rt->dst); 1139 + dst_hold(&uncached_rt->dst); 1140 + } 1139 1141 1140 1142 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); 1141 1143 return uncached_rt; ··· 1249 1245 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 1250 1246 { 1251 1247 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 1248 + struct net_device *loopback_dev = net->loopback_dev; 1252 1249 struct dst_entry *new = NULL; 1253 1250 1254 - rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); 1251 + rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, 1252 + DST_OBSOLETE_NONE, 0); 1255 1253 if (rt) { 1256 1254 rt6_info_init(rt); 1257 1255 ··· 1263 1257 new->output = dst_discard_out; 1264 1258 1265 1259 dst_copy_metrics(new, &ort->dst); 1266 - rt->rt6i_idev = ort->rt6i_idev; 1267 - if (rt->rt6i_idev) 1268 - in6_dev_hold(rt->rt6i_idev); 1269 1260 1261 + rt->rt6i_idev = in6_dev_get(loopback_dev); 1270 1262 rt->rt6i_gateway = ort->rt6i_gateway; 1271 1263 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; 1272 1264 rt->rt6i_metric = 0; ··· 1273 1269 #ifdef CONFIG_IPV6_SUBTREES 1274 1270 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 1275 1271 #endif 1276 - 1277 - dst_free(new); 1278 1272 } 1279 1273 1280 1274 dst_release(dst_orig); ··· 1325 1323 rt6_dst_from_metrics_check(rt); 1326 1324 1327 1325 if (rt->rt6i_flags & 
RTF_PCPU || 1328 - (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from)) 1326 + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) 1329 1327 return rt6_dst_from_check(rt, cookie); 1330 1328 else 1331 1329 return rt6_check(rt, cookie); ··· 1358 1356 rt = (struct rt6_info *) skb_dst(skb); 1359 1357 if (rt) { 1360 1358 if (rt->rt6i_flags & RTF_CACHE) { 1361 - dst_hold(&rt->dst); 1362 - ip6_del_rt(rt); 1359 + if (dst_hold_safe(&rt->dst)) 1360 + ip6_del_rt(rt); 1363 1361 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { 1364 1362 rt->rt6i_node->fn_sernum = -1; 1365 1363 } ··· 1423 1421 * invalidate the sk->sk_dst_cache. 1424 1422 */ 1425 1423 ip6_ins_rt(nrt6); 1424 + /* Release the reference taken in 1425 + * ip6_rt_cache_alloc() 1426 + */ 1427 + dst_release(&nrt6->dst); 1426 1428 } 1427 1429 } 1428 1430 } ··· 1655 1649 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 1656 1650 } 1657 1651 1658 - static struct dst_entry *icmp6_dst_gc_list; 1659 - static DEFINE_SPINLOCK(icmp6_dst_lock); 1660 - 1661 1652 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 1662 1653 struct flowi6 *fl6) 1663 1654 { ··· 1675 1672 1676 1673 rt->dst.flags |= DST_HOST; 1677 1674 rt->dst.output = ip6_output; 1678 - atomic_set(&rt->dst.__refcnt, 1); 1679 1675 rt->rt6i_gateway = fl6->daddr; 1680 1676 rt->rt6i_dst.addr = fl6->daddr; 1681 1677 rt->rt6i_dst.plen = 128; 1682 1678 rt->rt6i_idev = idev; 1683 1679 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 1684 1680 1685 - spin_lock_bh(&icmp6_dst_lock); 1686 - rt->dst.next = icmp6_dst_gc_list; 1687 - icmp6_dst_gc_list = &rt->dst; 1688 - spin_unlock_bh(&icmp6_dst_lock); 1689 - 1690 - fib6_force_start_gc(net); 1681 + /* Add this dst into uncached_list so that rt6_ifdown() can 1682 + * do proper release of the net_device 1683 + */ 1684 + rt6_uncached_list_add(rt); 1691 1685 1692 1686 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 1693 1687 1694 1688 out: 1695 1689 return dst; 1696 - } 1697 - 1698 - int 
icmp6_dst_gc(void) 1699 - { 1700 - struct dst_entry *dst, **pprev; 1701 - int more = 0; 1702 - 1703 - spin_lock_bh(&icmp6_dst_lock); 1704 - pprev = &icmp6_dst_gc_list; 1705 - 1706 - while ((dst = *pprev) != NULL) { 1707 - if (!atomic_read(&dst->__refcnt)) { 1708 - *pprev = dst->next; 1709 - dst_free(dst); 1710 - } else { 1711 - pprev = &dst->next; 1712 - ++more; 1713 - } 1714 - } 1715 - 1716 - spin_unlock_bh(&icmp6_dst_lock); 1717 - 1718 - return more; 1719 - } 1720 - 1721 - static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg), 1722 - void *arg) 1723 - { 1724 - struct dst_entry *dst, **pprev; 1725 - 1726 - spin_lock_bh(&icmp6_dst_lock); 1727 - pprev = &icmp6_dst_gc_list; 1728 - while ((dst = *pprev) != NULL) { 1729 - struct rt6_info *rt = (struct rt6_info *) dst; 1730 - if (func(rt, arg)) { 1731 - *pprev = dst->next; 1732 - dst_free(dst); 1733 - } else { 1734 - pprev = &dst->next; 1735 - } 1736 - } 1737 - spin_unlock_bh(&icmp6_dst_lock); 1738 1690 } 1739 1691 1740 1692 static int ip6_dst_gc(struct dst_ops *ops) ··· 2088 2130 if (idev) 2089 2131 in6_dev_put(idev); 2090 2132 if (rt) 2091 - dst_free(&rt->dst); 2133 + dst_release_immediate(&rt->dst); 2092 2134 2093 2135 return ERR_PTR(err); 2094 2136 } ··· 2118 2160 return err; 2119 2161 out: 2120 2162 if (rt) 2121 - dst_free(&rt->dst); 2163 + dst_release_immediate(&rt->dst); 2122 2164 2123 2165 return err; 2124 2166 } ··· 2129 2171 struct fib6_table *table; 2130 2172 struct net *net = dev_net(rt->dst.dev); 2131 2173 2132 - if (rt == net->ipv6.ip6_null_entry || 2133 - rt->dst.flags & DST_NOCACHE) { 2174 + if (rt == net->ipv6.ip6_null_entry) { 2134 2175 err = -ENOENT; 2135 2176 goto out; 2136 2177 } ··· 2354 2397 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 2355 2398 2356 2399 if (ip6_ins_rt(nrt)) 2357 - goto out; 2400 + goto out_release; 2358 2401 2359 2402 netevent.old = &rt->dst; 2360 2403 netevent.new = &nrt->dst; ··· 2366 2409 rt = (struct rt6_info *) dst_clone(&rt->dst); 2367 2410 
ip6_del_rt(rt); 2368 2411 } 2412 + 2413 + out_release: 2414 + /* Release the reference taken in 2415 + * ip6_rt_cache_alloc() 2416 + */ 2417 + dst_release(&nrt->dst); 2369 2418 2370 2419 out: 2371 2420 neigh_release(neigh); ··· 2720 2757 rt->rt6i_dst.plen = 128; 2721 2758 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 2722 2759 rt->rt6i_table = fib6_get_table(net, tb_id); 2723 - rt->dst.flags |= DST_NOCACHE; 2724 - 2725 - atomic_set(&rt->dst.__refcnt, 1); 2726 2760 2727 2761 return rt; 2728 2762 } ··· 2807 2847 }; 2808 2848 2809 2849 fib6_clean_all(net, fib6_ifdown, &adn); 2810 - icmp6_clean_all(fib6_ifdown, &adn); 2811 2850 if (dev) 2812 2851 rt6_uncached_list_flush_dev(net, dev); 2813 2852 } ··· 3144 3185 3145 3186 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); 3146 3187 if (err) { 3147 - dst_free(&rt->dst); 3188 + dst_release_immediate(&rt->dst); 3148 3189 goto cleanup; 3149 3190 } 3150 3191 ··· 3208 3249 cleanup: 3209 3250 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 3210 3251 if (nh->rt6_info) 3211 - dst_free(&nh->rt6_info->dst); 3252 + dst_release_immediate(&nh->rt6_info->dst); 3212 3253 kfree(nh->mxc.mx); 3213 3254 list_del(&nh->next); 3214 3255 kfree(nh);
+5 -6
net/ipv6/udp.c
··· 920 920 if (dst) 921 921 dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); 922 922 if (dst) { 923 - if (dst->flags & DST_NOCACHE) { 924 - if (likely(atomic_inc_not_zero(&dst->__refcnt))) 925 - skb_dst_set(skb, dst); 926 - } else { 927 - skb_dst_set_noref(skb, dst); 928 - } 923 + /* set noref for now. 924 + * any place which wants to hold dst has to call 925 + * dst_hold_safe() 926 + */ 927 + skb_dst_set_noref(skb, dst); 929 928 } 930 929 } 931 930
+30 -19
net/xfrm/xfrm_policy.c
··· 1590 1590 struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); 1591 1591 struct dst_entry *dst = &xdst->u.dst; 1592 1592 1593 - dst_free(dst); 1593 + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ 1594 + dst->obsolete = DST_OBSOLETE_DEAD; 1595 + dst_release_immediate(dst); 1594 1596 } 1595 1597 1596 1598 static const struct flow_cache_ops xfrm_bundle_fc_ops = { ··· 1622 1620 default: 1623 1621 BUG(); 1624 1622 } 1625 - xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0); 1623 + xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0); 1626 1624 1627 1625 if (likely(xdst)) { 1628 1626 struct dst_entry *dst = &xdst->u.dst; ··· 1725 1723 1726 1724 if (!dst_prev) 1727 1725 dst0 = dst1; 1728 - else { 1729 - dst_prev->child = dst_clone(dst1); 1730 - dst1->flags |= DST_NOHASH; 1731 - } 1726 + else 1727 + /* Ref count is taken during xfrm_alloc_dst() 1728 + * No need to do dst_clone() on dst1 1729 + */ 1730 + dst_prev->child = dst1; 1732 1731 1733 1732 xdst->route = dst; 1734 1733 dst_copy_metrics(dst1, dst); ··· 1795 1792 xfrm_state_put(xfrm[i]); 1796 1793 free_dst: 1797 1794 if (dst0) 1798 - dst_free(dst0); 1795 + dst_release_immediate(dst0); 1799 1796 dst0 = ERR_PTR(err); 1800 1797 goto out; 1801 1798 } ··· 2076 2073 pol_dead |= pols[i]->walk.dead; 2077 2074 } 2078 2075 if (pol_dead) { 2079 - dst_free(&xdst->u.dst); 2076 + /* Mark DST_OBSOLETE_DEAD to fail the next 2077 + * xfrm_dst_check() 2078 + */ 2079 + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; 2080 + dst_release_immediate(&xdst->u.dst); 2080 2081 xdst = NULL; 2081 2082 num_pols = 0; 2082 2083 num_xfrms = 0; ··· 2127 2120 if (xdst) { 2128 2121 /* The policies were stolen for newly generated bundle */ 2129 2122 xdst->num_pols = 0; 2130 - dst_free(&xdst->u.dst); 2123 + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ 2124 + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; 2125 + dst_release_immediate(&xdst->u.dst); 2131 2126 } 2132 2127 2133 - /* Flow cache does not have 
reference, it dst_free()'s, 2134 - * but we do need to return one reference for original caller */ 2128 + /* We do need to return one reference for original caller */ 2135 2129 dst_hold(&new_xdst->u.dst); 2136 2130 return &new_xdst->flo; 2137 2131 ··· 2155 2147 inc_error: 2156 2148 XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); 2157 2149 error: 2158 - if (xdst != NULL) 2159 - dst_free(&xdst->u.dst); 2160 - else 2150 + if (xdst != NULL) { 2151 + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ 2152 + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; 2153 + dst_release_immediate(&xdst->u.dst); 2154 + } else 2161 2155 xfrm_pols_put(pols, num_pols); 2162 2156 return ERR_PTR(err); 2163 2157 } ··· 2231 2221 } 2232 2222 2233 2223 dst_hold(&xdst->u.dst); 2234 - xdst->u.dst.flags |= DST_NOCACHE; 2235 2224 route = xdst->route; 2236 2225 } 2237 2226 } ··· 2645 2636 * notice. That's what we are validating here via the 2646 2637 * stale_bundle() check. 2647 2638 * 2648 - * When a policy's bundle is pruned, we dst_free() the XFRM 2649 - * dst which causes it's ->obsolete field to be set to 2650 - * DST_OBSOLETE_DEAD. If an XFRM dst has been pruned like 2651 - * this, we want to force a new route lookup. 2639 + * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will 2640 + * be marked on it. 2641 + * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will 2642 + * be marked on it. 2643 + * Both will force stable_bundle() to fail on any xdst bundle with 2644 + * this dst linked in it. 2652 2645 */ 2653 2646 if (dst->obsolete < 0 && !stale_bundle(dst)) 2654 2647 return dst;