Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Linux INET6 implementation
4 * Forwarding Information Database
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Changes:
10 * Yuji SEKIYA @USAGI: Support default route on router node;
11 * remove ip6_null_entry from the top of
12 * routing table.
13 * Ville Nuorvala: Fixed routing subtrees.
14 */
15
16#define pr_fmt(fmt) "IPv6: " fmt
17
18#include <linux/bpf.h>
19#include <linux/errno.h>
20#include <linux/types.h>
21#include <linux/net.h>
22#include <linux/route.h>
23#include <linux/netdevice.h>
24#include <linux/in6.h>
25#include <linux/init.h>
26#include <linux/list.h>
27#include <linux/slab.h>
28
29#include <net/ip.h>
30#include <net/ipv6.h>
31#include <net/ndisc.h>
32#include <net/addrconf.h>
33#include <net/lwtunnel.h>
34#include <net/fib_notifier.h>
35
36#include <net/ip_fib.h>
37#include <net/ip6_fib.h>
38#include <net/ip6_route.h>
39
40static struct kmem_cache *fib6_node_kmem __read_mostly;
41
/* Context for a tree-cleaning walk over a FIB table.  Wraps the generic
 * walker and carries the per-route callback plus its cookie.
 */
struct fib6_cleaner {
	struct fib6_walker w;	/* embedded generic walker; must stay first so
				 * the container_of()-style cast works */
	struct net *net;
	int (*func)(struct fib6_info *, void *arg);	/* invoked on each route */
	int sernum;		/* serial number to stamp on visited nodes */
	void *arg;		/* opaque cookie handed to @func */
	bool skip_notify;	/* suppress notifications during cleanup */
};
50
/* Initial walker state: with per-source subtrees compiled in, a walk must
 * also descend into subtrees (FWS_S); otherwise it starts at FWS_L.
 */
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
#define FWS_INIT FWS_L
#endif

static struct fib6_info *fib6_find_prefix(struct net *net,
					  struct fib6_table *table,
					  struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net,
					  struct fib6_table *table,
					  struct fib6_node *fn);
static int fib6_walk(struct net *net, struct fib6_walker *w);
static int fib6_walk_continue(struct fib6_walker *w);

/*
 * A routing update causes an increase of the serial number on the
 * affected subtree. This allows for cached routes to be asynchronously
 * tested when modifications are made to the destination cache as a
 * result of redirects, path MTU changes, etc.
 */

static void fib6_gc_timer_cb(struct timer_list *t);

/* Iterate over all walkers registered on @net; callers are expected to
 * hold fib6_walker_lock (NOTE(review): confirm at call sites outside
 * this chunk).
 */
#define FOR_WALKERS(net, w) \
	list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
77
/* Register walker @w on the per-netns walker list under the walker lock,
 * making it visible to concurrent tree mutators.
 */
static void fib6_walker_link(struct net *net, struct fib6_walker *w)
{
	write_lock_bh(&net->ipv6.fib6_walker_lock);
	list_add(&w->lh, &net->ipv6.fib6_walkers);
	write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
84
/* Remove walker @w from the per-netns walker list under the walker lock.
 * Counterpart of fib6_walker_link().
 */
static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
{
	write_lock_bh(&net->ipv6.fib6_walker_lock);
	list_del(&w->lh);
	write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
91
/* Atomically advance the per-netns FIB serial number and return the new
 * value.  Wraps from INT_MAX back to 1, skipping 0 (0 is reserved as
 * FIB6_NO_SERNUM_CHANGE).  Lock-free: retries the cmpxchg until no
 * other CPU raced with us.
 */
static int fib6_new_sernum(struct net *net)
{
	int new, old = atomic_read(&net->ipv6.fib6_sernum);

	do {
		new = old < INT_MAX ? old + 1 : 1;
		/* atomic_try_cmpxchg() updates 'old' on failure, so the
		 * loop recomputes from the freshest value.
		 */
	} while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));

	return new;
}
102
enum {
	FIB6_NO_SERNUM_CHANGE = 0,	/* sernum 0 is never handed out */
};

/* Stamp a fresh serial number on the tree node that holds @f6i so that
 * cached routes derived from it get revalidated.  Caller must hold the
 * table's tb6_lock (enforced via lockdep below).
 */
void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
	struct fib6_node *fn;

	fn = rcu_dereference_protected(f6i->fib6_node,
			lockdep_is_held(&f6i->fib6_table->tb6_lock));
	if (fn)
		WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
}
116
117/*
118 * Auxiliary address test functions for the radix tree.
119 *
120 * These assume a 32bit processor (although it will work on
121 * 64bit processors)
122 */
123
124/*
125 * test bit
126 */
/* XOR swizzle that maps a big-endian bit index onto the host's native
 * bit numbering within a 32-bit word; zero on big-endian hosts.
 */
#if defined(__LITTLE_ENDIAN)
# define BITOP_BE32_SWIZZLE	(0x1F & ~7)
#else
# define BITOP_BE32_SWIZZLE	0
#endif

/* Return non-zero iff bit @fn_bit (counting from the most significant
 * bit of the address, network order) is set in the IPv6 address @token.
 */
static __be32 addr_bit_set(const void *token, int fn_bit)
{
	const __be32 *addr = token;
	/*
	 * Here,
	 *	1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
	 * is optimized version of
	 *	htonl(1 << ((~fn_bit)&0x1F))
	 * See include/asm-generic/bitops/le.h.
	 */
	return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
	       addr[fn_bit >> 5];	/* fn_bit >> 5 selects the 32-bit word */
}
146
/* Allocate a zeroed fib6_info, optionally with an embedded fib6_nh
 * appended to the same allocation (@with_fib6_nh).  The new entry starts
 * with one reference held.  Returns NULL on allocation failure.
 */
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
	struct fib6_info *f6i;
	size_t sz = sizeof(*f6i);

	if (with_fib6_nh)
		sz += sizeof(struct fib6_nh);

	f6i = kzalloc(sz, gfp_flags);
	if (!f6i)
		return NULL;

	/* fib6_siblings is a union with nh_list, so this initializes both */
	INIT_LIST_HEAD(&f6i->fib6_siblings);
	refcount_set(&f6i->fib6_ref, 1);

	INIT_HLIST_NODE(&f6i->gc_link);

	return f6i;
}
167
/* RCU callback that finally frees a fib6_info after the last reference
 * is gone and a grace period has elapsed.  Releases the nexthop (shared
 * object or embedded fib6_nh) and the metrics before freeing.
 */
void fib6_info_destroy_rcu(struct rcu_head *head)
{
	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);

	/* the entry must already be unlinked from its tree node */
	WARN_ON(f6i->fib6_node);

	if (f6i->nh)
		nexthop_put(f6i->nh);
	else
		fib6_nh_release(f6i->fib6_nh);

	ip_fib_metrics_put(f6i->fib6_metrics);
	kfree(f6i);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
183
184static struct fib6_node *node_alloc(struct net *net)
185{
186 struct fib6_node *fn;
187
188 fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
189 if (fn)
190 net->ipv6.rt6_stats->fib_nodes++;
191
192 return fn;
193}
194
/* Free a node that was never published in the tree (no RCU readers can
 * hold it), so it can go straight back to the slab cache.
 */
static void node_free_immediate(struct net *net, struct fib6_node *fn)
{
	kmem_cache_free(fib6_node_kmem, fn);
	net->ipv6.rt6_stats->fib_nodes--;
}
200
/* Free a node that was visible to RCU readers: defer the actual free to
 * kfree_rcu() so concurrent lockless lookups stay safe.
 */
static void node_free(struct net *net, struct fib6_node *fn)
{
	kfree_rcu(fn, rcu);
	net->ipv6.rt6_stats->fib_nodes--;
}
206
/* Tear down a FIB table: invalidate its inetpeer cache, then free it. */
static void fib6_free_table(struct fib6_table *table)
{
	inetpeer_invalidate_tree(&table->tb6_peers);
	kfree(table);
}
212
/* Publish table @tb in the per-netns hash of FIB tables, keyed by table
 * id.  Also initializes the table lock here so lockdep gets one key.
 */
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
	unsigned int h;

	/*
	 * Initialize table lock at a single place to give lockdep a key,
	 * tables aren't visible prior to being linked to the list.
	 */
	spin_lock_init(&tb->tb6_lock);
	h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);

	/*
	 * No protection necessary, this is the only list mutatation
	 * operation, tables never disappear once they exist.
	 */
	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
}
230
231#ifdef CONFIG_IPV6_MULTIPLE_TABLES
232
/* Allocate and minimally initialize a new FIB table with the given @id.
 * The root node initially points at the netns null entry and is marked
 * as both tree root and info-bearing.  Returns NULL on OOM.
 */
static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
{
	struct fib6_table *table;

	table = kzalloc(sizeof(*table), GFP_ATOMIC);
	if (table) {
		table->tb6_id = id;
		rcu_assign_pointer(table->tb6_root.leaf,
				   net->ipv6.fib6_null_entry);
		table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
		inet_peer_base_init(&table->tb6_peers);
		INIT_HLIST_HEAD(&table->tb6_gc_hlist);
	}

	return table;
}
249
/* Look up table @id, creating it if it does not exist yet.  Id 0 is an
 * alias for the main table.  Returns the table, or NULL on OOM.
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
	struct fib6_table *tb, *new_tb;

	if (id == 0)
		id = RT6_TABLE_MAIN;

	/* fast path: table already exists */
	tb = fib6_get_table(net, id);
	if (tb)
		return tb;

	new_tb = fib6_alloc_table(net, id);
	if (!new_tb)
		return NULL;

	spin_lock_bh(&net->ipv6.fib_table_hash_lock);

	/* re-check under the lock: someone may have created it meanwhile */
	tb = fib6_get_table(net, id);
	if (unlikely(tb)) {
		spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
		/* new_tb was never published, plain kfree is safe */
		kfree(new_tb);
		return tb;
	}

	fib6_link_table(net, new_tb);

	spin_unlock_bh(&net->ipv6.fib_table_hash_lock);

	return new_tb;
}
EXPORT_SYMBOL_GPL(fib6_new_table);
281
/* Find the FIB table with the given @id (0 maps to the main table).
 * Returns NULL if no such table exists.  Safe to call without RCU:
 * tables are never removed once linked.
 */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
	struct hlist_head *head;
	struct fib6_table *tb;

	if (!id)
		id = RT6_TABLE_MAIN;

	head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];

	/* See comment in fib6_link_table(). RCU is not required,
	 * but rcu_dereference_raw() is used to avoid data-race.
	 */
	hlist_for_each_entry_rcu(tb, head, tb6_hlist, true)
		if (tb->tb6_id == id)
			return tb;

	return NULL;
}
EXPORT_SYMBOL_GPL(fib6_get_table);
302
/* Per-netns init (multi-table build): publish the preallocated main and
 * local tables in the table hash.
 */
static void __net_init fib6_tables_init(struct net *net)
{
	fib6_link_table(net, net->ipv6.fib6_main_tbl);
	fib6_link_table(net, net->ipv6.fib6_local_tbl);
}
308#else
309
/* Single-table build: table "creation" degenerates to a lookup, which
 * always yields the main table.
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
	return fib6_get_table(net, id);
}
314
/* Single-table build: every id resolves to the main table. */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
	return net->ipv6.fib6_main_tbl;
}
319
/* Single-table build: policy lookup goes straight to the main table.
 * On -EAGAIN (no match) the result is replaced with the netns null
 * entry; a reference is taken unless the caller asked for a
 * reference-free (RCU-protected) result via RT6_LOOKUP_F_DST_NOREF.
 */
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb,
				   int flags, pol_lookup_t lookup)
{
	struct rt6_info *rt;

	rt = pol_lookup_func(lookup,
			net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
	if (rt->dst.error == -EAGAIN) {
		/* drop the miss result, hand back the null entry instead */
		ip6_rt_put_flags(rt, flags);
		rt = net->ipv6.ip6_null_entry;
		if (!(flags & RT6_LOOKUP_F_DST_NOREF))
			dst_hold(&rt->dst);
	}

	return &rt->dst;
}
337
338/* called with rcu lock held; no reference taken on fib6_info */
/* called with rcu lock held; no reference taken on fib6_info */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
		struct fib6_result *res, int flags)
{
	/* single-table build: always consult the main table */
	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
				 res, flags);
}
345
/* Per-netns init (single-table build): only the main table exists. */
static void __net_init fib6_tables_init(struct net *net)
{
	fib6_link_table(net, net->ipv6.fib6_main_tbl);
}
350
351#endif
352
353unsigned int fib6_tables_seq_read(const struct net *net)
354{
355 unsigned int h, fib_seq = 0;
356
357 rcu_read_lock();
358 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
359 const struct hlist_head *head = &net->ipv6.fib_table_hash[h];
360 const struct fib6_table *tb;
361
362 hlist_for_each_entry_rcu(tb, head, tb6_hlist)
363 fib_seq += READ_ONCE(tb->fib_seq);
364 }
365 rcu_read_unlock();
366
367 return fib_seq;
368}
369
/* Deliver a single-route FIB event to one notifier block (@nb), used
 * during dumps for a freshly registered listener.
 */
static int call_fib6_entry_notifier(struct notifier_block *nb,
				    enum fib_event_type event_type,
				    struct fib6_info *rt,
				    struct netlink_ext_ack *extack)
{
	struct fib6_entry_notifier_info info = {
		.info.extack = extack,
		.rt = rt,
	};

	return call_fib6_notifier(nb, event_type, &info.info);
}
382
/* Like call_fib6_entry_notifier() but for a multipath route: @nsiblings
 * conveys how many sibling routes accompany @rt.
 */
static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
					      enum fib_event_type event_type,
					      struct fib6_info *rt,
					      unsigned int nsiblings,
					      struct netlink_ext_ack *extack)
{
	struct fib6_entry_notifier_info info = {
		.info.extack = extack,
		.rt = rt,
		.nsiblings = nsiblings,
	};

	return call_fib6_notifier(nb, event_type, &info.info);
}
397
/* Broadcast a single-route FIB event to all registered notifiers and
 * bump the owning table's fib_seq so dump consumers can detect the
 * change.  fib_seq is written with WRITE_ONCE to pair with the lockless
 * READ_ONCE in fib6_tables_seq_read().
 */
int call_fib6_entry_notifiers(struct net *net,
			      enum fib_event_type event_type,
			      struct fib6_info *rt,
			      struct netlink_ext_ack *extack)
{
	struct fib6_entry_notifier_info info = {
		.info.extack = extack,
		.rt = rt,
	};

	WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
	return call_fib6_notifiers(net, event_type, &info.info);
}
411
/* Multipath variant of call_fib6_entry_notifiers(); @nsiblings is the
 * number of sibling routes attached to @rt.  Also bumps fib_seq.
 */
int call_fib6_multipath_entry_notifiers(struct net *net,
					enum fib_event_type event_type,
					struct fib6_info *rt,
					unsigned int nsiblings,
					struct netlink_ext_ack *extack)
{
	struct fib6_entry_notifier_info info = {
		.info.extack = extack,
		.rt = rt,
		.nsiblings = nsiblings,
	};

	WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
	return call_fib6_notifiers(net, event_type, &info.info);
}
427
/* Broadcast an ENTRY_REPLACE event for @rt (sibling count taken from the
 * route itself) and bump the table's fib_seq.
 */
int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
{
	struct fib6_entry_notifier_info info = {
		.rt = rt,
		.nsiblings = rt->fib6_nsiblings,
	};

	WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
	return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
}
438
/* Cookie threaded through a notifier dump walk (fib6_tables_dump). */
struct fib6_dump_arg {
	struct net *net;		/* netns being dumped */
	struct notifier_block *nb;	/* listener receiving the replay */
	struct netlink_ext_ack *extack;	/* for error reporting */
};
444
445static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
446{
447 enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
448 unsigned int nsiblings;
449 int err;
450
451 if (!rt || rt == arg->net->ipv6.fib6_null_entry)
452 return 0;
453
454 nsiblings = READ_ONCE(rt->fib6_nsiblings);
455 if (nsiblings)
456 err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
457 rt,
458 nsiblings,
459 arg->extack);
460 else
461 err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
462 arg->extack);
463
464 return err;
465}
466
/* Walker callback for notifier dumps: replay the node's leaf route and
 * clear w->leaf so the walk advances to the next node.
 */
static int fib6_node_dump(struct fib6_walker *w)
{
	int err;

	err = fib6_rt_dump(w->leaf, w->args);
	w->leaf = NULL;
	return err;
}
475
/* Walk one table under its tb6_lock, replaying every route via the
 * walker's func (fib6_node_dump).  Returns the walk result.
 */
static int fib6_table_dump(struct net *net, struct fib6_table *tb,
			   struct fib6_walker *w)
{
	int err;

	w->root = &tb->tb6_root;
	spin_lock_bh(&tb->tb6_lock);
	err = fib6_walk(net, w);
	spin_unlock_bh(&tb->tb6_lock);
	return err;
}
487
488/* Called with rcu_read_lock() */
/* Called with rcu_read_lock() */
/* Replay the contents of every FIB table in @net to a newly registered
 * notifier block.  Allocates a temporary walker, iterates all hash
 * buckets, and stops at the first error.  Returns 0 or a negative errno.
 */
int fib6_tables_dump(struct net *net, struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	struct fib6_dump_arg arg;
	struct fib6_walker *w;
	unsigned int h;
	int err = 0;

	w = kzalloc(sizeof(*w), GFP_ATOMIC);
	if (!w)
		return -ENOMEM;

	w->func = fib6_node_dump;
	arg.net = net;
	arg.nb = nb;
	arg.extack = extack;
	w->args = &arg;

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
		struct fib6_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
			err = fib6_table_dump(net, tb, w);
			if (err)
				goto out;
		}
	}

out:
	kfree(w);

	/* The tree traversal function should never return a positive value. */
	return err > 0 ? -EINVAL : err;
}
524
/* Walker callback for netlink route dumps.  Emits every route in the
 * current node via rt6_dump_route(); returns 1 (suspend the walk) when
 * the skb fills up, 0 when the node is fully dumped.
 */
static int fib6_dump_node(struct fib6_walker *w)
{
	int res;
	struct fib6_info *rt;

	for_each_fib6_walker_rt(w) {
		res = rt6_dump_route(rt, w->args, w->skip_in_node);
		if (res >= 0) {
			/* Frame is full, suspend walking */
			w->leaf = rt;

			/* We'll restart from this node, so if some routes were
			 * already dumped, skip them next time.
			 */
			w->skip_in_node += res;

			return 1;
		}
		w->skip_in_node = 0;

		/* Multipath routes are dumped in one route with the
		 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
		 * last sibling of this route (no need to dump the
		 * sibling routes again)
		 */
		if (rt->fib6_nsiblings)
			rt = list_last_entry(&rt->fib6_siblings,
					     struct fib6_info,
					     fib6_siblings);
	}
	w->leaf = NULL;
	return 0;
}
558
/* Tear down the state a route dump stashed in the netlink callback:
 * cb->args[2] = walker pointer, cb->args[3] = saved done handler,
 * cb->args[4] = "walk is suspended" flag.  Restores cb->done and marks
 * the dump finished (cb->args[1] = 3; NOTE(review): sentinel value,
 * meaning defined by the dump protocol in inet6_dump_fib).
 */
static void fib6_dump_end(struct netlink_callback *cb)
{
	struct net *net = sock_net(cb->skb->sk);
	struct fib6_walker *w = (void *)cb->args[2];

	if (w) {
		if (cb->args[4]) {
			/* walk was suspended: detach it before freeing */
			cb->args[4] = 0;
			fib6_walker_unlink(net, w);
		}
		cb->args[2] = 0;
		kfree(w);
	}
	cb->done = (void *)cb->args[3];
	cb->args[1] = 3;
}
575
/* Netlink ->done hook installed by inet6_dump_fib(): clean up, then
 * chain to the original done handler if one was saved.
 */
static int fib6_dump_done(struct netlink_callback *cb)
{
	fib6_dump_end(cb);
	return cb->done ? cb->done(cb) : 0;
}
581
/* Dump one table into @skb.  cb->args[4] tells whether a previous pass
 * suspended a walk in this table; cb->args[5] caches the root sernum so
 * a tree change between passes forces a restart from the root (with
 * w->skip set to avoid re-emitting already-dumped routes).
 * Returns >0 if the skb filled up (walk suspended), <=0 when done.
 */
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
			   struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct fib6_walker *w;
	int res;

	w = (void *)cb->args[2];
	w->root = &table->tb6_root;

	if (cb->args[4] == 0) {
		/* fresh walk of this table */
		w->count = 0;
		w->skip = 0;
		w->skip_in_node = 0;

		spin_lock_bh(&table->tb6_lock);
		res = fib6_walk(net, w);
		spin_unlock_bh(&table->tb6_lock);
		if (res > 0) {
			/* suspended: remember we must resume here */
			cb->args[4] = 1;
			cb->args[5] = READ_ONCE(w->root->fn_sernum);
		}
	} else {
		int sernum = READ_ONCE(w->root->fn_sernum);
		if (cb->args[5] != sernum) {
			/* Begin at the root if the tree changed */
			cb->args[5] = sernum;
			w->state = FWS_INIT;
			w->node = w->root;
			w->skip = w->count;
			w->skip_in_node = 0;
		} else
			w->skip = 0;

		spin_lock_bh(&table->tb6_lock);
		res = fib6_walk_continue(w);
		spin_unlock_bh(&table->tb6_lock);
		if (res <= 0) {
			/* walk finished (or failed): detach the walker */
			fib6_walker_unlink(net, w);
			cb->args[4] = 0;
		}
	}

	return res;
}
627
/* RTM_GETROUTE dump handler.  Netlink callback slots used:
 *   args[0]/args[1] = hash bucket / table index to resume from,
 *   args[2] = walker, args[3] = saved cb->done, args[4]/args[5] = per-
 *   table suspension state (see fib6_dump_table()).
 * Runs under RCU; returns the usual netlink dump semantics (skb length
 * while more data remains, <=0 when finished or on error).
 */
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rt6_rtnl_dump_arg arg = {
		.filter.dump_exceptions = true,
		.filter.dump_routes = true,
		.filter.rtnl_held = false,
	};
	const struct nlmsghdr *nlh = cb->nlh;
	struct net *net = sock_net(skb->sk);
	unsigned int e = 0, s_e;
	struct hlist_head *head;
	struct fib6_walker *w;
	struct fib6_table *tb;
	unsigned int h, s_h;
	int err = 0;

	rcu_read_lock();
	if (cb->strict_check) {
		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
		if (err < 0)
			goto unlock;
	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(nlh);

		if (rtm->rtm_flags & RTM_F_PREFIX)
			arg.filter.flags = RTM_F_PREFIX;
	}

	w = (void *)cb->args[2];
	if (!w) {
		/* New dump:
		 *
		 * 1. allocate and initialize walker.
		 */
		w = kzalloc(sizeof(*w), GFP_ATOMIC);
		if (!w) {
			err = -ENOMEM;
			goto unlock;
		}
		w->func = fib6_dump_node;
		cb->args[2] = (long)w;

		/* 2. hook callback destructor.
		 */
		cb->args[3] = (long)cb->done;
		cb->done = fib6_dump_done;

	}

	arg.skb = skb;
	arg.cb = cb;
	arg.net = net;
	w->args = &arg;

	if (arg.filter.table_id) {
		/* caller asked for a single specific table */
		tb = fib6_get_table(net, arg.filter.table_id);
		if (!tb) {
			/* non-IPv6 families may legitimately miss */
			if (rtnl_msg_family(cb->nlh) != PF_INET6)
				goto unlock;

			NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
			err = -ENOENT;
			goto unlock;
		}

		if (!cb->args[0]) {
			err = fib6_dump_table(tb, skb, cb);
			if (!err)
				cb->args[0] = 1;	/* table fully dumped */
		}
		goto unlock;
	}

	/* dump all tables, resuming at bucket s_h / entry s_e */
	s_h = cb->args[0];
	s_e = cb->args[1];

	for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
		e = 0;
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
			if (e < s_e)
				goto next;
			err = fib6_dump_table(tb, skb, cb);
			if (err != 0)
				goto out;
next:
			e++;
		}
	}
out:
	/* remember resume position for the next invocation */
	cb->args[1] = e;
	cb->args[0] = h;

unlock:
	rcu_read_unlock();
	if (err <= 0)
		fib6_dump_end(cb);
	return err;
}
727
/* Set route metric @metric (1-based RTAX_* index) on @f6i to @val.
 * If the route still shares the global default metrics block, a private
 * zeroed copy is allocated first (copy-on-write); on allocation failure
 * the update is silently dropped.
 */
void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
{
	if (!f6i)
		return;

	if (f6i->fib6_metrics == &dst_default_metrics) {
		struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);

		if (!p)
			return;

		refcount_set(&p->refcnt, 1);
		f6i->fib6_metrics = p;
	}

	/* metrics[] is 0-based while RTAX_* constants are 1-based */
	f6i->fib6_metrics->metrics[metric - 1] = val;
}
745
746/*
747 * Routing Table
748 *
749 * return the appropriate node for a routing tree "add" operation
750 * by either creating and inserting or by returning an existing
751 * node.
752 */
753
/* Descend the radix tree from @root and return the node where a route
 * with prefix @addr/@plen belongs, creating leaf and/or intermediate
 * nodes as needed.  @offset locates the rt6key inside a fib6_info (so
 * the same code serves destination and source trees).  Caller holds
 * tb6_lock.  Returns the node, or an ERR_PTR on failure (-ENOENT when
 * replace was required but no match exists, -ENOMEM on allocation
 * failure).
 */
static struct fib6_node *fib6_add_1(struct net *net,
				    struct fib6_table *table,
				    struct fib6_node *root,
				    struct in6_addr *addr, int plen,
				    int offset, int allow_create,
				    int replace_required,
				    struct netlink_ext_ack *extack)
{
	struct fib6_node *fn, *in, *ln;
	struct fib6_node *pn = NULL;
	struct rt6key *key;
	int bit;
	__be32 dir = 0;

	/* insert node in tree */

	fn = root;

	do {
		struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
					    lockdep_is_held(&table->tb6_lock));
		key = (struct rt6key *)((u8 *)leaf + offset);

		/*
		 *	Prefix match
		 */
		if (plen < fn->fn_bit ||
		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
			/* new prefix diverges above this node: we must
			 * splice an intermediate node in
			 */
			if (!allow_create) {
				if (replace_required) {
					NL_SET_ERR_MSG(extack,
						       "Can not replace route - no match found");
					pr_warn("Can't replace route, no match found\n");
					return ERR_PTR(-ENOENT);
				}
				pr_warn("NLM_F_CREATE should be set when creating new route\n");
			}
			goto insert_above;
		}

		/*
		 *	Exact match ?
		 */

		if (plen == fn->fn_bit) {
			/* clean up an intermediate node */
			if (!(fn->fn_flags & RTN_RTINFO)) {
				RCU_INIT_POINTER(fn->leaf, NULL);
				fib6_info_release(leaf);
			/* remove null_entry in the root node */
			} else if (fn->fn_flags & RTN_TL_ROOT &&
				   rcu_access_pointer(fn->leaf) ==
				   net->ipv6.fib6_null_entry) {
				RCU_INIT_POINTER(fn->leaf, NULL);
			}

			return fn;
		}

		/*
		 *	We have more bits to go
		 */

		/* Try to walk down on tree. */
		dir = addr_bit_set(addr, fn->fn_bit);
		pn = fn;
		fn = dir ?
		     rcu_dereference_protected(fn->right,
					lockdep_is_held(&table->tb6_lock)) :
		     rcu_dereference_protected(fn->left,
					lockdep_is_held(&table->tb6_lock));
	} while (fn);

	if (!allow_create) {
		/* We should not create new node because
		 * NLM_F_REPLACE was specified without NLM_F_CREATE
		 * I assume it is safe to require NLM_F_CREATE when
		 * REPLACE flag is used! Later we may want to remove the
		 * check for replace_required, because according
		 * to netlink specification, NLM_F_CREATE
		 * MUST be specified if new route is created.
		 * That would keep IPv6 consistent with IPv4
		 */
		if (replace_required) {
			NL_SET_ERR_MSG(extack,
				       "Can not replace route - no match found");
			pr_warn("Can't replace route, no match found\n");
			return ERR_PTR(-ENOENT);
		}
		pr_warn("NLM_F_CREATE should be set when creating new route\n");
	}
	/*
	 *	We walked to the bottom of tree.
	 *	Create new leaf node without children.
	 */

	ln = node_alloc(net);

	if (!ln)
		return ERR_PTR(-ENOMEM);
	ln->fn_bit = plen;
	RCU_INIT_POINTER(ln->parent, pn);

	/* attach on the side we fell off of (dir from the last compare) */
	if (dir)
		rcu_assign_pointer(pn->right, ln);
	else
		rcu_assign_pointer(pn->left, ln);

	return ln;


insert_above:
	/*
	 * split since we don't have a common prefix anymore or
	 * we have a less significant route.
	 * we've to insert an intermediate node on the list
	 * this new node will point to the one we need to create
	 * and the current
	 */

	pn = rcu_dereference_protected(fn->parent,
				       lockdep_is_held(&table->tb6_lock));

	/* find 1st bit in difference between the 2 addrs.

	   See comment in __ipv6_addr_diff: bit may be an invalid value,
	   but if it is >= plen, the value is ignored in any case.
	 */

	bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));

	/*
	 *		(intermediate)[in]
	 *	          /	   \
	 *	(new leaf node)[ln] (old node)[fn]
	 */
	if (plen > bit) {
		in = node_alloc(net);
		ln = node_alloc(net);

		if (!in || !ln) {
			/* neither node was published; immediate free is safe */
			if (in)
				node_free_immediate(net, in);
			if (ln)
				node_free_immediate(net, ln);
			return ERR_PTR(-ENOMEM);
		}

		/*
		 * new intermediate node.
		 * RTN_RTINFO will
		 * be off since that an address that chooses one of
		 * the branches would not match less specific routes
		 * in the other branch
		 */

		in->fn_bit = bit;

		RCU_INIT_POINTER(in->parent, pn);
		in->leaf = fn->leaf;
		/* the intermediate node keeps a dummy reference to the leaf */
		fib6_info_hold(rcu_dereference_protected(in->leaf,
				lockdep_is_held(&table->tb6_lock)));

		/* update parent pointer */
		if (dir)
			rcu_assign_pointer(pn->right, in);
		else
			rcu_assign_pointer(pn->left, in);

		ln->fn_bit = plen;

		RCU_INIT_POINTER(ln->parent, in);
		rcu_assign_pointer(fn->parent, in);

		/* the differing bit decides which child goes left/right */
		if (addr_bit_set(addr, bit)) {
			rcu_assign_pointer(in->right, ln);
			rcu_assign_pointer(in->left, fn);
		} else {
			rcu_assign_pointer(in->left, ln);
			rcu_assign_pointer(in->right, fn);
		}
	} else { /* plen <= bit */

		/*
		 *		(new leaf node)[ln]
		 *	          /	   \
		 *	     (old node)[fn] NULL
		 */

		ln = node_alloc(net);

		if (!ln)
			return ERR_PTR(-ENOMEM);

		ln->fn_bit = plen;

		RCU_INIT_POINTER(ln->parent, pn);

		if (addr_bit_set(&key->addr, plen))
			RCU_INIT_POINTER(ln->right, fn);
		else
			RCU_INIT_POINTER(ln->left, fn);

		rcu_assign_pointer(fn->parent, ln);

		if (dir)
			rcu_assign_pointer(pn->right, ln);
		else
			rcu_assign_pointer(pn->left, ln);
	}
	return ln;
}
966
/* Drop the 'from' back-reference that cached per-cpu routes of @fib6_nh
 * hold on @match, releasing one fib6_info reference per matching cached
 * route.  Runs under RCU to keep the pcpu rt6_info objects alive while
 * we inspect them.
 */
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
				  const struct fib6_info *match)
{
	int cpu;

	if (!fib6_nh->rt6i_pcpu)
		return;

	rcu_read_lock();
	/* release the reference to this fib entry from
	 * all of its cached pcpu routes
	 */
	for_each_possible_cpu(cpu) {
		struct rt6_info **ppcpu_rt;
		struct rt6_info *pcpu_rt;

		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);

		/* Paired with xchg() in rt6_get_pcpu_route() */
		pcpu_rt = READ_ONCE(*ppcpu_rt);

		/* only dropping the 'from' reference if the cached route
		 * is using 'match'. The cached pcpu_rt->from only changes
		 * from a fib6_info to NULL (ip6_dst_destroy); it can never
		 * change from one fib6_info reference to another
		 */
		if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
			struct fib6_info *from;

			/* xchg() wins any race against ip6_dst_destroy() */
			from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
			fib6_info_release(from);
		}
	}
	rcu_read_unlock();
}
1002
/* nexthop_for_each_fib6_nh() adapter: forward to __fib6_drop_pcpu_from()
 * with @_arg as the fib6_info being torn down.  Always returns 0 so the
 * iteration covers every nexthop.
 */
static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
{
	struct fib6_info *arg = _arg;

	__fib6_drop_pcpu_from(nh, arg);
	return 0;
}
1010
/* Purge all per-cpu cached routes referencing @f6i, for both the shared
 * nexthop object case (f6i->nh) and the embedded fib6_nh case.  Sets
 * fib6_destroying first, with a full barrier, so racing CPUs stop
 * installing new pcpu routes for this entry.
 */
static void fib6_drop_pcpu_from(struct fib6_info *f6i)
{
	/* Make sure rt6_make_pcpu_route() wont add other percpu routes
	 * while we are cleaning them here.
	 */
	f6i->fib6_destroying = 1;
	mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */

	if (f6i->nh) {
		/* shared nexthop: walk all of its fib6_nh entries */
		rcu_read_lock();
		nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i);
		rcu_read_unlock();
	} else {
		struct fib6_nh *fib6_nh;

		fib6_nh = f6i->fib6_nh;
		__fib6_drop_pcpu_from(fib6_nh, f6i);
	}
}
1030
/* Disconnect @rt from everything that references it besides the caller:
 * flush its exception cache and pcpu routes, detach it from a shared
 * nexthop, and replace any dummy leaf references to it held by ancestor
 * split nodes.  Caller holds tb6_lock (see lockdep assertions below).
 */
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
			  struct net *net)
{
	struct fib6_table *table = rt->fib6_table;

	/* Flush all cached dst in exception table */
	rt6_flush_exceptions(rt);
	fib6_drop_pcpu_from(rt);

	if (rt->nh) {
		spin_lock(&rt->nh->lock);

		if (!list_empty(&rt->nh_list))
			list_del_init(&rt->nh_list);

		spin_unlock(&rt->nh->lock);
	}

	if (refcount_read(&rt->fib6_ref) != 1) {
		/* This route is used as dummy address holder in some split
		 * nodes. It is not leaked, but it still holds other resources,
		 * which must be released in time. So, scan ascendant nodes
		 * and replace dummy references to this route with references
		 * to still alive ones.
		 */
		while (fn) {
			struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
					    lockdep_is_held(&table->tb6_lock));
			struct fib6_info *new_leaf;
			if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
				new_leaf = fib6_find_prefix(net, table, fn);
				fib6_info_hold(new_leaf);

				rcu_assign_pointer(fn->leaf, new_leaf);
				fib6_info_release(rt);
			}
			fn = rcu_dereference_protected(fn->parent,
				    lockdep_is_held(&table->tb6_lock));
		}
	}

	/* stop expiry handling; route is leaving the table */
	fib6_clean_expires(rt);
	fib6_remove_gc_list(rt);
}
1075
1076/*
1077 * Insert routing information in a node.
1078 */
1079
1080static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
1081 struct nl_info *info, struct netlink_ext_ack *extack,
1082 struct list_head *purge_list)
1083{
1084 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
1085 lockdep_is_held(&rt->fib6_table->tb6_lock));
1086 struct fib6_info *iter = NULL;
1087 struct fib6_info __rcu **ins;
1088 struct fib6_info __rcu **fallback_ins = NULL;
1089 int replace = (info->nlh &&
1090 (info->nlh->nlmsg_flags & NLM_F_REPLACE));
1091 int add = (!info->nlh ||
1092 (info->nlh->nlmsg_flags & NLM_F_CREATE));
1093 int found = 0;
1094 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
1095 bool notify_sibling_rt = false;
1096 u16 nlflags = NLM_F_EXCL;
1097 int err;
1098
1099 if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
1100 nlflags |= NLM_F_APPEND;
1101
1102 ins = &fn->leaf;
1103
1104 for (iter = leaf; iter;
1105 iter = rcu_dereference_protected(iter->fib6_next,
1106 lockdep_is_held(&rt->fib6_table->tb6_lock))) {
1107 /*
1108 * Search for duplicates
1109 */
1110
1111 if (iter->fib6_metric == rt->fib6_metric) {
1112 /*
1113 * Same priority level
1114 */
1115 if (info->nlh &&
1116 (info->nlh->nlmsg_flags & NLM_F_EXCL))
1117 return -EEXIST;
1118
1119 nlflags &= ~NLM_F_EXCL;
1120 if (replace) {
1121 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
1122 found++;
1123 break;
1124 }
1125 fallback_ins = fallback_ins ?: ins;
1126 goto next_iter;
1127 }
1128
1129 if (rt6_duplicate_nexthop(iter, rt)) {
1130 if (rt->fib6_nsiblings)
1131 WRITE_ONCE(rt->fib6_nsiblings, 0);
1132 if (!(iter->fib6_flags & RTF_EXPIRES))
1133 return -EEXIST;
1134 if (!(rt->fib6_flags & RTF_EXPIRES)) {
1135 fib6_clean_expires(iter);
1136 fib6_remove_gc_list(iter);
1137 } else {
1138 fib6_set_expires(iter, rt->expires);
1139 fib6_add_gc_list(iter);
1140 }
1141 if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT))) {
1142 iter->fib6_flags &= ~RTF_ADDRCONF;
1143 iter->fib6_flags &= ~RTF_PREFIX_RT;
1144 }
1145
1146 if (rt->fib6_pmtu)
1147 fib6_metric_set(iter, RTAX_MTU,
1148 rt->fib6_pmtu);
1149 return -EEXIST;
1150 }
1151 /* If we have the same destination and the same metric,
1152 * but not the same gateway, then the route we try to
1153 * add is sibling to this route, increment our counter
1154 * of siblings, and later we will add our route to the
1155 * list.
1156 * Only static routes (which don't have flag
1157 * RTF_EXPIRES) are used for ECMPv6.
1158 *
1159 * To avoid long list, we only had siblings if the
1160 * route have a gateway.
1161 */
1162 if (rt_can_ecmp &&
1163 rt6_qualify_for_ecmp(iter))
1164 WRITE_ONCE(rt->fib6_nsiblings,
1165 rt->fib6_nsiblings + 1);
1166 }
1167
1168 if (iter->fib6_metric > rt->fib6_metric)
1169 break;
1170
1171next_iter:
1172 ins = &iter->fib6_next;
1173 }
1174
1175 if (fallback_ins && !found) {
1176 /* No matching route with same ecmp-able-ness found, replace
1177 * first matching route
1178 */
1179 ins = fallback_ins;
1180 iter = rcu_dereference_protected(*ins,
1181 lockdep_is_held(&rt->fib6_table->tb6_lock));
1182 found++;
1183 }
1184
1185 /* Reset round-robin state, if necessary */
1186 if (ins == &fn->leaf)
1187 fn->rr_ptr = NULL;
1188
1189 /* Link this route to others same route. */
1190 if (rt->fib6_nsiblings) {
1191 unsigned int fib6_nsiblings;
1192 struct fib6_info *sibling, *temp_sibling;
1193
1194 /* Find the first route that have the same metric */
1195 sibling = leaf;
1196 notify_sibling_rt = true;
1197 while (sibling) {
1198 if (sibling->fib6_metric == rt->fib6_metric &&
1199 rt6_qualify_for_ecmp(sibling)) {
1200 list_add_tail_rcu(&rt->fib6_siblings,
1201 &sibling->fib6_siblings);
1202 break;
1203 }
1204 sibling = rcu_dereference_protected(sibling->fib6_next,
1205 lockdep_is_held(&rt->fib6_table->tb6_lock));
1206 notify_sibling_rt = false;
1207 }
1208 /* For each sibling in the list, increment the counter of
1209 * siblings. BUG() if counters does not match, list of siblings
1210 * is broken!
1211 */
1212 fib6_nsiblings = 0;
1213 list_for_each_entry_safe(sibling, temp_sibling,
1214 &rt->fib6_siblings, fib6_siblings) {
1215 WRITE_ONCE(sibling->fib6_nsiblings,
1216 sibling->fib6_nsiblings + 1);
1217 BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
1218 fib6_nsiblings++;
1219 }
1220 BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
1221 rcu_read_lock();
1222 rt6_multipath_rebalance(temp_sibling);
1223 rcu_read_unlock();
1224 }
1225
1226 /*
1227 * insert node
1228 */
1229 if (!replace) {
1230 if (!add)
1231 pr_warn("NLM_F_CREATE should be set when creating new route\n");
1232
1233add:
1234 nlflags |= NLM_F_CREATE;
1235
1236 /* The route should only be notified if it is the first
1237 * route in the node or if it is added as a sibling
1238 * route to the first route in the node.
1239 */
1240 if (!info->skip_notify_kernel &&
1241 (notify_sibling_rt || ins == &fn->leaf)) {
1242 enum fib_event_type fib_event;
1243
1244 if (notify_sibling_rt)
1245 fib_event = FIB_EVENT_ENTRY_APPEND;
1246 else
1247 fib_event = FIB_EVENT_ENTRY_REPLACE;
1248 err = call_fib6_entry_notifiers(info->nl_net,
1249 fib_event, rt,
1250 extack);
1251 if (err) {
1252 struct fib6_info *sibling, *next_sibling;
1253
1254 /* If the route has siblings, then it first
1255 * needs to be unlinked from them.
1256 */
1257 if (!rt->fib6_nsiblings)
1258 return err;
1259
1260 list_for_each_entry_safe(sibling, next_sibling,
1261 &rt->fib6_siblings,
1262 fib6_siblings)
1263 WRITE_ONCE(sibling->fib6_nsiblings,
1264 sibling->fib6_nsiblings - 1);
1265 WRITE_ONCE(rt->fib6_nsiblings, 0);
1266 list_del_rcu(&rt->fib6_siblings);
1267 rcu_read_lock();
1268 rt6_multipath_rebalance(next_sibling);
1269 rcu_read_unlock();
1270 return err;
1271 }
1272 }
1273
1274 rcu_assign_pointer(rt->fib6_next, iter);
1275 fib6_info_hold(rt);
1276 rcu_assign_pointer(rt->fib6_node, fn);
1277 rcu_assign_pointer(*ins, rt);
1278 if (!info->skip_notify)
1279 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
1280 info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
1281
1282 if (!(fn->fn_flags & RTN_RTINFO)) {
1283 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1284 fn->fn_flags |= RTN_RTINFO;
1285 }
1286
1287 } else {
1288 int nsiblings;
1289
1290 if (!found) {
1291 if (add)
1292 goto add;
1293 pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
1294 return -ENOENT;
1295 }
1296
1297 if (!info->skip_notify_kernel && ins == &fn->leaf) {
1298 err = call_fib6_entry_notifiers(info->nl_net,
1299 FIB_EVENT_ENTRY_REPLACE,
1300 rt, extack);
1301 if (err)
1302 return err;
1303 }
1304
1305 fib6_info_hold(rt);
1306 rcu_assign_pointer(rt->fib6_node, fn);
1307 rt->fib6_next = iter->fib6_next;
1308 rcu_assign_pointer(*ins, rt);
1309 if (!info->skip_notify)
1310 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
1311 if (!(fn->fn_flags & RTN_RTINFO)) {
1312 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1313 fn->fn_flags |= RTN_RTINFO;
1314 }
1315 nsiblings = iter->fib6_nsiblings;
1316 iter->fib6_node = NULL;
1317 list_add(&iter->purge_link, purge_list);
1318 if (rcu_access_pointer(fn->rr_ptr) == iter)
1319 fn->rr_ptr = NULL;
1320
1321 if (nsiblings) {
1322 /* Replacing an ECMP route, remove all siblings */
1323 ins = &rt->fib6_next;
1324 iter = rcu_dereference_protected(*ins,
1325 lockdep_is_held(&rt->fib6_table->tb6_lock));
1326 while (iter) {
1327 if (iter->fib6_metric > rt->fib6_metric)
1328 break;
1329 if (rt6_qualify_for_ecmp(iter)) {
1330 *ins = iter->fib6_next;
1331 iter->fib6_node = NULL;
1332 list_add(&iter->purge_link, purge_list);
1333 if (rcu_access_pointer(fn->rr_ptr) == iter)
1334 fn->rr_ptr = NULL;
1335 nsiblings--;
1336 info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1337 } else {
1338 ins = &iter->fib6_next;
1339 }
1340 iter = rcu_dereference_protected(*ins,
1341 lockdep_is_held(&rt->fib6_table->tb6_lock));
1342 }
1343 WARN_ON(nsiblings != 0);
1344 }
1345 }
1346
1347 return 0;
1348}
1349
1350static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt,
1351 struct nl_info *info, struct netlink_ext_ack *extack,
1352 struct list_head *purge_list)
1353{
1354 int err;
1355
1356 spin_lock(&rt->nh->lock);
1357
1358 if (rt->nh->dead) {
1359 NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
1360 err = -EINVAL;
1361 } else {
1362 err = fib6_add_rt2node(fn, rt, info, extack, purge_list);
1363 if (!err)
1364 list_add(&rt->nh_list, &rt->nh->f6i_list);
1365 }
1366
1367 spin_unlock(&rt->nh->lock);
1368
1369 return err;
1370}
1371
1372static void fib6_start_gc(struct net *net, struct fib6_info *rt)
1373{
1374 if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
1375 (rt->fib6_flags & RTF_EXPIRES))
1376 mod_timer(&net->ipv6.ip6_fib_timer,
1377 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1378}
1379
1380void fib6_force_start_gc(struct net *net)
1381{
1382 if (!timer_pending(&net->ipv6.ip6_fib_timer))
1383 mod_timer(&net->ipv6.ip6_fib_timer,
1384 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1385}
1386
/* Write @sernum into every node on the path from rt's node up to the
 * tree root, invalidating cached dst entries along that path.
 * Needs rt->fib6_table->tb6_lock held.
 */
static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
					   int sernum)
{
	struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
				lockdep_is_held(&rt->fib6_table->tb6_lock));

	/* paired with smp_rmb() in fib6_get_cookie_safe() */
	smp_wmb();
	while (fn) {
		WRITE_ONCE(fn->fn_sernum, sernum);
		fn = rcu_dereference_protected(fn->parent,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}
}
1401
/* Allocate a fresh serial number and propagate it up to the root. */
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
{
	int sernum = fib6_new_sernum(net);

	__fib6_update_sernum_upto_root(rt, sernum);
}
1406
1407/* allow ipv4 to update sernum via ipv6_stub */
1408void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
1409{
1410 spin_lock_bh(&f6i->fib6_table->tb6_lock);
1411 fib6_update_sernum_upto_root(net, f6i);
1412 spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1413}
1414
1415/*
1416 * Add routing information to the routing tree.
1417 * <destination addr>/<source addr>
1418 * with source addr info in sub-trees
1419 * Need to own table->tb6_lock
1420 */
1421
int fib6_add(struct fib6_node *root, struct fib6_info *rt,
	     struct nl_info *info, struct netlink_ext_ack *extack)
{
	struct fib6_table *table = rt->fib6_table;
	LIST_HEAD(purge_list);
	struct fib6_node *fn;
#ifdef CONFIG_IPV6_SUBTREES
	struct fib6_node *pn = NULL;
#endif
	int err = -ENOMEM;
	int allow_create = 1;
	int replace_required = 0;

	if (info->nlh) {
		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
			allow_create = 0;
		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
			replace_required = 1;
	}
	if (!allow_create && !replace_required)
		pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");

	/* Find (or create) the tree node for the destination prefix. */
	fn = fib6_add_1(info->nl_net, table, root,
			&rt->fib6_dst.addr, rt->fib6_dst.plen,
			offsetof(struct fib6_info, fib6_dst), allow_create,
			replace_required, extack);
	if (IS_ERR(fn)) {
		err = PTR_ERR(fn);
		fn = NULL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	pn = fn;

	if (rt->fib6_src.plen) {
		struct fib6_node *sn;

		if (!rcu_access_pointer(fn->subtree)) {
			struct fib6_node *sfn;

			/*
			 * Create subtree.
			 *
			 *		fn[main tree]
			 *		|
			 *		sfn[subtree root]
			 *		   \
			 *		    sn[new leaf node]
			 */

			/* Create subtree root node */
			sfn = node_alloc(info->nl_net);
			if (!sfn)
				goto failure;

			fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
			rcu_assign_pointer(sfn->leaf,
					   info->nl_net->ipv6.fib6_null_entry);
			sfn->fn_flags = RTN_ROOT;

			/* Now add the first leaf node to new subtree */

			sn = fib6_add_1(info->nl_net, table, sfn,
					&rt->fib6_src.addr, rt->fib6_src.plen,
					offsetof(struct fib6_info, fib6_src),
					allow_create, replace_required, extack);

			if (IS_ERR(sn)) {
				/* If it is failed, discard just allocated
				   root, and then (in failure) stale node
				   in main tree.
				 */
				node_free_immediate(info->nl_net, sfn);
				err = PTR_ERR(sn);
				goto failure;
			}

			/* Now link new subtree to main tree */
			rcu_assign_pointer(sfn->parent, fn);
			rcu_assign_pointer(fn->subtree, sfn);
		} else {
			sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
					&rt->fib6_src.addr, rt->fib6_src.plen,
					offsetof(struct fib6_info, fib6_src),
					allow_create, replace_required, extack);

			if (IS_ERR(sn)) {
				err = PTR_ERR(sn);
				goto failure;
			}
		}

		if (!rcu_access_pointer(fn->leaf)) {
			if (fn->fn_flags & RTN_TL_ROOT) {
				/* put back null_entry for root node */
				rcu_assign_pointer(fn->leaf,
					    info->nl_net->ipv6.fib6_null_entry);
			} else {
				fib6_info_hold(rt);
				rcu_assign_pointer(fn->leaf, rt);
			}
		}
		fn = sn;
	}
#endif

	if (rt->nh)
		err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list);
	else
		err = fib6_add_rt2node(fn, rt, info, extack, &purge_list);
	if (!err) {
		struct fib6_info *iter, *next;

		/* Routes displaced by the insertion were collected on
		 * purge_list; purge their cached state and drop the
		 * tree's reference on each.
		 */
		list_for_each_entry_safe(iter, next, &purge_list, purge_link) {
			list_del(&iter->purge_link);
			fib6_purge_rt(iter, fn, info->nl_net);
			fib6_info_release(iter);
		}

		/* Invalidate cached dsts along the path to the root. */
		__fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));

		if (rt->fib6_flags & RTF_EXPIRES)
			fib6_add_gc_list(rt);

		fib6_start_gc(info->nl_net, rt);
	}

out:
	if (err) {
#ifdef CONFIG_IPV6_SUBTREES
		/*
		 * If fib6_add_1 has cleared the old leaf pointer in the
		 * super-tree leaf node we have to find a new one for it.
		 */
		if (pn != fn) {
			struct fib6_info *pn_leaf =
				rcu_dereference_protected(pn->leaf,
					lockdep_is_held(&table->tb6_lock));
			if (pn_leaf == rt) {
				pn_leaf = NULL;
				RCU_INIT_POINTER(pn->leaf, NULL);
				fib6_info_release(rt);
			}
			if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
				pn_leaf = fib6_find_prefix(info->nl_net, table,
							   pn);
				if (!pn_leaf)
					pn_leaf =
					    info->nl_net->ipv6.fib6_null_entry;
				fib6_info_hold(pn_leaf);
				rcu_assign_pointer(pn->leaf, pn_leaf);
			}
		}
#endif
		goto failure;
	} else if (fib6_requires_src(rt)) {
		fib6_routes_require_src_inc(info->nl_net);
	}
	return err;

failure:
	/* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
	 * 1. fn is an intermediate node and we failed to add the new
	 * route to it in both subtree creation failure and fib6_add_rt2node()
	 * failure case.
	 * 2. fn is the root node in the table and we fail to add the first
	 * default route to it.
	 */
	if (fn &&
	    (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
	     (fn->fn_flags & RTN_TL_ROOT &&
	      !rcu_access_pointer(fn->leaf))))
		fib6_repair_tree(info->nl_net, table, fn);
	return err;
}
1598
1599/*
1600 * Routing tree lookup
1601 *
1602 */
1603
/* One search key per tree level: the destination tree first, then the
 * source subtree when CONFIG_IPV6_SUBTREES is enabled. An entry with
 * offset == 0 acts as the array sentinel.
 */
struct lookup_args {
	int offset;		/* key offset on fib6_info */
	const struct in6_addr *addr;	/* search key */
};
1608
static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
					    struct lookup_args *args)
{
	struct fib6_node *fn;
	__be32 dir;

	/* offset == 0 marks the sentinel entry: no more keys to match. */
	if (unlikely(args->offset == 0))
		return NULL;

	/*
	 *	Descend on a tree
	 */

	fn = root;

	for (;;) {
		struct fib6_node *next;

		dir = addr_bit_set(args->addr, fn->fn_bit);

		next = dir ? rcu_dereference(fn->right) :
			     rcu_dereference(fn->left);

		if (next) {
			fn = next;
			continue;
		}
		break;
	}

	/* Walk back up from the deepest node, returning the first node
	 * whose prefix covers the search address (longest-prefix match),
	 * recursing into source subtrees along the way.
	 */
	while (fn) {
		struct fib6_node *subtree = FIB6_SUBTREE(fn);

		if (subtree || fn->fn_flags & RTN_RTINFO) {
			struct fib6_info *leaf = rcu_dereference(fn->leaf);
			struct rt6key *key;

			/* NULL leaf: node is being removed; skip it. */
			if (!leaf)
				goto backtrack;

			key = (struct rt6key *) ((u8 *)leaf + args->offset);

			if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
#ifdef CONFIG_IPV6_SUBTREES
				if (subtree) {
					struct fib6_node *sfn;
					sfn = fib6_node_lookup_1(subtree,
								 args + 1);
					if (!sfn)
						goto backtrack;
					fn = sfn;
				}
#endif
				if (fn->fn_flags & RTN_RTINFO)
					return fn;
			}
		}
backtrack:
		if (fn->fn_flags & RTN_ROOT)
			break;

		fn = rcu_dereference(fn->parent);
	}

	return NULL;
}
1675
/* Longest-prefix-match lookup of (daddr, saddr) in the tree at @root.
 * Falls back to @root itself when nothing more specific matches.
 * called with rcu_read_lock() held
 */
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
				   const struct in6_addr *daddr,
				   const struct in6_addr *saddr)
{
	struct fib6_node *fn;
	struct lookup_args args[] = {
		{
			.offset = offsetof(struct fib6_info, fib6_dst),
			.addr = daddr,
		},
#ifdef CONFIG_IPV6_SUBTREES
		{
			.offset = offsetof(struct fib6_info, fib6_src),
			.addr = saddr,
		},
#endif
		{
			.offset = 0,	/* sentinel */
		}
	};

	/* With no destination, start matching at the source key. */
	fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
	if (!fn || fn->fn_flags & RTN_TL_ROOT)
		fn = root;

	return fn;
}
1705
1706/*
1707 * Get node with specified destination prefix (and source prefix,
1708 * if subtrees are used)
1709 * exact_match == true means we try to find fn with exact match of
1710 * the passed in prefix addr
1711 * exact_match == false means we try to find fn with longest prefix
1712 * match of the passed in prefix addr. This is useful for finding fn
1713 * for cached route as it will be stored in the exception table under
1714 * the node with longest prefix length.
1715 */
1716
1717
static struct fib6_node *fib6_locate_1(struct fib6_node *root,
				       const struct in6_addr *addr,
				       int plen, int offset,
				       bool exact_match)
{
	/* prev tracks the deepest RTN_RTINFO node seen on the path; it is
	 * the longest-prefix-match answer when no exact node exists.
	 */
	struct fib6_node *fn, *prev = NULL;

	for (fn = root; fn ; ) {
		struct fib6_info *leaf = rcu_dereference(fn->leaf);
		struct rt6key *key;

		/* This node is being deleted */
		if (!leaf) {
			if (plen <= fn->fn_bit)
				goto out;
			else
				goto next;
		}

		key = (struct rt6key *)((u8 *)leaf + offset);

		/*
		 *	Prefix match
		 */
		if (plen < fn->fn_bit ||
		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
			goto out;

		/* Exact node for the requested prefix length. */
		if (plen == fn->fn_bit)
			return fn;

		if (fn->fn_flags & RTN_RTINFO)
			prev = fn;

next:
		/*
		 *	We have more bits to go
		 */
		if (addr_bit_set(addr, fn->fn_bit))
			fn = rcu_dereference(fn->right);
		else
			fn = rcu_dereference(fn->left);
	}
out:
	if (exact_match)
		return NULL;
	else
		return prev;
}
1767
1768struct fib6_node *fib6_locate(struct fib6_node *root,
1769 const struct in6_addr *daddr, int dst_len,
1770 const struct in6_addr *saddr, int src_len,
1771 bool exact_match)
1772{
1773 struct fib6_node *fn;
1774
1775 fn = fib6_locate_1(root, daddr, dst_len,
1776 offsetof(struct fib6_info, fib6_dst),
1777 exact_match);
1778
1779#ifdef CONFIG_IPV6_SUBTREES
1780 if (src_len) {
1781 WARN_ON(saddr == NULL);
1782 if (fn) {
1783 struct fib6_node *subtree = FIB6_SUBTREE(fn);
1784
1785 if (subtree) {
1786 fn = fib6_locate_1(subtree, saddr, src_len,
1787 offsetof(struct fib6_info, fib6_src),
1788 exact_match);
1789 }
1790 }
1791 }
1792#endif
1793
1794 if (fn && fn->fn_flags & RTN_RTINFO)
1795 return fn;
1796
1797 return NULL;
1798}
1799
1800
1801/*
1802 * Deletion
1803 *
1804 */
1805
1806static struct fib6_info *fib6_find_prefix(struct net *net,
1807 struct fib6_table *table,
1808 struct fib6_node *fn)
1809{
1810 struct fib6_node *child_left, *child_right;
1811
1812 if (fn->fn_flags & RTN_ROOT)
1813 return net->ipv6.fib6_null_entry;
1814
1815 while (fn) {
1816 child_left = rcu_dereference_protected(fn->left,
1817 lockdep_is_held(&table->tb6_lock));
1818 child_right = rcu_dereference_protected(fn->right,
1819 lockdep_is_held(&table->tb6_lock));
1820 if (child_left)
1821 return rcu_dereference_protected(child_left->leaf,
1822 lockdep_is_held(&table->tb6_lock));
1823 if (child_right)
1824 return rcu_dereference_protected(child_right->leaf,
1825 lockdep_is_held(&table->tb6_lock));
1826
1827 fn = FIB6_SUBTREE(fn);
1828 }
1829 return NULL;
1830}
1831
1832/*
1833 * Called to trim the tree of intermediate nodes when possible. "fn"
1834 * is the node we want to try and remove.
1835 * Need to own table->tb6_lock
1836 */
1837
static struct fib6_node *fib6_repair_tree(struct net *net,
					  struct fib6_table *table,
					  struct fib6_node *fn)
{
	int children;
	int nstate;
	struct fib6_node *child;
	struct fib6_walker *w;
	int iter = 0;

	/* Set fn->leaf to null_entry for root node. */
	if (fn->fn_flags & RTN_TL_ROOT) {
		rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
		return fn;
	}

	for (;;) {
		struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
					    lockdep_is_held(&table->tb6_lock));
		struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
					    lockdep_is_held(&table->tb6_lock));
		struct fib6_node *pn = rcu_dereference_protected(fn->parent,
					    lockdep_is_held(&table->tb6_lock));
		struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
					    lockdep_is_held(&table->tb6_lock));
		struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
					    lockdep_is_held(&table->tb6_lock));
		struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
					    lockdep_is_held(&table->tb6_lock));
		struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
					    lockdep_is_held(&table->tb6_lock));
		struct fib6_info *new_fn_leaf;

		pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
		iter++;

		WARN_ON(fn->fn_flags & RTN_RTINFO);
		WARN_ON(fn->fn_flags & RTN_TL_ROOT);
		WARN_ON(fn_leaf);

		/* children is a bitmask: bit 0 = has right child,
		 * bit 1 = has left child; child holds the last one seen.
		 */
		children = 0;
		child = NULL;
		if (fn_r) {
			child = fn_r;
			children |= 1;
		}
		if (fn_l) {
			child = fn_l;
			children |= 2;
		}

		if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
		    /* Subtree root (i.e. fn) may have one child */
		    || (children && fn->fn_flags & RTN_ROOT)
#endif
		    ) {
			/* Node cannot be removed; give it a new leaf. */
			new_fn_leaf = fib6_find_prefix(net, table, fn);
#if RT6_DEBUG >= 2
			if (!new_fn_leaf) {
				WARN_ON(!new_fn_leaf);
				new_fn_leaf = net->ipv6.fib6_null_entry;
			}
#endif
			fib6_info_hold(new_fn_leaf);
			rcu_assign_pointer(fn->leaf, new_fn_leaf);
			return pn;
		}

#ifdef CONFIG_IPV6_SUBTREES
		if (FIB6_SUBTREE(pn) == fn) {
			WARN_ON(!(fn->fn_flags & RTN_ROOT));
			RCU_INIT_POINTER(pn->subtree, NULL);
			nstate = FWS_L;
		} else {
			WARN_ON(fn->fn_flags & RTN_ROOT);
#endif
			/* Splice fn's single child (or NULL) into its
			 * parent, removing fn from the tree.
			 */
			if (pn_r == fn)
				rcu_assign_pointer(pn->right, child);
			else if (pn_l == fn)
				rcu_assign_pointer(pn->left, child);
#if RT6_DEBUG >= 2
			else
				WARN_ON(1);
#endif
			if (child)
				rcu_assign_pointer(child->parent, pn);
			nstate = FWS_R;
#ifdef CONFIG_IPV6_SUBTREES
		}
#endif

		/* Move any walkers currently positioned on fn off of it
		 * before the node is freed.
		 */
		read_lock(&net->ipv6.fib6_walker_lock);
		FOR_WALKERS(net, w) {
			if (!child) {
				if (w->node == fn) {
					pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
						 w, w->state, nstate);
					w->node = pn;
					w->state = nstate;
				}
			} else {
				if (w->node == fn) {
					w->node = child;
					if (children&2) {
						pr_debug("W %p adjusted by delnode 2, s=%d\n",
							 w, w->state);
						w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
					} else {
						pr_debug("W %p adjusted by delnode 2, s=%d\n",
							 w, w->state);
						w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
					}
				}
			}
		}
		read_unlock(&net->ipv6.fib6_walker_lock);

		node_free(net, fn);
		if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
			return pn;

		/* The parent became empty too: drop its leaf reference
		 * and repeat the repair one level up.
		 */
		RCU_INIT_POINTER(pn->leaf, NULL);
		fib6_info_release(pn_leaf);
		fn = pn;
	}
}
1965
/* Unlink the route referenced by @rtp from node @fn, fix up walkers and
 * the tree, emit notifications, and drop the tree's reference on it.
 * Needs table->tb6_lock held.
 */
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
			   struct fib6_info __rcu **rtp, struct nl_info *info)
{
	struct fib6_info *leaf, *replace_rt = NULL;
	struct fib6_walker *w;
	struct fib6_info *rt = rcu_dereference_protected(*rtp,
				    lockdep_is_held(&table->tb6_lock));
	struct net *net = info->nl_net;
	bool notify_del = false;

	/* If the deleted route is the first in the node and it is not part of
	 * a multipath route, then we need to replace it with the next route
	 * in the node, if exists.
	 */
	leaf = rcu_dereference_protected(fn->leaf,
					 lockdep_is_held(&table->tb6_lock));
	if (leaf == rt && !rt->fib6_nsiblings) {
		if (rcu_access_pointer(rt->fib6_next))
			replace_rt = rcu_dereference_protected(rt->fib6_next,
					    lockdep_is_held(&table->tb6_lock));
		else
			notify_del = true;
	}

	/* Unlink it */
	*rtp = rt->fib6_next;
	rt->fib6_node = NULL;
	net->ipv6.rt6_stats->fib_rt_entries--;
	net->ipv6.rt6_stats->fib_discarded_routes++;

	/* Reset round-robin state, if necessary */
	if (rcu_access_pointer(fn->rr_ptr) == rt)
		fn->rr_ptr = NULL;

	/* Remove this entry from other siblings */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		/* The route is deleted from a multipath route. If this
		 * multipath route is the first route in the node, then we need
		 * to emit a delete notification. Otherwise, we need to skip
		 * the notification.
		 */
		if (rt->fib6_metric == leaf->fib6_metric &&
		    rt6_qualify_for_ecmp(leaf))
			notify_del = true;
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings)
			WRITE_ONCE(sibling->fib6_nsiblings,
				   sibling->fib6_nsiblings - 1);
		WRITE_ONCE(rt->fib6_nsiblings, 0);
		list_del_rcu(&rt->fib6_siblings);
		rt6_multipath_rebalance(next_sibling);
	}

	/* Adjust walkers */
	read_lock(&net->ipv6.fib6_walker_lock);
	FOR_WALKERS(net, w) {
		if (w->state == FWS_C && w->leaf == rt) {
			pr_debug("walker %p adjusted by delroute\n", w);
			w->leaf = rcu_dereference_protected(rt->fib6_next,
					    lockdep_is_held(&table->tb6_lock));
			if (!w->leaf)
				w->state = FWS_U;
		}
	}
	read_unlock(&net->ipv6.fib6_walker_lock);

	/* If it was last route, call fib6_repair_tree() to:
	 * 1. For root node, put back null_entry as how the table was created.
	 * 2. For other nodes, expunge its radix tree node.
	 */
	if (!rcu_access_pointer(fn->leaf)) {
		if (!(fn->fn_flags & RTN_TL_ROOT)) {
			fn->fn_flags &= ~RTN_RTINFO;
			net->ipv6.rt6_stats->fib_route_nodes--;
		}
		fn = fib6_repair_tree(net, table, fn);
	}

	fib6_purge_rt(rt, fn, net);

	if (!info->skip_notify_kernel) {
		if (notify_del)
			call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
						  rt, NULL);
		else if (replace_rt)
			call_fib6_entry_notifiers_replace(net, replace_rt);
	}
	if (!info->skip_notify)
		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);

	fib6_info_release(rt);
}
2060
2061/* Need to own table->tb6_lock */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_info __rcu **rtp;
	struct fib6_info __rcu **rtp_next;
	struct fib6_table *table;
	struct fib6_node *fn;

	/* The shared null entry is never unlinked. */
	if (rt == net->ipv6.fib6_null_entry)
		return -ENOENT;

	table = rt->fib6_table;
	fn = rcu_dereference_protected(rt->fib6_node,
				       lockdep_is_held(&table->tb6_lock));
	if (!fn)
		return -ENOENT;

	WARN_ON(!(fn->fn_flags & RTN_RTINFO));

	/*
	 *	Walk the leaf entries looking for ourself
	 */

	for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
		struct fib6_info *cur = rcu_dereference_protected(*rtp,
					lockdep_is_held(&table->tb6_lock));
		if (rt == cur) {
			if (fib6_requires_src(cur))
				fib6_routes_require_src_dec(info->nl_net);
			/* rtp points at the predecessor's next pointer,
			 * so fib6_del_route() can splice rt out.
			 */
			fib6_del_route(table, fn, rtp, info);
			return 0;
		}
		rtp_next = &cur->fib6_next;
	}
	return -ENOENT;
}
2098
2099/*
2100 * Tree traversal function.
2101 *
2102 * Certainly, it is not interrupt safe.
2103 * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
2104 * It means, that we can modify tree during walking
2105 * and use this function for garbage collection, clone pruning,
2106 * cleaning tree when a device goes down etc. etc.
2107 *
2108 * It guarantees that every node will be traversed,
2109 * and that it will be traversed only once.
2110 *
2111 * Callback function w->func may return:
2112 * 0 -> continue walking.
2113 * positive value -> walking is suspended (used by tree dumps,
2114 * and probably by gc, if it will be split to several slices)
2115 * negative value -> terminate walking.
2116 *
2117 * The function itself returns:
2118 * 0 -> walk is complete.
2119 * >0 -> walk is incomplete (i.e. suspended)
2120 * <0 -> walk is terminated by an error.
2121 *
2122 * This function is called with tb6_lock held.
2123 */
2124
static int fib6_walk_continue(struct fib6_walker *w)
{
	struct fib6_node *fn, *pn, *left, *right;

	/* w->root should always be table->tb6_root */
	WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));

	for (;;) {
		fn = w->node;
		if (!fn)
			return 0;

		switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
		case FWS_S:
			/* Visit the source subtree first, if present. */
			if (FIB6_SUBTREE(fn)) {
				w->node = FIB6_SUBTREE(fn);
				continue;
			}
			w->state = FWS_L;
			fallthrough;
#endif
		case FWS_L:
			/* Then the left child... */
			left = rcu_dereference_protected(fn->left, 1);
			if (left) {
				w->node = left;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_R;
			fallthrough;
		case FWS_R:
			/* ...then the right child... */
			right = rcu_dereference_protected(fn->right, 1);
			if (right) {
				w->node = right;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_C;
			w->leaf = rcu_dereference_protected(fn->leaf, 1);
			fallthrough;
		case FWS_C:
			/* ...then this node's own routes (honouring the
			 * dump skip counter), and finally climb back up.
			 */
			if (w->leaf && fn->fn_flags & RTN_RTINFO) {
				int err;

				if (w->skip) {
					w->skip--;
					goto skip;
				}

				err = w->func(w);
				if (err)
					return err;

				w->count++;
				continue;
			}
skip:
			w->state = FWS_U;
			fallthrough;
		case FWS_U:
			if (fn == w->root)
				return 0;
			pn = rcu_dereference_protected(fn->parent, 1);
			left = rcu_dereference_protected(pn->left, 1);
			right = rcu_dereference_protected(pn->right, 1);
			w->node = pn;
#ifdef CONFIG_IPV6_SUBTREES
			if (FIB6_SUBTREE(pn) == fn) {
				WARN_ON(!(fn->fn_flags & RTN_ROOT));
				w->state = FWS_L;
				continue;
			}
#endif
			/* Resume in the parent just past the child we
			 * came back from.
			 */
			if (left == fn) {
				w->state = FWS_R;
				continue;
			}
			if (right == fn) {
				w->state = FWS_C;
				w->leaf = rcu_dereference_protected(w->node->leaf, 1);
				continue;
			}
#if RT6_DEBUG >= 2
			WARN_ON(1);
#endif
		}
	}
}
2214
2215static int fib6_walk(struct net *net, struct fib6_walker *w)
2216{
2217 int res;
2218
2219 w->state = FWS_INIT;
2220 w->node = w->root;
2221
2222 fib6_walker_link(net, w);
2223 res = fib6_walk_continue(w);
2224 if (res <= 0)
2225 fib6_walker_unlink(net, w);
2226 return res;
2227}
2228
static int fib6_clean_node(struct fib6_walker *w)
{
	int res;
	struct fib6_info *rt;
	struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
	struct nl_info info = {
		.nl_net = c->net,
		.skip_notify = c->skip_notify,
	};

	/* Propagate the new serial number onto this node, if requested. */
	if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
	    READ_ONCE(w->node->fn_sernum) != c->sernum)
		WRITE_ONCE(w->node->fn_sernum, c->sernum);

	/* No callback: this walk exists only to update serial numbers. */
	if (!c->func) {
		WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
		w->leaf = NULL;
		return 0;
	}

	for_each_fib6_walker_rt(w) {
		res = c->func(rt, c->arg);
		if (res == -1) {
			/* Callback asked us to delete this route. */
			w->leaf = rt;
			res = fib6_del(rt, &info);
			if (res) {
#if RT6_DEBUG >= 2
				pr_debug("%s: del failed: rt=%p@%p err=%d\n",
					 __func__, rt,
					 rcu_access_pointer(rt->fib6_node),
					 res);
#endif
				continue;
			}
			return 0;
		} else if (res == -2) {
			/* Skip the rest of this multipath route by jumping
			 * to its last sibling.
			 */
			if (WARN_ON(!rt->fib6_nsiblings))
				continue;
			rt = list_last_entry(&rt->fib6_siblings,
					     struct fib6_info, fib6_siblings);
			continue;
		}
		WARN_ON(res != 0);
	}
	w->leaf = rt;
	return 0;
}
2276
2277/*
2278 * Convenient frontend to tree walker.
2279 *
2280 * func is called on each route.
2281 * It may return -2 -> skip multipath route.
2282 * -1 -> delete this route.
2283 * 0 -> continue walking
2284 */
2285
2286static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2287 int (*func)(struct fib6_info *, void *arg),
2288 int sernum, void *arg, bool skip_notify)
2289{
2290 struct fib6_cleaner c;
2291
2292 c.w.root = root;
2293 c.w.func = fib6_clean_node;
2294 c.w.count = 0;
2295 c.w.skip = 0;
2296 c.w.skip_in_node = 0;
2297 c.func = func;
2298 c.sernum = sernum;
2299 c.arg = arg;
2300 c.net = net;
2301 c.skip_notify = skip_notify;
2302
2303 fib6_walk(net, &c.w);
2304}
2305
2306static void __fib6_clean_all(struct net *net,
2307 int (*func)(struct fib6_info *, void *),
2308 int sernum, void *arg, bool skip_notify)
2309{
2310 struct fib6_table *table;
2311 struct hlist_head *head;
2312 unsigned int h;
2313
2314 rcu_read_lock();
2315 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2316 head = &net->ipv6.fib_table_hash[h];
2317 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2318 spin_lock_bh(&table->tb6_lock);
2319 fib6_clean_tree(net, &table->tb6_root,
2320 func, sernum, arg, skip_notify);
2321 spin_unlock_bh(&table->tb6_lock);
2322 }
2323 }
2324 rcu_read_unlock();
2325}
2326
/* Clean every table, keeping netlink notifications enabled and leaving
 * serial numbers untouched.
 */
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
		    void *arg)
{
	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
}
2332
/* Same as fib6_clean_all() but suppress netlink notifications for any
 * routes deleted by the callback.
 */
void fib6_clean_all_skip_notify(struct net *net,
				int (*func)(struct fib6_info *, void *),
				void *arg)
{
	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
}
2339
2340static void fib6_flush_trees(struct net *net)
2341{
2342 int new_sernum = fib6_new_sernum(net);
2343
2344 __fib6_clean_all(net, NULL, new_sernum, NULL, false);
2345}
2346
2347/*
2348 * Garbage collection
2349 */
2350
/* GC callback: returns -1 when @rt has expired and should be deleted,
 * 0 otherwise. Bumps gc_args->more while unexpired expirable routes
 * remain, so the GC timer gets re-armed.
 */
static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
{
	unsigned long now = jiffies;

	/*
	 *	check addrconf expiration here.
	 *	Routes are expired even if they are in use.
	 */

	if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
		if (time_after(now, rt->expires)) {
			pr_debug("expiring %p\n", rt);
			return -1;
		}
		gc_args->more++;
	}

	/* Also age clones in the exception table.
	 * Note, that clones are aged out
	 * only if they are not in use now.
	 */
	rt6_age_exceptions(rt, gc_args, now);

	return 0;
}
2376
2377static void fib6_gc_table(struct net *net,
2378 struct fib6_table *tb6,
2379 struct fib6_gc_args *gc_args)
2380{
2381 struct fib6_info *rt;
2382 struct hlist_node *n;
2383 struct nl_info info = {
2384 .nl_net = net,
2385 .skip_notify = false,
2386 };
2387
2388 hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
2389 if (fib6_age(rt, gc_args) == -1)
2390 fib6_del(rt, &info);
2391}
2392
2393static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
2394{
2395 struct fib6_table *table;
2396 struct hlist_head *head;
2397 unsigned int h;
2398
2399 rcu_read_lock();
2400 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2401 head = &net->ipv6.fib_table_hash[h];
2402 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2403 spin_lock_bh(&table->tb6_lock);
2404
2405 fib6_gc_table(net, table, gc_args);
2406
2407 spin_unlock_bh(&table->tb6_lock);
2408 }
2409 }
2410 rcu_read_unlock();
2411}
2412
/* Run one garbage-collection pass over all tables.
 * With @force, block on the GC lock; otherwise, if another pass is
 * already running, just re-arm the timer and return.
 */
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
	struct fib6_gc_args gc_args;
	unsigned long now;

	if (force) {
		spin_lock_bh(&net->ipv6.fib6_gc_lock);
	} else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
		mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
		return;
	}
	gc_args.timeout = expires ? (int)expires :
			  net->ipv6.sysctl.ip6_rt_gc_interval;
	gc_args.more = 0;

	fib6_gc_all(net, &gc_args);
	now = jiffies;
	net->ipv6.ip6_rt_last_gc = now;

	/* Keep the timer armed only while expirable routes remain. */
	if (gc_args.more)
		mod_timer(&net->ipv6.ip6_fib_timer,
			  round_jiffies(now
					+ net->ipv6.sysctl.ip6_rt_gc_interval));
	else
		timer_delete(&net->ipv6.ip6_fib_timer);
	spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}
2440
/* GC timer expiry: recover the owning struct net from the timer and run
 * a forced GC pass. Note timer_container_of() only uses the type of
 * "arg", so the self-referencing initializer is well-defined.
 */
static void fib6_gc_timer_cb(struct timer_list *t)
{
	struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer);

	fib6_run_gc(0, arg, true);
}
2447
/* Per-network-namespace FIB6 initialization: notifier chain, GC
 * lock/timer, statistics, the table hash, and the main (plus,
 * optionally, local) routing tables.  On failure the goto chain
 * unwinds everything allocated so far and returns -ENOMEM (or the
 * notifier-init error).
 */
static int __net_init fib6_net_init(struct net *net)
{
	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
	int err;

	err = fib6_notifier_init(net);
	if (err)
		return err;

	/* Default to 3-tuple */
	net->ipv6.sysctl.multipath_hash_fields =
		FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;

	spin_lock_init(&net->ipv6.fib6_gc_lock);
	rwlock_init(&net->ipv6.fib6_walker_lock);
	INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
	timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);

	net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
	if (!net->ipv6.rt6_stats)
		goto out_notifier;

	/* Avoid false sharing : Use at least a full cache line */
	size = max_t(size_t, size, L1_CACHE_BYTES);

	net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
	if (!net->ipv6.fib_table_hash)
		goto out_rt6_stats;

	spin_lock_init(&net->ipv6.fib_table_hash_lock);

	net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
					  GFP_KERNEL);
	if (!net->ipv6.fib6_main_tbl)
		goto out_fib_table_hash;

	/* Main table: root node starts out pointing at the null entry. */
	net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
	rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
			   net->ipv6.fib6_null_entry);
	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
	inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
	INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	/* With policy routing enabled, the local table is separate and
	 * is set up the same way as the main table.
	 */
	net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
					   GFP_KERNEL);
	if (!net->ipv6.fib6_local_tbl)
		goto out_fib6_main_tbl;
	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
	rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
			   net->ipv6.fib6_null_entry);
	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
	inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
	INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
#endif
	fib6_tables_init(net);

	return 0;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_fib6_main_tbl:
	kfree(net->ipv6.fib6_main_tbl);
#endif
out_fib_table_hash:
	kfree(net->ipv6.fib_table_hash);
out_rt6_stats:
	kfree(net->ipv6.rt6_stats);
out_notifier:
	fib6_notifier_exit(net);
	return -ENOMEM;
}
2521
2522static void fib6_net_exit(struct net *net)
2523{
2524 unsigned int i;
2525
2526 timer_delete_sync(&net->ipv6.ip6_fib_timer);
2527
2528 for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
2529 struct hlist_head *head = &net->ipv6.fib_table_hash[i];
2530 struct hlist_node *tmp;
2531 struct fib6_table *tb;
2532
2533 hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
2534 hlist_del(&tb->tb6_hlist);
2535 fib6_free_table(tb);
2536 }
2537 }
2538
2539 kfree(net->ipv6.fib_table_hash);
2540 kfree(net->ipv6.rt6_stats);
2541 fib6_notifier_exit(net);
2542}
2543
/* Pernet hooks: run fib6_net_init/fib6_net_exit for each namespace. */
static struct pernet_operations fib6_net_ops = {
	.init = fib6_net_init,
	.exit = fib6_net_exit,
};
2548
/* rtnetlink handlers: RTM_GETROUTE dumps go through inet6_dump_fib,
 * unlocked and with a split NLMSG_DONE.
 */
static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = {
	{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
	 .dumpit = inet6_dump_fib,
	 .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
};
2554
/* Subsystem init: create the fib6_node slab cache, register the pernet
 * operations and the rtnetlink dump handler, then publish the tree
 * flush hook.  Unwinds via gotos on failure and returns the error.
 */
int __init fib6_init(void)
{
	int ret = -ENOMEM;

	fib6_node_kmem = KMEM_CACHE(fib6_node,
				    SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
	if (!fib6_node_kmem)
		goto out;

	ret = register_pernet_subsys(&fib6_net_ops);
	if (ret)
		goto out_kmem_cache_create;

	ret = rtnl_register_many(fib6_rtnl_msg_handlers);
	if (ret)
		goto out_unregister_subsys;

	/* Make the generic flush helper reach the IPv6 trees. */
	__fib6_flush_trees = fib6_flush_trees;
out:
	return ret;

out_unregister_subsys:
	unregister_pernet_subsys(&fib6_net_ops);
out_kmem_cache_create:
	kmem_cache_destroy(fib6_node_kmem);
	goto out;
}
2582
/* Subsystem teardown: unregister the pernet operations first (which
 * frees every table, and thus every fib6_node) before destroying the
 * node slab cache.
 */
void fib6_gc_cleanup(void)
{
	unregister_pernet_subsys(&fib6_net_ops);
	kmem_cache_destroy(fib6_node_kmem);
}
2588
2589#ifdef CONFIG_PROC_FS
/* Emit one /proc/net/ipv6_route line for route @v: destination,
 * source (zeroes unless CONFIG_IPV6_SUBTREES), gateway (zeroes unless
 * the nexthop has one), metric, refcount, a constant 0, flags, and the
 * device name.  The output format is ABI — do not change it.
 */
static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
{
	struct fib6_info *rt = v;
	struct ipv6_route_iter *iter = seq->private;
	struct fib6_nh *fib6_nh = rt->fib6_nh;
	unsigned int flags = rt->fib6_flags;
	const struct net_device *dev;

	/* Routes using a shared nexthop object keep their fib6_nh there. */
	if (rt->nh)
		fib6_nh = nexthop_fib6_nh(rt->nh);

	seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
	seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
#else
	seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
	if (fib6_nh->fib_nh_gw_family) {
		flags |= RTF_GATEWAY;
		seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
	} else {
		seq_puts(seq, "00000000000000000000000000000000");
	}

	dev = fib6_nh->fib_nh_dev;
	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
		   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
		   flags, dev ? dev->name : "");
	/* Tell the walker this leaf has been consumed. */
	iter->w.leaf = NULL;
	return 0;
}
2622
2623static int ipv6_route_yield(struct fib6_walker *w)
2624{
2625 struct ipv6_route_iter *iter = w->args;
2626
2627 if (!iter->skip)
2628 return 1;
2629
2630 do {
2631 iter->w.leaf = rcu_dereference_protected(
2632 iter->w.leaf->fib6_next,
2633 lockdep_is_held(&iter->tbl->tb6_lock));
2634 iter->skip--;
2635 if (!iter->skip && iter->w.leaf)
2636 return 1;
2637 } while (iter->w.leaf);
2638
2639 return 0;
2640}
2641
2642static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
2643 struct net *net)
2644{
2645 memset(&iter->w, 0, sizeof(iter->w));
2646 iter->w.func = ipv6_route_yield;
2647 iter->w.root = &iter->tbl->tb6_root;
2648 iter->w.state = FWS_INIT;
2649 iter->w.node = iter->w.root;
2650 iter->w.args = iter;
2651 iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
2652 INIT_LIST_HEAD(&iter->w.lh);
2653 fib6_walker_link(net, &iter->w);
2654}
2655
2656static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
2657 struct net *net)
2658{
2659 unsigned int h;
2660 struct hlist_node *node;
2661
2662 if (tbl) {
2663 h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
2664 node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist));
2665 } else {
2666 h = 0;
2667 node = NULL;
2668 }
2669
2670 while (!node && h < FIB6_TABLE_HASHSZ) {
2671 node = rcu_dereference(
2672 hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
2673 }
2674 return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
2675}
2676
2677static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
2678{
2679 int sernum = READ_ONCE(iter->w.root->fn_sernum);
2680
2681 if (iter->sernum != sernum) {
2682 iter->sernum = sernum;
2683 iter->w.state = FWS_INIT;
2684 iter->w.node = iter->w.root;
2685 WARN_ON(iter->w.skip);
2686 iter->w.skip = iter->w.count;
2687 }
2688}
2689
/* seq_file .next: advance to the next fib6_info.  First try the
 * sibling chain of the current leaf; when exhausted, resume the tree
 * walk (restarting it if the tree changed), and when the table is
 * done, move to the next table and start a fresh walk there.
 */
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int r;
	struct fib6_info *n;
	struct net *net = seq_file_net(seq);
	struct ipv6_route_iter *iter = seq->private;

	++(*pos);
	if (!v)
		goto iter_table;

	/* Fast path: next route sharing the same fib6_node. */
	n = rcu_dereference(((struct fib6_info *)v)->fib6_next);
	if (n)
		return n;

iter_table:
	ipv6_route_check_sernum(iter);
	spin_lock_bh(&iter->tbl->tb6_lock);
	r = fib6_walk_continue(&iter->w);
	spin_unlock_bh(&iter->tbl->tb6_lock);
	if (r > 0) {
		/* Walker stopped at a leaf to yield. */
		return iter->w.leaf;
	} else if (r < 0) {
		fib6_walker_unlink(net, &iter->w);
		return NULL;
	}
	/* r == 0: this table is exhausted; try the next one. */
	fib6_walker_unlink(net, &iter->w);

	iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
	if (!iter->tbl)
		return NULL;

	ipv6_route_seq_setup_walk(iter, net);
	goto iter_table;
}
2725
2726static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
2727 __acquires(RCU)
2728{
2729 struct net *net = seq_file_net(seq);
2730 struct ipv6_route_iter *iter = seq->private;
2731
2732 rcu_read_lock();
2733 iter->tbl = ipv6_route_seq_next_table(NULL, net);
2734 iter->skip = *pos;
2735
2736 if (iter->tbl) {
2737 loff_t p = 0;
2738
2739 ipv6_route_seq_setup_walk(iter, net);
2740 return ipv6_route_seq_next(seq, NULL, &p);
2741 } else {
2742 return NULL;
2743 }
2744}
2745
2746static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
2747{
2748 struct fib6_walker *w = &iter->w;
2749 return w->node && !(w->state == FWS_U && w->node == w->root);
2750}
2751
/* seq_file .stop: unlink the walker if it is still mid-walk, then drop
 * the RCU read lock taken in ipv6_route_seq_start().
 */
static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	struct net *net = seq_file_net(seq);
	struct ipv6_route_iter *iter = seq->private;

	if (ipv6_route_iter_active(iter))
		fib6_walker_unlink(net, &iter->w);

	rcu_read_unlock();
}
2763
2764#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
/* Run a BPF iterator program on one route.  @v may be NULL for the
 * end-of-dump call made from ipv6_route_seq_stop().
 */
static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
				    struct bpf_iter_meta *meta,
				    void *v)
{
	struct bpf_iter__ipv6_route ctx;

	ctx.meta = meta;
	ctx.rt = v;
	return bpf_iter_run_prog(prog, &ctx);
}
2775
/* seq_file .show (BPF build): if a BPF iterator program is attached,
 * hand the route to it; otherwise fall back to the native /proc
 * formatter.
 */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
	struct ipv6_route_iter *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int ret;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	if (!prog)
		return ipv6_route_native_seq_show(seq, v);

	ret = ipv6_route_prog_seq_show(prog, &meta, v);
	/* Mark the leaf consumed, same as the native show path. */
	iter->w.leaf = NULL;

	return ret;
}
2793
/* seq_file .stop (BPF build): on a clean end of dump (v == NULL), give
 * an attached BPF program its final, NULL-route invocation before the
 * native stop releases the walker and RCU.
 */
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)ipv6_route_prog_seq_show(prog, &meta, v);
	}

	ipv6_route_native_seq_stop(seq, v);
}
2808#else
/* seq_file .show (no BPF): plain native /proc formatting. */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
	return ipv6_route_native_seq_show(seq, v);
}
2813
/* seq_file .stop (no BPF): plain native teardown. */
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
	ipv6_route_native_seq_stop(seq, v);
}
2818#endif
2819
/* seq_file operations backing /proc/net/ipv6_route. */
const struct seq_operations ipv6_route_seq_ops = {
	.start	= ipv6_route_seq_start,
	.next	= ipv6_route_seq_next,
	.stop	= ipv6_route_seq_stop,
	.show	= ipv6_route_seq_show
};
2826#endif /* CONFIG_PROC_FS */