net/ipv4/route.c at v2.6.26-rc8

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / route.c
at v2.6.26-rc8 3095 lines 78 kB view raw
wrap content
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		ROUTE - implementation of the IP router.
   7 *
   8 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9 *
  10 * Authors:	Ross Biro
  11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  13 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15 *
  16 * Fixes:
  17 *		Alan Cox	:	Verify area fixes.
  18 *		Alan Cox	:	cli() protects routing changes
  19 *		Rui Oliveira	:	ICMP routing table updates
  20 *		(rco@di.uminho.pt)	Routing table insertion and update
  21 *		Linus Torvalds	:	Rewrote bits to be sensible
  22 *		Alan Cox	:	Added BSD route gw semantics
  23 *		Alan Cox	:	Super /proc >4K
  24 *		Alan Cox	:	MTU in route table
  25 *		Alan Cox	: 	MSS actually. Also added the window
  26 *					clamper.
  27 *		Sam Lantinga	:	Fixed route matching in rt_del()
  28 *		Alan Cox	:	Routing cache support.
  29 *		Alan Cox	:	Removed compatibility cruft.
  30 *		Alan Cox	:	RTF_REJECT support.
  31 *		Alan Cox	:	TCP irtt support.
  32 *		Jonathan Naylor	:	Added Metric support.
  33 *	Miquel van Smoorenburg	:	BSD API fixes.
  34 *	Miquel van Smoorenburg	:	Metrics.
  35 *		Alan Cox	:	Use __u32 properly
  36 *		Alan Cox	:	Aligned routing errors more closely with BSD
  37 *					our system is still very different.
  38 *		Alan Cox	:	Faster /proc handling
  39 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  40 *					routing caches and better behaviour.
  41 *
  42 *		Olaf Erb	:	irtt wasn't being copied right.
  43 *		Bjorn Ekwall	:	Kerneld route support.
  44 *		Alan Cox	:	Multicast fixed (I hope)
  45 * 		Pavel Krauz	:	Limited broadcast fixed
  46 *		Mike McLagan	:	Routing by source
  47 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  48 *					route.c and rewritten from scratch.
  49 *		Andi Kleen	:	Load-limit warning messages.
  50 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  51 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  52 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  53 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  54 *		Marc Boucher	:	routing by fwmark
  55 *	Robert Olsson		:	Added rt_cache statistics
  56 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  57 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  58 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  59 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  60 *
  61 *		This program is free software; you can redistribute it and/or
  62 *		modify it under the terms of the GNU General Public License
  63 *		as published by the Free Software Foundation; either version
  64 *		2 of the License, or (at your option) any later version.
  65 */
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <asm/system.h>
  70#include <linux/bitops.h>
  71#include <linux/types.h>
  72#include <linux/kernel.h>
  73#include <linux/mm.h>
  74#include <linux/bootmem.h>
  75#include <linux/string.h>
  76#include <linux/socket.h>
  77#include <linux/sockios.h>
  78#include <linux/errno.h>
  79#include <linux/in.h>
  80#include <linux/inet.h>
  81#include <linux/netdevice.h>
  82#include <linux/proc_fs.h>
  83#include <linux/init.h>
  84#include <linux/workqueue.h>
  85#include <linux/skbuff.h>
  86#include <linux/inetdevice.h>
  87#include <linux/igmp.h>
  88#include <linux/pkt_sched.h>
  89#include <linux/mroute.h>
  90#include <linux/netfilter_ipv4.h>
  91#include <linux/random.h>
  92#include <linux/jhash.h>
  93#include <linux/rcupdate.h>
  94#include <linux/times.h>
  95#include <net/dst.h>
  96#include <net/net_namespace.h>
  97#include <net/protocol.h>
  98#include <net/ip.h>
  99#include <net/route.h>
 100#include <net/inetpeer.h>
 101#include <net/sock.h>
 102#include <net/ip_fib.h>
 103#include <net/arp.h>
 104#include <net/tcp.h>
 105#include <net/icmp.h>
 106#include <net/xfrm.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 111#endif
 112
 113#define RT_FL_TOS(oldflp) \
 114    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 115
/* NOTE(review): presumably the ceiling applied to route MTU values; its users are outside this part of the file. */
#define IP_MAX_MTU	0xFFF0

/* Default garbage-collection timeout in jiffies (300 * HZ). */
#define RT_GC_TIMEOUT (300*HZ)

/* Entry-count limit checked by rt_garbage_collect(); assigned outside this part of the file. */
static int ip_rt_max_size;
/* Timeout handed to rt_may_expire() by rt_check_expire(); also the cap for the adaptive "expire" value in rt_garbage_collect(). */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
/* Delay (jiffies) between two runs of rt_worker_func(). */
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
/* Minimum spacing (jiffies) between two real rt_garbage_collect() passes. */
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
/* NOTE(review): the redirect/error tunables below are not used in the visible part of the file. */
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
/* Chain-length threshold in rt_intern_hash(); also scales the GC goal in rt_garbage_collect(). */
static int ip_rt_gc_elasticity __read_mostly	= 8;
/* NOTE(review): the MTU/MSS tunables below are not used in the visible part of the file. */
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
/* Period (jiffies) with which rt_secret_rebuild() re-arms rt_secret_timer. */
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;

/* Periodic expiry scan: expires_work runs rt_worker_func(), which re-queues itself. */
static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
/* Timer re-armed by rt_secret_rebuild(). */
static struct timer_list rt_secret_timer;
 138
 139/*
 140 *	Interface to generic destination cache.
 141 */
 142
/* Forward declarations of the callbacks wired into ipv4_dst_ops below. */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);


/*
 * IPv4 operations table for the generic destination cache.
 * .entries is the live entry counter read throughout this file;
 * .gc_thresh is read by rt_garbage_collect() but is not initialised here.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entry_size =		sizeof(struct rtable),
	.entries =		ATOMIC_INIT(0),
};
 167
/* ECN_OR_COST(class) simply expands to TC_PRIO_<class>. */
#define ECN_OR_COST(class)	TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* values.
 * NOTE(review): presumably indexed by bits derived from the IP TOS field;
 * the lookup code is not in this part of the file - confirm against users.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
 188
 189
 190/*
 191 * Route cache.
 192 */
 193
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
 203
/* One slot of the route cache hash table: head of a list linked through u.dst.rt_next. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

/* Lock table; bucket 'slot' is covered by entry (slot mod RT_HASH_LOCK_SZ). */
static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/*
 * Allocate the RT_HASH_LOCK_SZ spinlocks and initialise each one.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: the lock address is always NULL. */
# define rt_hash_lock_addr(slot) NULL

/* Nothing to set up when no lock table exists. */
static inline void rt_hash_lock_init(void)
{
}
#endif
 252
/* The route cache: an array of rt_hash_mask + 1 buckets. */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
/* Bucket index mask applied to every hash value (see rt_hash()). */
static unsigned			rt_hash_mask __read_mostly;
/* Shift count used where code scales a value by the table size (e.g. "<< rt_hash_log"). */
static unsigned int		rt_hash_log  __read_mostly;
/* Cache generation id; entries whose rt_genid differs are treated as stale. */
static atomic_t			rt_genid __read_mostly;

/* Per-CPU cache statistics; RT_CACHE_STAT_INC bumps one field on the current CPU. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)
 261
 262static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
 263{
 264	return jhash_3words((__force u32)(__be32)(daddr),
 265			    (__force u32)(__be32)(saddr),
 266			    idx, atomic_read(&rt_genid))
 267		& rt_hash_mask;
 268}
 269
 270#ifdef CONFIG_PROC_FS
/* Private seq_file iterator state for the "rt_cache" listing. */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;	/* hash bucket currently being walked (counts down to 0) */
	int genid;	/* rt_genid snapshot taken by rt_cache_seq_start() at position 0 */
};
 276
 277static struct rtable *rt_cache_get_first(struct seq_file *seq)
 278{
 279	struct rt_cache_iter_state *st = seq->private;
 280	struct rtable *r = NULL;
 281
 282	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 283		rcu_read_lock_bh();
 284		r = rcu_dereference(rt_hash_table[st->bucket].chain);
 285		while (r) {
 286			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 287			    r->rt_genid == st->genid)
 288				return r;
 289			r = rcu_dereference(r->u.dst.rt_next);
 290		}
 291		rcu_read_unlock_bh();
 292	}
 293	return r;
 294}
 295
 296static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 297					  struct rtable *r)
 298{
 299	struct rt_cache_iter_state *st = seq->private;
 300	r = r->u.dst.rt_next;
 301	while (!r) {
 302		rcu_read_unlock_bh();
 303		if (--st->bucket < 0)
 304			break;
 305		rcu_read_lock_bh();
 306		r = rt_hash_table[st->bucket].chain;
 307	}
 308	return rcu_dereference(r);
 309}
 310
 311static struct rtable *rt_cache_get_next(struct seq_file *seq,
 312					struct rtable *r)
 313{
 314	struct rt_cache_iter_state *st = seq->private;
 315	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 316		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 317			continue;
 318		if (r->rt_genid == st->genid)
 319			break;
 320	}
 321	return r;
 322}
 323
 324static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 325{
 326	struct rtable *r = rt_cache_get_first(seq);
 327
 328	if (r)
 329		while (pos && (r = rt_cache_get_next(seq, r)))
 330			--pos;
 331	return pos ? NULL : r;
 332}
 333
 334static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 335{
 336	struct rt_cache_iter_state *st = seq->private;
 337	if (*pos)
 338		return rt_cache_get_idx(seq, *pos - 1);
 339	st->genid = atomic_read(&rt_genid);
 340	return SEQ_START_TOKEN;
 341}
 342
 343static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 344{
 345	struct rtable *r;
 346
 347	if (v == SEQ_START_TOKEN)
 348		r = rt_cache_get_first(seq);
 349	else
 350		r = rt_cache_get_next(seq, v);
 351	++*pos;
 352	return r;
 353}
 354
 355static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 356{
 357	if (v && v != SEQ_START_TOKEN)
 358		rcu_read_unlock_bh();
 359}
 360
/*
 * seq_file ->show() for the "rt_cache" listing.
 *
 * For the header token, prints the column titles left-justified in a
 * 127-character field.  For a cache entry, prints one tab-separated record
 * and then pads it with spaces so that every line is 127 characters wide
 * (the record length is captured with %n).
 *
 * Always returns 0.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		/*
		 * Column notes: "Metric" is always printed as 0; "MTU" is the
		 * ADVMSS metric plus 40 (or 0 when the metric is unset);
		 * "IRTT" is (RTT >> 3) + RTTVAR; "HHRef" is -1 and "HHUptod"
		 * is 0 when the entry has no hh attached.
		 */
		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		/* pad the record out to the fixed 127-character width */
		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
 393
/* seq_file iterator callbacks for the per-namespace "rt_cache" listing. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
 400
 401static int rt_cache_seq_open(struct inode *inode, struct file *file)
 402{
 403	return seq_open_net(inode, file, &rt_cache_seq_ops,
 404			sizeof(struct rt_cache_iter_state));
 405}
 406
/* File operations for the "rt_cache" listing; released with seq_release_net to match seq_open_net. */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
 414
 415
 416static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 417{
 418	int cpu;
 419
 420	if (*pos == 0)
 421		return SEQ_START_TOKEN;
 422
 423	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 424		if (!cpu_possible(cpu))
 425			continue;
 426		*pos = cpu+1;
 427		return &per_cpu(rt_cache_stat, cpu);
 428	}
 429	return NULL;
 430}
 431
 432static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 433{
 434	int cpu;
 435
 436	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 437		if (!cpu_possible(cpu))
 438			continue;
 439		*pos = cpu+1;
 440		return &per_cpu(rt_cache_stat, cpu);
 441	}
 442	return NULL;
 443
 444}
 445
/* seq_file ->stop() for the per-CPU statistics: nothing was acquired, so nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
 450
/*
 * seq_file ->show() for the per-CPU statistics.
 *
 * For the header token, prints the column names.  Otherwise @v is one
 * CPU's struct rt_cache_stat and a line of 17 hexadecimal fields is
 * printed.  The first field ("entries") is the global
 * ipv4_dst_ops.entries counter, so it is the same on every line; the
 * remaining fields are that CPU's counters.
 *
 * Always returns 0.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
 484
/* seq_file iterator callbacks for the per-CPU route cache statistics. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
 491
 492
 493static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 494{
 495	return seq_open(file, &rt_cpu_seq_ops);
 496}
 497
/* File operations for the per-CPU statistics file (the "rt_cache" entry under proc_net_stat). */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
 505
 506#ifdef CONFIG_NET_CLS_ROUTE
 507static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 508			   int length, int *eof, void *data)
 509{
 510	unsigned int i;
 511
 512	if ((offset & 3) || (length & 3))
 513		return -EIO;
 514
 515	if (offset >= sizeof(struct ip_rt_acct) * 256) {
 516		*eof = 1;
 517		return 0;
 518	}
 519
 520	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 521		length = sizeof(struct ip_rt_acct) * 256 - offset;
 522		*eof = 1;
 523	}
 524
 525	offset /= sizeof(u32);
 526
 527	if (length > 0) {
 528		u32 *dst = (u32 *) buffer;
 529
 530		*start = buffer;
 531		memset(dst, 0, length);
 532
 533		for_each_possible_cpu(i) {
 534			unsigned int j;
 535			u32 *src;
 536
 537			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 538			for (j = 0; j < length/4; j++)
 539				dst[j] += src[j];
 540		}
 541	}
 542	return length;
 543}
 544#endif
 545
/*
 * Per-namespace setup: create the proc entries for the route cache.
 *
 *  - "rt_cache" under net->proc_net       (cache listing)
 *  - "rt_cache" under net->proc_net_stat  (per-CPU statistics)
 *  - "rt_acct"  under net->proc_net       (only with CONFIG_NET_CLS_ROUTE)
 *
 * Returns 0 on success.  On failure every entry created so far is
 * removed again and -ENOMEM is returned.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

	/* unwind in reverse order of creation; each label falls through to the next */
#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
 577
 578static void __net_exit ip_rt_do_proc_exit(struct net *net)
 579{
 580	remove_proc_entry("rt_cache", net->proc_net_stat);
 581	remove_proc_entry("rt_cache", net->proc_net);
 582	remove_proc_entry("rt_acct", net->proc_net);
 583}
 584
/* Per-network-namespace hooks that create and remove the route cache proc entries. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
 589
 590static int __init ip_rt_proc_init(void)
 591{
 592	return register_pernet_subsys(&ip_rt_proc_ops);
 593}
 594
 595#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
 600#endif /* CONFIG_PROC_FS */
 601
 602static inline void rt_free(struct rtable *rt)
 603{
 604	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 605}
 606
 607static inline void rt_drop(struct rtable *rt)
 608{
 609	ip_rt_put(rt);
 610	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 611}
 612
 613static inline int rt_fast_clean(struct rtable *rth)
 614{
 615	/* Kill broadcast/multicast entries very aggresively, if they
 616	   collide in hash table with more useful entries */
 617	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 618		rth->fl.iif && rth->u.dst.rt_next;
 619}
 620
 621static inline int rt_valuable(struct rtable *rth)
 622{
 623	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 624		rth->u.dst.expires;
 625}
 626
 627static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 628{
 629	unsigned long age;
 630	int ret = 0;
 631
 632	if (atomic_read(&rth->u.dst.__refcnt))
 633		goto out;
 634
 635	ret = 1;
 636	if (rth->u.dst.expires &&
 637	    time_after_eq(jiffies, rth->u.dst.expires))
 638		goto out;
 639
 640	age = jiffies - rth->u.dst.lastuse;
 641	ret = 0;
 642	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 643	    (age <= tmo2 && rt_valuable(rth)))
 644		goto out;
 645	ret = 1;
 646out:	return ret;
 647}
 648
 649/* Bits of score are:
 650 * 31: very valuable
 651 * 30: not quite useless
 652 * 29..0: usage counter
 653 */
 654static inline u32 rt_score(struct rtable *rt)
 655{
 656	u32 score = jiffies - rt->u.dst.lastuse;
 657
 658	score = ~score & ~(3<<30);
 659
 660	if (rt_valuable(rt))
 661		score |= (1<<31);
 662
 663	if (!rt->fl.iif ||
 664	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 665		score |= (1<<30);
 666
 667	return score;
 668}
 669
/*
 * Return 1 when two flow keys are identical in every field the cache
 * cares about, 0 otherwise.  All field differences are XOR-ed and OR-ed
 * together so that a single comparison with 0 decides the result.
 *
 * Compared: daddr, saddr, mark, oif, iif, and 16 bits starting at the
 * tos field.
 * NOTE(review): the u16 load at &tos also covers the byte that follows
 * tos in the struct (presumably the scope field) - confirm against the
 * struct flowi definition.
 */
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
 680
 681static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 682{
 683	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 684}
 685
 686/*
 687 * Perform a full scan of hash table and free all entries.
 688 * Can be called by a softirq or a process.
 689 * In the later case, we want to be reschedule if necessary
 690 */
 691static void rt_do_flush(int process_context)
 692{
 693	unsigned int i;
 694	struct rtable *rth, *next;
 695
 696	for (i = 0; i <= rt_hash_mask; i++) {
 697		if (process_context && need_resched())
 698			cond_resched();
 699		rth = rt_hash_table[i].chain;
 700		if (!rth)
 701			continue;
 702
 703		spin_lock_bh(rt_hash_lock_addr(i));
 704		rth = rt_hash_table[i].chain;
 705		rt_hash_table[i].chain = NULL;
 706		spin_unlock_bh(rt_hash_lock_addr(i));
 707
 708		for (; rth; rth = next) {
 709			next = rth->u.dst.rt_next;
 710			rt_free(rth);
 711		}
 712	}
 713}
 714
/*
 * Periodic, incremental expiry scan of the route cache.
 *
 * Each call visits a number of buckets ("goal") equal to
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout, capped at the
 * table size, continuing from where the previous call stopped (static
 * "rover").  Within each non-empty bucket, under the bucket lock, it
 * frees entries from an old generation and entries that have expired or
 * aged out.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	/* skip the division when the timeout is 0 or 1 */
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		/* per-chain timeout; halved for every entry that is kept */
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		/* unlocked peek: do not take the lock for an empty bucket */
		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			/* entry belongs to an invalidated generation: unlink and free */
			if (rth->rt_genid != atomic_read(&rt_genid)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					/* not yet expired: keep it and move on */
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				/* still referenced or too young: keep it and move on */
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
}
 767
 768/*
 769 * rt_worker_func() is run in process context.
 770 * we call rt_check_expire() to scan part of the hash table
 771 */
 772static void rt_worker_func(struct work_struct *work)
 773{
 774	rt_check_expire();
 775	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 776}
 777
 778/*
 779 * Pertubation of rt_genid by a small quantity [1..256]
 780 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 781 * many times (2^24) without giving recent rt_genid.
 782 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 783 */
 784static void rt_cache_invalidate(void)
 785{
 786	unsigned char shuffle;
 787
 788	get_random_bytes(&shuffle, sizeof(shuffle));
 789	atomic_add(shuffle + 1U, &rt_genid);
 790}
 791
 792/*
 793 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 794 * delay >= 0 : invalidate & flush cache (can be long)
 795 */
 796void rt_cache_flush(int delay)
 797{
 798	rt_cache_invalidate();
 799	if (delay >= 0)
 800		rt_do_flush(!in_softirq());
 801}
 802
 803/*
 804 * We change rt_genid and let gc do the cleanup
 805 */
 806static void rt_secret_rebuild(unsigned long dummy)
 807{
 808	rt_cache_invalidate();
 809	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 810}
 811
/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an equilibrium
   point, where the number of aged-off entries stays approximately equal
   to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it is reduced to limit the cache size.
 */
 824
 825static int rt_garbage_collect(struct dst_ops *ops)
 826{
 827	static unsigned long expire = RT_GC_TIMEOUT;
 828	static unsigned long last_gc;
 829	static int rover;
 830	static int equilibrium;
 831	struct rtable *rth, **rthp;
 832	unsigned long now = jiffies;
 833	int goal;
 834
 835	/*
 836	 * Garbage collection is pretty expensive,
 837	 * do not make it too frequently.
 838	 */
 839
 840	RT_CACHE_STAT_INC(gc_total);
 841
 842	if (now - last_gc < ip_rt_gc_min_interval &&
 843	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 844		RT_CACHE_STAT_INC(gc_ignored);
 845		goto out;
 846	}
 847
 848	/* Calculate number of entries, which we want to expire now. */
 849	goal = atomic_read(&ipv4_dst_ops.entries) -
 850		(ip_rt_gc_elasticity << rt_hash_log);
 851	if (goal <= 0) {
 852		if (equilibrium < ipv4_dst_ops.gc_thresh)
 853			equilibrium = ipv4_dst_ops.gc_thresh;
 854		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 855		if (goal > 0) {
 856			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 857			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 858		}
 859	} else {
 860		/* We are in dangerous area. Try to reduce cache really
 861		 * aggressively.
 862		 */
 863		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 864		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 865	}
 866
 867	if (now - last_gc >= ip_rt_gc_min_interval)
 868		last_gc = now;
 869
 870	if (goal <= 0) {
 871		equilibrium += goal;
 872		goto work_done;
 873	}
 874
 875	do {
 876		int i, k;
 877
 878		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 879			unsigned long tmo = expire;
 880
 881			k = (k + 1) & rt_hash_mask;
 882			rthp = &rt_hash_table[k].chain;
 883			spin_lock_bh(rt_hash_lock_addr(k));
 884			while ((rth = *rthp) != NULL) {
 885				if (rth->rt_genid == atomic_read(&rt_genid) &&
 886					!rt_may_expire(rth, tmo, expire)) {
 887					tmo >>= 1;
 888					rthp = &rth->u.dst.rt_next;
 889					continue;
 890				}
 891				*rthp = rth->u.dst.rt_next;
 892				rt_free(rth);
 893				goal--;
 894			}
 895			spin_unlock_bh(rt_hash_lock_addr(k));
 896			if (goal <= 0)
 897				break;
 898		}
 899		rover = k;
 900
 901		if (goal <= 0)
 902			goto work_done;
 903
 904		/* Goal is not achieved. We stop process if:
 905
 906		   - if expire reduced to zero. Otherwise, expire is halfed.
 907		   - if table is not full.
 908		   - if we are called from interrupt.
 909		   - jiffies check is just fallback/debug loop breaker.
 910		     We will not spin here for long time in any case.
 911		 */
 912
 913		RT_CACHE_STAT_INC(gc_goal_miss);
 914
 915		if (expire == 0)
 916			break;
 917
 918		expire >>= 1;
 919#if RT_CACHE_DEBUG >= 2
 920		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 921				atomic_read(&ipv4_dst_ops.entries), goal, i);
 922#endif
 923
 924		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 925			goto out;
 926	} while (!in_softirq() && time_before_eq(jiffies, now));
 927
 928	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 929		goto out;
 930	if (net_ratelimit())
 931		printk(KERN_WARNING "dst cache overflow\n");
 932	RT_CACHE_STAT_INC(gc_dst_overflow);
 933	return 1;
 934
 935work_done:
 936	expire += ip_rt_gc_min_interval;
 937	if (expire > ip_rt_gc_timeout ||
 938	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 939		expire = ip_rt_gc_timeout;
 940#if RT_CACHE_DEBUG >= 2
 941	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 942			atomic_read(&ipv4_dst_ops.entries), goal, rover);
 943#endif
 944out:	return 0;
 945}
 946
 947static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 948{
 949	struct rtable	*rth, **rthp;
 950	unsigned long	now;
 951	struct rtable *cand, **candp;
 952	u32 		min_score;
 953	int		chain_length;
 954	int attempts = !in_softirq();
 955
 956restart:
 957	chain_length = 0;
 958	min_score = ~(u32)0;
 959	cand = NULL;
 960	candp = NULL;
 961	now = jiffies;
 962
 963	rthp = &rt_hash_table[hash].chain;
 964
 965	spin_lock_bh(rt_hash_lock_addr(hash));
 966	while ((rth = *rthp) != NULL) {
 967		if (rth->rt_genid != atomic_read(&rt_genid)) {
 968			*rthp = rth->u.dst.rt_next;
 969			rt_free(rth);
 970			continue;
 971		}
 972		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 973			/* Put it first */
 974			*rthp = rth->u.dst.rt_next;
 975			/*
 976			 * Since lookup is lockfree, the deletion
 977			 * must be visible to another weakly ordered CPU before
 978			 * the insertion at the start of the hash chain.
 979			 */
 980			rcu_assign_pointer(rth->u.dst.rt_next,
 981					   rt_hash_table[hash].chain);
 982			/*
 983			 * Since lookup is lockfree, the update writes
 984			 * must be ordered for consistency on SMP.
 985			 */
 986			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 987
 988			dst_use(&rth->u.dst, now);
 989			spin_unlock_bh(rt_hash_lock_addr(hash));
 990
 991			rt_drop(rt);
 992			*rp = rth;
 993			return 0;
 994		}
 995
 996		if (!atomic_read(&rth->u.dst.__refcnt)) {
 997			u32 score = rt_score(rth);
 998
 999			if (score <= min_score) {
1000				cand = rth;
1001				candp = rthp;
1002				min_score = score;
1003			}
1004		}
1005
1006		chain_length++;
1007
1008		rthp = &rth->u.dst.rt_next;
1009	}
1010
1011	if (cand) {
1012		/* ip_rt_gc_elasticity used to be average length of chain
1013		 * length, when exceeded gc becomes really aggressive.
1014		 *
1015		 * The second limit is less certain. At the moment it allows
1016		 * only 2 entries per bucket. We will see.
1017		 */
1018		if (chain_length > ip_rt_gc_elasticity) {
1019			*candp = cand->u.dst.rt_next;
1020			rt_free(cand);
1021		}
1022	}
1023
1024	/* Try to bind route to arp only if it is output
1025	   route or unicast forwarding path.
1026	 */
1027	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1028		int err = arp_bind_neighbour(&rt->u.dst);
1029		if (err) {
1030			spin_unlock_bh(rt_hash_lock_addr(hash));
1031
1032			if (err != -ENOBUFS) {
1033				rt_drop(rt);
1034				return err;
1035			}
1036
1037			/* Neighbour tables are full and nothing
1038			   can be released. Try to shrink route cache,
1039			   it is most likely it holds some neighbour records.
1040			 */
1041			if (attempts-- > 0) {
1042				int saved_elasticity = ip_rt_gc_elasticity;
1043				int saved_int = ip_rt_gc_min_interval;
1044				ip_rt_gc_elasticity	= 1;
1045				ip_rt_gc_min_interval	= 0;
1046				rt_garbage_collect(&ipv4_dst_ops);
1047				ip_rt_gc_min_interval	= saved_int;
1048				ip_rt_gc_elasticity	= saved_elasticity;
1049				goto restart;
1050			}
1051
1052			if (net_ratelimit())
1053				printk(KERN_WARNING "Neighbour table overflow.\n");
1054			rt_drop(rt);
1055			return -ENOBUFS;
1056		}
1057	}
1058
1059	rt->u.dst.rt_next = rt_hash_table[hash].chain;
1060#if RT_CACHE_DEBUG >= 2
1061	if (rt->u.dst.rt_next) {
1062		struct rtable *trt;
1063		printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1064		       NIPQUAD(rt->rt_dst));
1065		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1066			printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1067		printk("\n");
1068	}
1069#endif
1070	rt_hash_table[hash].chain = rt;
1071	spin_unlock_bh(rt_hash_lock_addr(hash));
1072	*rp = rt;
1073	return 0;
1074}
1075
1076void rt_bind_peer(struct rtable *rt, int create)
1077{
1078	static DEFINE_SPINLOCK(rt_peer_lock);
1079	struct inet_peer *peer;
1080
1081	peer = inet_getpeer(rt->rt_dst, create);
1082
1083	spin_lock_bh(&rt_peer_lock);
1084	if (rt->peer == NULL) {
1085		rt->peer = peer;
1086		peer = NULL;
1087	}
1088	spin_unlock_bh(&rt_peer_lock);
1089	if (peer)
1090		inet_putpeer(peer);
1091}
1092
1093/*
1094 * Peer allocation may fail only in serious out-of-memory conditions.  However
1095 * we still can generate some output.
1096 * Random ID selection looks a bit dangerous because we have no chances to
1097 * select ID being unique in a reasonable period of time.
1098 * But broken packet identifier may be better than no packet at all.
1099 */
1100static void ip_select_fb_ident(struct iphdr *iph)
1101{
1102	static DEFINE_SPINLOCK(ip_fb_id_lock);
1103	static u32 ip_fallback_id;
1104	u32 salt;
1105
1106	spin_lock_bh(&ip_fb_id_lock);
1107	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1108	iph->id = htons(salt & 0xFFFF);
1109	ip_fallback_id = salt;
1110	spin_unlock_bh(&ip_fb_id_lock);
1111}
1112
1113void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114{
1115	struct rtable *rt = (struct rtable *) dst;
1116
1117	if (rt) {
1118		if (rt->peer == NULL)
1119			rt_bind_peer(rt, 1);
1120
1121		/* If peer is attached to destination, it is never detached,
1122		   so that we need not to grab a lock to dereference it.
1123		 */
1124		if (rt->peer) {
1125			iph->id = htons(inet_getid(rt->peer, more));
1126			return;
1127		}
1128	} else
1129		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1130		       __builtin_return_address(0));
1131
1132	ip_select_fb_ident(iph);
1133}
1134
1135static void rt_del(unsigned hash, struct rtable *rt)
1136{
1137	struct rtable **rthp, *aux;
1138
1139	rthp = &rt_hash_table[hash].chain;
1140	spin_lock_bh(rt_hash_lock_addr(hash));
1141	ip_rt_put(rt);
1142	while ((aux = *rthp) != NULL) {
1143		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1144			*rthp = aux->u.dst.rt_next;
1145			rt_free(aux);
1146			continue;
1147		}
1148		rthp = &aux->u.dst.rt_next;
1149	}
1150	spin_unlock_bh(rt_hash_lock_addr(hash));
1151}
1152
1153void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1154		    __be32 saddr, struct net_device *dev)
1155{
1156	int i, k;
1157	struct in_device *in_dev = in_dev_get(dev);
1158	struct rtable *rth, **rthp;
1159	__be32  skeys[2] = { saddr, 0 };
1160	int  ikeys[2] = { dev->ifindex, 0 };
1161	struct netevent_redirect netevent;
1162	struct net *net;
1163
1164	if (!in_dev)
1165		return;
1166
1167	net = dev_net(dev);
1168	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1169	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1170	    || ipv4_is_zeronet(new_gw))
1171		goto reject_redirect;
1172
1173	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1174		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1175			goto reject_redirect;
1176		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1177			goto reject_redirect;
1178	} else {
1179		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1180			goto reject_redirect;
1181	}
1182
1183	for (i = 0; i < 2; i++) {
1184		for (k = 0; k < 2; k++) {
1185			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186
1187			rthp=&rt_hash_table[hash].chain;
1188
1189			rcu_read_lock();
1190			while ((rth = rcu_dereference(*rthp)) != NULL) {
1191				struct rtable *rt;
1192
1193				if (rth->fl.fl4_dst != daddr ||
1194				    rth->fl.fl4_src != skeys[i] ||
1195				    rth->fl.oif != ikeys[k] ||
1196				    rth->fl.iif != 0 ||
1197				    rth->rt_genid != atomic_read(&rt_genid) ||
1198				    !net_eq(dev_net(rth->u.dst.dev), net)) {
1199					rthp = &rth->u.dst.rt_next;
1200					continue;
1201				}
1202
1203				if (rth->rt_dst != daddr ||
1204				    rth->rt_src != saddr ||
1205				    rth->u.dst.error ||
1206				    rth->rt_gateway != old_gw ||
1207				    rth->u.dst.dev != dev)
1208					break;
1209
1210				dst_hold(&rth->u.dst);
1211				rcu_read_unlock();
1212
1213				rt = dst_alloc(&ipv4_dst_ops);
1214				if (rt == NULL) {
1215					ip_rt_put(rth);
1216					in_dev_put(in_dev);
1217					return;
1218				}
1219
1220				/* Copy all the information. */
1221				*rt = *rth;
1222				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1223				rt->u.dst.__use		= 1;
1224				atomic_set(&rt->u.dst.__refcnt, 1);
1225				rt->u.dst.child		= NULL;
1226				if (rt->u.dst.dev)
1227					dev_hold(rt->u.dst.dev);
1228				if (rt->idev)
1229					in_dev_hold(rt->idev);
1230				rt->u.dst.obsolete	= 0;
1231				rt->u.dst.lastuse	= jiffies;
1232				rt->u.dst.path		= &rt->u.dst;
1233				rt->u.dst.neighbour	= NULL;
1234				rt->u.dst.hh		= NULL;
1235				rt->u.dst.xfrm		= NULL;
1236				rt->rt_genid		= atomic_read(&rt_genid);
1237				rt->rt_flags		|= RTCF_REDIRECTED;
1238
1239				/* Gateway is different ... */
1240				rt->rt_gateway		= new_gw;
1241
1242				/* Redirect received -> path was valid */
1243				dst_confirm(&rth->u.dst);
1244
1245				if (rt->peer)
1246					atomic_inc(&rt->peer->refcnt);
1247
1248				if (arp_bind_neighbour(&rt->u.dst) ||
1249				    !(rt->u.dst.neighbour->nud_state &
1250					    NUD_VALID)) {
1251					if (rt->u.dst.neighbour)
1252						neigh_event_send(rt->u.dst.neighbour, NULL);
1253					ip_rt_put(rth);
1254					rt_drop(rt);
1255					goto do_next;
1256				}
1257
1258				netevent.old = &rth->u.dst;
1259				netevent.new = &rt->u.dst;
1260				call_netevent_notifiers(NETEVENT_REDIRECT,
1261							&netevent);
1262
1263				rt_del(hash, rth);
1264				if (!rt_intern_hash(hash, rt, &rt))
1265					ip_rt_put(rt);
1266				goto do_next;
1267			}
1268			rcu_read_unlock();
1269		do_next:
1270			;
1271		}
1272	}
1273	in_dev_put(in_dev);
1274	return;
1275
1276reject_redirect:
1277#ifdef CONFIG_IP_ROUTE_VERBOSE
1278	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1279		printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1280			NIPQUAD_FMT " ignored.\n"
1281			"  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1282		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1283		       NIPQUAD(saddr), NIPQUAD(daddr));
1284#endif
1285	in_dev_put(in_dev);
1286}
1287
1288static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289{
1290	struct rtable *rt = (struct rtable *)dst;
1291	struct dst_entry *ret = dst;
1292
1293	if (rt) {
1294		if (dst->obsolete) {
1295			ip_rt_put(rt);
1296			ret = NULL;
1297		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1298			   rt->u.dst.expires) {
1299			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300						rt->fl.oif);
1301#if RT_CACHE_DEBUG >= 1
1302			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1303					  NIPQUAD_FMT "/%02x dropped\n",
1304				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1305#endif
1306			rt_del(hash, rt);
1307			ret = NULL;
1308		}
1309	}
1310	return ret;
1311}
1312
1313/*
1314 * Algorithm:
1315 *	1. The first ip_rt_redirect_number redirects are sent
1316 *	   with exponential backoff, then we stop sending them at all,
1317 *	   assuming that the host ignores our redirects.
1318 *	2. If we did not see packets requiring redirects
1319 *	   during ip_rt_redirect_silence, we assume that the host
1320 *	   forgot redirected route and start to send redirects again.
1321 *
1322 * This algorithm is much cheaper and more intelligent than dumb load limiting
1323 * in icmp.c.
1324 *
1325 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1326 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1327 */
1328
1329void ip_rt_send_redirect(struct sk_buff *skb)
1330{
1331	struct rtable *rt = skb->rtable;
1332	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1333
1334	if (!in_dev)
1335		return;
1336
1337	if (!IN_DEV_TX_REDIRECTS(in_dev))
1338		goto out;
1339
1340	/* No redirected packets during ip_rt_redirect_silence;
1341	 * reset the algorithm.
1342	 */
1343	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1344		rt->u.dst.rate_tokens = 0;
1345
1346	/* Too many ignored redirects; do not send anything
1347	 * set u.dst.rate_last to the last seen redirected packet.
1348	 */
1349	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1350		rt->u.dst.rate_last = jiffies;
1351		goto out;
1352	}
1353
1354	/* Check for load limit; set rate_last to the latest sent
1355	 * redirect.
1356	 */
1357	if (rt->u.dst.rate_tokens == 0 ||
1358	    time_after(jiffies,
1359		       (rt->u.dst.rate_last +
1360			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1361		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1362		rt->u.dst.rate_last = jiffies;
1363		++rt->u.dst.rate_tokens;
1364#ifdef CONFIG_IP_ROUTE_VERBOSE
1365		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1366		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1367		    net_ratelimit())
1368			printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1369				"redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1370				NIPQUAD(rt->rt_src), rt->rt_iif,
1371				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1372#endif
1373	}
1374out:
1375	in_dev_put(in_dev);
1376}
1377
1378static int ip_error(struct sk_buff *skb)
1379{
1380	struct rtable *rt = skb->rtable;
1381	unsigned long now;
1382	int code;
1383
1384	switch (rt->u.dst.error) {
1385		case EINVAL:
1386		default:
1387			goto out;
1388		case EHOSTUNREACH:
1389			code = ICMP_HOST_UNREACH;
1390			break;
1391		case ENETUNREACH:
1392			code = ICMP_NET_UNREACH;
1393			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1394			break;
1395		case EACCES:
1396			code = ICMP_PKT_FILTERED;
1397			break;
1398	}
1399
1400	now = jiffies;
1401	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1402	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1403		rt->u.dst.rate_tokens = ip_rt_error_burst;
1404	rt->u.dst.rate_last = now;
1405	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1406		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1407		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1408	}
1409
1410out:	kfree_skb(skb);
1411	return 0;
1412}
1413
/*
 *	MTU plateau table, in descending order.  The last two values are
 *	not from the RFC but are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when @old_mtu
 * is not above any table entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned short plateau = 68;
	int idx;

	for (idx = 0; idx < ARRAY_SIZE(mtu_plateau); idx++) {
		if (mtu_plateau[idx] < old_mtu) {
			plateau = mtu_plateau[idx];
			break;
		}
	}
	return plateau;
}
1431
1432unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1433				 unsigned short new_mtu,
1434				 struct net_device *dev)
1435{
1436	int i, k;
1437	unsigned short old_mtu = ntohs(iph->tot_len);
1438	struct rtable *rth;
1439	int  ikeys[2] = { dev->ifindex, 0 };
1440	__be32  skeys[2] = { iph->saddr, 0, };
1441	__be32  daddr = iph->daddr;
1442	unsigned short est_mtu = 0;
1443
1444	if (ipv4_config.no_pmtu_disc)
1445		return 0;
1446
1447	for (k = 0; k < 2; k++) {
1448		for (i = 0; i < 2; i++) {
1449			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1450
1451			rcu_read_lock();
1452			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1453			     rth = rcu_dereference(rth->u.dst.rt_next)) {
1454				unsigned short mtu = new_mtu;
1455
1456				if (rth->fl.fl4_dst != daddr ||
1457				    rth->fl.fl4_src != skeys[i] ||
1458				    rth->rt_dst != daddr ||
1459				    rth->rt_src != iph->saddr ||
1460				    rth->fl.oif != ikeys[k] ||
1461				    rth->fl.iif != 0 ||
1462				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1463				    !net_eq(dev_net(rth->u.dst.dev), net) ||
1464				    rth->rt_genid != atomic_read(&rt_genid))
1465					continue;
1466
1467				if (new_mtu < 68 || new_mtu >= old_mtu) {
1468
1469					/* BSD 4.2 compatibility hack :-( */
1470					if (mtu == 0 &&
1471					    old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1472					    old_mtu >= 68 + (iph->ihl << 2))
1473						old_mtu -= iph->ihl << 2;
1474
1475					mtu = guess_mtu(old_mtu);
1476				}
1477				if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1478					if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1479						dst_confirm(&rth->u.dst);
1480						if (mtu < ip_rt_min_pmtu) {
1481							mtu = ip_rt_min_pmtu;
1482							rth->u.dst.metrics[RTAX_LOCK-1] |=
1483								(1 << RTAX_MTU);
1484						}
1485						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1486						dst_set_expires(&rth->u.dst,
1487							ip_rt_mtu_expires);
1488					}
1489					est_mtu = mtu;
1490				}
1491			}
1492			rcu_read_unlock();
1493		}
1494	}
1495	return est_mtu ? : new_mtu;
1496}
1497
1498static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1499{
1500	if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1501	    !(dst_metric_locked(dst, RTAX_MTU))) {
1502		if (mtu < ip_rt_min_pmtu) {
1503			mtu = ip_rt_min_pmtu;
1504			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1505		}
1506		dst->metrics[RTAX_MTU-1] = mtu;
1507		dst_set_expires(dst, ip_rt_mtu_expires);
1508		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1509	}
1510}
1511
1512static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1513{
1514	return NULL;
1515}
1516
1517static void ipv4_dst_destroy(struct dst_entry *dst)
1518{
1519	struct rtable *rt = (struct rtable *) dst;
1520	struct inet_peer *peer = rt->peer;
1521	struct in_device *idev = rt->idev;
1522
1523	if (peer) {
1524		rt->peer = NULL;
1525		inet_putpeer(peer);
1526	}
1527
1528	if (idev) {
1529		rt->idev = NULL;
1530		in_dev_put(idev);
1531	}
1532}
1533
1534static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1535			    int how)
1536{
1537	struct rtable *rt = (struct rtable *) dst;
1538	struct in_device *idev = rt->idev;
1539	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1540		struct in_device *loopback_idev =
1541			in_dev_get(dev_net(dev)->loopback_dev);
1542		if (loopback_idev) {
1543			rt->idev = loopback_idev;
1544			in_dev_put(idev);
1545		}
1546	}
1547}
1548
1549static void ipv4_link_failure(struct sk_buff *skb)
1550{
1551	struct rtable *rt;
1552
1553	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1554
1555	rt = skb->rtable;
1556	if (rt)
1557		dst_set_expires(&rt->u.dst, 0);
1558}
1559
1560static int ip_rt_bug(struct sk_buff *skb)
1561{
1562	printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1563		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1564		skb->dev ? skb->dev->name : "?");
1565	kfree_skb(skb);
1566	return 0;
1567}
1568
1569/*
1570   We do not cache source address of outgoing interface,
1571   because it is used only by IP RR, TS and SRR options,
1572   so that it out of fast path.
1573
1574   BTW remember: "addr" is allowed to be not aligned
1575   in IP options!
1576 */
1577
1578void ip_rt_get_source(u8 *addr, struct rtable *rt)
1579{
1580	__be32 src;
1581	struct fib_result res;
1582
1583	if (rt->fl.iif == 0)
1584		src = rt->rt_src;
1585	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1586		src = FIB_RES_PREFSRC(res);
1587		fib_res_put(&res);
1588	} else
1589		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1590					RT_SCOPE_UNIVERSE);
1591	memcpy(addr, &src, 4);
1592}
1593
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time: a half
 * is taken from @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	__u32 *classid = &rt->u.dst.tclassid;

	if ((*classid & 0xFFFF) == 0)
		*classid |= tag & 0xFFFF;
	if ((*classid & 0xFFFF0000) == 0)
		*classid |= tag & 0xFFFF0000;
}
#endif
1603
1604static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1605{
1606	struct fib_info *fi = res->fi;
1607
1608	if (fi) {
1609		if (FIB_RES_GW(*res) &&
1610		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1611			rt->rt_gateway = FIB_RES_GW(*res);
1612		memcpy(rt->u.dst.metrics, fi->fib_metrics,
1613		       sizeof(rt->u.dst.metrics));
1614		if (fi->fib_mtu == 0) {
1615			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1616			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1617			    rt->rt_gateway != rt->rt_dst &&
1618			    rt->u.dst.dev->mtu > 576)
1619				rt->u.dst.metrics[RTAX_MTU-1] = 576;
1620		}
1621#ifdef CONFIG_NET_CLS_ROUTE
1622		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1623#endif
1624	} else
1625		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1626
1627	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1628		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1629	if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1630		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1631	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1632		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1633				       ip_rt_min_advmss);
1634	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1635		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1636
1637#ifdef CONFIG_NET_CLS_ROUTE
1638#ifdef CONFIG_IP_MULTIPLE_TABLES
1639	set_class_tag(rt, fib_rules_tclass(res));
1640#endif
1641	set_class_tag(rt, itag);
1642#endif
1643	rt->rt_type = res->type;
1644}
1645
1646static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1647				u8 tos, struct net_device *dev, int our)
1648{
1649	unsigned hash;
1650	struct rtable *rth;
1651	__be32 spec_dst;
1652	struct in_device *in_dev = in_dev_get(dev);
1653	u32 itag = 0;
1654
1655	/* Primary sanity checks. */
1656
1657	if (in_dev == NULL)
1658		return -EINVAL;
1659
1660	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1661	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1662		goto e_inval;
1663
1664	if (ipv4_is_zeronet(saddr)) {
1665		if (!ipv4_is_local_multicast(daddr))
1666			goto e_inval;
1667		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1668	} else if (fib_validate_source(saddr, 0, tos, 0,
1669					dev, &spec_dst, &itag) < 0)
1670		goto e_inval;
1671
1672	rth = dst_alloc(&ipv4_dst_ops);
1673	if (!rth)
1674		goto e_nobufs;
1675
1676	rth->u.dst.output= ip_rt_bug;
1677
1678	atomic_set(&rth->u.dst.__refcnt, 1);
1679	rth->u.dst.flags= DST_HOST;
1680	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1681		rth->u.dst.flags |= DST_NOPOLICY;
1682	rth->fl.fl4_dst	= daddr;
1683	rth->rt_dst	= daddr;
1684	rth->fl.fl4_tos	= tos;
1685	rth->fl.mark    = skb->mark;
1686	rth->fl.fl4_src	= saddr;
1687	rth->rt_src	= saddr;
1688#ifdef CONFIG_NET_CLS_ROUTE
1689	rth->u.dst.tclassid = itag;
1690#endif
1691	rth->rt_iif	=
1692	rth->fl.iif	= dev->ifindex;
1693	rth->u.dst.dev	= init_net.loopback_dev;
1694	dev_hold(rth->u.dst.dev);
1695	rth->idev	= in_dev_get(rth->u.dst.dev);
1696	rth->fl.oif	= 0;
1697	rth->rt_gateway	= daddr;
1698	rth->rt_spec_dst= spec_dst;
1699	rth->rt_genid	= atomic_read(&rt_genid);
1700	rth->rt_flags	= RTCF_MULTICAST;
1701	rth->rt_type	= RTN_MULTICAST;
1702	if (our) {
1703		rth->u.dst.input= ip_local_deliver;
1704		rth->rt_flags |= RTCF_LOCAL;
1705	}
1706
1707#ifdef CONFIG_IP_MROUTE
1708	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1709		rth->u.dst.input = ip_mr_input;
1710#endif
1711	RT_CACHE_STAT_INC(in_slow_mc);
1712
1713	in_dev_put(in_dev);
1714	hash = rt_hash(daddr, saddr, dev->ifindex);
1715	return rt_intern_hash(hash, rth, &skb->rtable);
1716
1717e_nobufs:
1718	in_dev_put(in_dev);
1719	return -ENOBUFS;
1720
1721e_inval:
1722	in_dev_put(in_dev);
1723	return -EINVAL;
1724}
1725
1726
1727static void ip_handle_martian_source(struct net_device *dev,
1728				     struct in_device *in_dev,
1729				     struct sk_buff *skb,
1730				     __be32 daddr,
1731				     __be32 saddr)
1732{
1733	RT_CACHE_STAT_INC(in_martian_src);
1734#ifdef CONFIG_IP_ROUTE_VERBOSE
1735	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1736		/*
1737		 *	RFC1812 recommendation, if source is martian,
1738		 *	the only hint is MAC header.
1739		 */
1740		printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1741			NIPQUAD_FMT", on dev %s\n",
1742			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1743		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1744			int i;
1745			const unsigned char *p = skb_mac_header(skb);
1746			printk(KERN_WARNING "ll header: ");
1747			for (i = 0; i < dev->hard_header_len; i++, p++) {
1748				printk("%02x", *p);
1749				if (i < (dev->hard_header_len - 1))
1750					printk(":");
1751			}
1752			printk("\n");
1753		}
1754	}
1755#endif
1756}
1757
1758static int __mkroute_input(struct sk_buff *skb,
1759			   struct fib_result *res,
1760			   struct in_device *in_dev,
1761			   __be32 daddr, __be32 saddr, u32 tos,
1762			   struct rtable **result)
1763{
1764
1765	struct rtable *rth;
1766	int err;
1767	struct in_device *out_dev;
1768	unsigned flags = 0;
1769	__be32 spec_dst;
1770	u32 itag;
1771
1772	/* get a working reference to the output device */
1773	out_dev = in_dev_get(FIB_RES_DEV(*res));
1774	if (out_dev == NULL) {
1775		if (net_ratelimit())
1776			printk(KERN_CRIT "Bug in ip_route_input" \
1777			       "_slow(). Please, report\n");
1778		return -EINVAL;
1779	}
1780
1781
1782	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1783				  in_dev->dev, &spec_dst, &itag);
1784	if (err < 0) {
1785		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1786					 saddr);
1787
1788		err = -EINVAL;
1789		goto cleanup;
1790	}
1791
1792	if (err)
1793		flags |= RTCF_DIRECTSRC;
1794
1795	if (out_dev == in_dev && err &&
1796	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1797	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1798		flags |= RTCF_DOREDIRECT;
1799
1800	if (skb->protocol != htons(ETH_P_IP)) {
1801		/* Not IP (i.e. ARP). Do not create route, if it is
1802		 * invalid for proxy arp. DNAT routes are always valid.
1803		 */
1804		if (out_dev == in_dev) {
1805			err = -EINVAL;
1806			goto cleanup;
1807		}
1808	}
1809
1810
1811	rth = dst_alloc(&ipv4_dst_ops);
1812	if (!rth) {
1813		err = -ENOBUFS;
1814		goto cleanup;
1815	}
1816
1817	atomic_set(&rth->u.dst.__refcnt, 1);
1818	rth->u.dst.flags= DST_HOST;
1819	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1820		rth->u.dst.flags |= DST_NOPOLICY;
1821	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1822		rth->u.dst.flags |= DST_NOXFRM;
1823	rth->fl.fl4_dst	= daddr;
1824	rth->rt_dst	= daddr;
1825	rth->fl.fl4_tos	= tos;
1826	rth->fl.mark    = skb->mark;
1827	rth->fl.fl4_src	= saddr;
1828	rth->rt_src	= saddr;
1829	rth->rt_gateway	= daddr;
1830	rth->rt_iif 	=
1831		rth->fl.iif	= in_dev->dev->ifindex;
1832	rth->u.dst.dev	= (out_dev)->dev;
1833	dev_hold(rth->u.dst.dev);
1834	rth->idev	= in_dev_get(rth->u.dst.dev);
1835	rth->fl.oif 	= 0;
1836	rth->rt_spec_dst= spec_dst;
1837
1838	rth->u.dst.input = ip_forward;
1839	rth->u.dst.output = ip_output;
1840	rth->rt_genid = atomic_read(&rt_genid);
1841
1842	rt_set_nexthop(rth, res, itag);
1843
1844	rth->rt_flags = flags;
1845
1846	*result = rth;
1847	err = 0;
1848 cleanup:
1849	/* release the working reference to the output device */
1850	in_dev_put(out_dev);
1851	return err;
1852}
1853
1854static int ip_mkroute_input(struct sk_buff *skb,
1855			    struct fib_result *res,
1856			    const struct flowi *fl,
1857			    struct in_device *in_dev,
1858			    __be32 daddr, __be32 saddr, u32 tos)
1859{
1860	struct rtable* rth = NULL;
1861	int err;
1862	unsigned hash;
1863
1864#ifdef CONFIG_IP_ROUTE_MULTIPATH
1865	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1866		fib_select_multipath(fl, res);
1867#endif
1868
1869	/* create a routing cache entry */
1870	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1871	if (err)
1872		return err;
1873
1874	/* put it into the cache */
1875	hash = rt_hash(daddr, saddr, fl->iif);
1876	return rt_intern_hash(hash, rth, &skb->rtable);
1877}
1878
1879/*
1880 *	NOTE. We drop all the packets that has local source
1881 *	addresses, because every properly looped back packet
1882 *	must have correct destination already attached by output routine.
1883 *
1884 *	Such approach solves two big problems:
1885 *	1. Not simplex devices are handled properly.
1886 *	2. IP spoofing attempts are filtered with 100% of guarantee.
1887 */
1888
1889static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1890			       u8 tos, struct net_device *dev)
1891{
1892	struct fib_result res;
1893	struct in_device *in_dev = in_dev_get(dev);
1894	struct flowi fl = { .nl_u = { .ip4_u =
1895				      { .daddr = daddr,
1896					.saddr = saddr,
1897					.tos = tos,
1898					.scope = RT_SCOPE_UNIVERSE,
1899				      } },
1900			    .mark = skb->mark,
1901			    .iif = dev->ifindex };
1902	unsigned	flags = 0;
1903	u32		itag = 0;
1904	struct rtable * rth;
1905	unsigned	hash;
1906	__be32		spec_dst;
1907	int		err = -EINVAL;
1908	int		free_res = 0;
1909	struct net    * net = dev_net(dev);
1910
1911	/* IP on this device is disabled. */
1912
1913	if (!in_dev)
1914		goto out;
1915
1916	/* Check for the most weird martians, which can be not detected
1917	   by fib_lookup.
1918	 */
1919
1920	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1921	    ipv4_is_loopback(saddr))
1922		goto martian_source;
1923
1924	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1925		goto brd_input;
1926
1927	/* Accept zero addresses only to limited broadcast;
1928	 * I even do not know to fix it or not. Waiting for complains :-)
1929	 */
1930	if (ipv4_is_zeronet(saddr))
1931		goto martian_source;
1932
1933	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1934	    ipv4_is_loopback(daddr))
1935		goto martian_destination;
1936
1937	/*
1938	 *	Now we are ready to route packet.
1939	 */
1940	if ((err = fib_lookup(net, &fl, &res)) != 0) {
1941		if (!IN_DEV_FORWARD(in_dev))
1942			goto e_hostunreach;
1943		goto no_route;
1944	}
1945	free_res = 1;
1946
1947	RT_CACHE_STAT_INC(in_slow_tot);
1948
1949	if (res.type == RTN_BROADCAST)
1950		goto brd_input;
1951
1952	if (res.type == RTN_LOCAL) {
1953		int result;
1954		result = fib_validate_source(saddr, daddr, tos,
1955					     net->loopback_dev->ifindex,
1956					     dev, &spec_dst, &itag);
1957		if (result < 0)
1958			goto martian_source;
1959		if (result)
1960			flags |= RTCF_DIRECTSRC;
1961		spec_dst = daddr;
1962		goto local_input;
1963	}
1964
1965	if (!IN_DEV_FORWARD(in_dev))
1966		goto e_hostunreach;
1967	if (res.type != RTN_UNICAST)
1968		goto martian_destination;
1969
1970	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971done:
1972	in_dev_put(in_dev);
1973	if (free_res)
1974		fib_res_put(&res);
1975out:	return err;
1976
1977brd_input:
1978	if (skb->protocol != htons(ETH_P_IP))
1979		goto e_inval;
1980
1981	if (ipv4_is_zeronet(saddr))
1982		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1983	else {
1984		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1985					  &itag);
1986		if (err < 0)
1987			goto martian_source;
1988		if (err)
1989			flags |= RTCF_DIRECTSRC;
1990	}
1991	flags |= RTCF_BROADCAST;
1992	res.type = RTN_BROADCAST;
1993	RT_CACHE_STAT_INC(in_brd);
1994
1995local_input:
1996	rth = dst_alloc(&ipv4_dst_ops);
1997	if (!rth)
1998		goto e_nobufs;
1999
2000	rth->u.dst.output= ip_rt_bug;
2001	rth->rt_genid = atomic_read(&rt_genid);
2002
2003	atomic_set(&rth->u.dst.__refcnt, 1);
2004	rth->u.dst.flags= DST_HOST;
2005	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2006		rth->u.dst.flags |= DST_NOPOLICY;
2007	rth->fl.fl4_dst	= daddr;
2008	rth->rt_dst	= daddr;
2009	rth->fl.fl4_tos	= tos;
2010	rth->fl.mark    = skb->mark;
2011	rth->fl.fl4_src	= saddr;
2012	rth->rt_src	= saddr;
2013#ifdef CONFIG_NET_CLS_ROUTE
2014	rth->u.dst.tclassid = itag;
2015#endif
2016	rth->rt_iif	=
2017	rth->fl.iif	= dev->ifindex;
2018	rth->u.dst.dev	= net->loopback_dev;
2019	dev_hold(rth->u.dst.dev);
2020	rth->idev	= in_dev_get(rth->u.dst.dev);
2021	rth->rt_gateway	= daddr;
2022	rth->rt_spec_dst= spec_dst;
2023	rth->u.dst.input= ip_local_deliver;
2024	rth->rt_flags 	= flags|RTCF_LOCAL;
2025	if (res.type == RTN_UNREACHABLE) {
2026		rth->u.dst.input= ip_error;
2027		rth->u.dst.error= -err;
2028		rth->rt_flags 	&= ~RTCF_LOCAL;
2029	}
2030	rth->rt_type	= res.type;
2031	hash = rt_hash(daddr, saddr, fl.iif);
2032	err = rt_intern_hash(hash, rth, &skb->rtable);
2033	goto done;
2034
2035no_route:
2036	RT_CACHE_STAT_INC(in_no_route);
2037	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2038	res.type = RTN_UNREACHABLE;
2039	if (err == -ESRCH)
2040		err = -ENETUNREACH;
2041	goto local_input;
2042
2043	/*
2044	 *	Do not cache martian addresses: they should be logged (RFC1812)
2045	 */
2046martian_destination:
2047	RT_CACHE_STAT_INC(in_martian_dst);
2048#ifdef CONFIG_IP_ROUTE_VERBOSE
2049	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2050		printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2051			NIPQUAD_FMT ", dev %s\n",
2052			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2053#endif
2054
2055e_hostunreach:
2056	err = -EHOSTUNREACH;
2057	goto done;
2058
2059e_inval:
2060	err = -EINVAL;
2061	goto done;
2062
2063e_nobufs:
2064	err = -ENOBUFS;
2065	goto done;
2066
2067martian_source:
2068	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2069	goto e_inval;
2070}
2071
2072int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2073		   u8 tos, struct net_device *dev)
2074{
2075	struct rtable * rth;
2076	unsigned	hash;
2077	int iif = dev->ifindex;
2078	struct net *net;
2079
2080	net = dev_net(dev);
2081	tos &= IPTOS_RT_MASK;
2082	hash = rt_hash(daddr, saddr, iif);
2083
2084	rcu_read_lock();
2085	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2086	     rth = rcu_dereference(rth->u.dst.rt_next)) {
2087		if (((rth->fl.fl4_dst ^ daddr) |
2088		     (rth->fl.fl4_src ^ saddr) |
2089		     (rth->fl.iif ^ iif) |
2090		     rth->fl.oif |
2091		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2092		    rth->fl.mark == skb->mark &&
2093		    net_eq(dev_net(rth->u.dst.dev), net) &&
2094		    rth->rt_genid == atomic_read(&rt_genid)) {
2095			dst_use(&rth->u.dst, jiffies);
2096			RT_CACHE_STAT_INC(in_hit);
2097			rcu_read_unlock();
2098			skb->rtable = rth;
2099			return 0;
2100		}
2101		RT_CACHE_STAT_INC(in_hlist_search);
2102	}
2103	rcu_read_unlock();
2104
2105	/* Multicast recognition logic is moved from route cache to here.
2106	   The problem was that too many Ethernet cards have broken/missing
2107	   hardware multicast filters :-( As result the host on multicasting
2108	   network acquires a lot of useless route cache entries, sort of
2109	   SDR messages from all the world. Now we try to get rid of them.
2110	   Really, provided software IP multicast filter is organized
2111	   reasonably (at least, hashed), it does not result in a slowdown
2112	   comparing with route cache reject entries.
2113	   Note, that multicast routers are not affected, because
2114	   route cache entry is created eventually.
2115	 */
2116	if (ipv4_is_multicast(daddr)) {
2117		struct in_device *in_dev;
2118
2119		rcu_read_lock();
2120		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2121			int our = ip_check_mc(in_dev, daddr, saddr,
2122				ip_hdr(skb)->protocol);
2123			if (our
2124#ifdef CONFIG_IP_MROUTE
2125			    || (!ipv4_is_local_multicast(daddr) &&
2126				IN_DEV_MFORWARD(in_dev))
2127#endif
2128			    ) {
2129				rcu_read_unlock();
2130				return ip_route_input_mc(skb, daddr, saddr,
2131							 tos, dev, our);
2132			}
2133		}
2134		rcu_read_unlock();
2135		return -EINVAL;
2136	}
2137	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2138}
2139
/*
 * Allocate and fill in an output route cache entry for a flow.
 *
 * @result:  receives the newly built rtable on success
 * @res:     FIB lookup result; res->type may be overwritten here and
 *           res->fi may be put and cleared for broadcast/multicast
 * @fl:      resolved flow key, source of rt_dst/rt_src/rt_gateway
 * @oldflp:  flow key as supplied by the caller, copied into rth->fl
 * @dev_out: output device; an extra reference is taken for the entry
 * @flags:   initial RTCF_* flags, extended below
 *
 * Returns 0 on success, -EINVAL when the source/destination is not
 * acceptable or dev_out has no in_device, -ENOBUFS when dst_alloc()
 * fails.  The entry is not inserted into the hash table here; see
 * ip_mkroute_output().
 */
2140static int __mkroute_output(struct rtable **result,
2141			    struct fib_result *res,
2142			    const struct flowi *fl,
2143			    const struct flowi *oldflp,
2144			    struct net_device *dev_out,
2145			    unsigned flags)
2146{
2147	struct rtable *rth;
2148	struct in_device *in_dev;
2149	u32 tos = RT_FL_TOS(oldflp);
2150	int err = 0;
2151
	/* A loopback source address is only accepted on a loopback device. */
2152	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2153		return -EINVAL;
2154
	/* Broadcast/multicast destinations override the supplied res->type. */
2155	if (fl->fl4_dst == htonl(0xFFFFFFFF))
2156		res->type = RTN_BROADCAST;
2157	else if (ipv4_is_multicast(fl->fl4_dst))
2158		res->type = RTN_MULTICAST;
2159	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2160		return -EINVAL;
2161
2162	if (dev_out->flags & IFF_LOOPBACK)
2163		flags |= RTCF_LOCAL;
2164
2165	/* get work reference to inet device */
2166	in_dev = in_dev_get(dev_out);
2167	if (!in_dev)
2168		return -EINVAL;
2169
2170	if (res->type == RTN_BROADCAST) {
2171		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		/* Broadcast routes do not keep the FIB info. */
2172		if (res->fi) {
2173			fib_info_put(res->fi);
2174			res->fi = NULL;
2175		}
2176	} else if (res->type == RTN_MULTICAST) {
2177		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		/* RTCF_LOCAL stays set only if ip_check_mc() reports a match. */
2178		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2179				 oldflp->proto))
2180			flags &= ~RTCF_LOCAL;
2181		/* If multicast route do not exist use
2182		   default one, but do not gateway in this case.
2183		   Yes, it is hack.
2184		 */
2185		if (res->fi && res->prefixlen < 4) {
2186			fib_info_put(res->fi);
2187			res->fi = NULL;
2188		}
2189	}
2190
2191
2192	rth = dst_alloc(&ipv4_dst_ops);
2193	if (!rth) {
2194		err = -ENOBUFS;
2195		goto cleanup;
2196	}
2197
2198	atomic_set(&rth->u.dst.__refcnt, 1);
2199	rth->u.dst.flags= DST_HOST;
2200	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2201		rth->u.dst.flags |= DST_NOXFRM;
2202	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2203		rth->u.dst.flags |= DST_NOPOLICY;
2204
	/* The cache key (rth->fl) comes from the caller's flow, not from fl. */
2205	rth->fl.fl4_dst	= oldflp->fl4_dst;
2206	rth->fl.fl4_tos	= tos;
2207	rth->fl.fl4_src	= oldflp->fl4_src;
2208	rth->fl.oif	= oldflp->oif;
2209	rth->fl.mark    = oldflp->mark;
2210	rth->rt_dst	= fl->fl4_dst;
2211	rth->rt_src	= fl->fl4_src;
2212	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2213	/* get references to the devices that are to be hold by the routing
2214	   cache entry */
2215	rth->u.dst.dev	= dev_out;
2216	dev_hold(dev_out);
2217	rth->idev	= in_dev_get(dev_out);
2218	rth->rt_gateway = fl->fl4_dst;
2219	rth->rt_spec_dst= fl->fl4_src;
2220
2221	rth->u.dst.output=ip_output;
2222	rth->rt_genid = atomic_read(&rt_genid);
2223
2224	RT_CACHE_STAT_INC(out_slow_tot);
2225
2226	if (flags & RTCF_LOCAL) {
2227		rth->u.dst.input = ip_local_deliver;
2228		rth->rt_spec_dst = fl->fl4_dst;
2229	}
2230	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2231		rth->rt_spec_dst = fl->fl4_src;
		/* Local broadcast/multicast on a non-loopback device goes
		   out through ip_mc_output(). */
2232		if (flags & RTCF_LOCAL &&
2233		    !(dev_out->flags & IFF_LOOPBACK)) {
2234			rth->u.dst.output = ip_mc_output;
2235			RT_CACHE_STAT_INC(out_slow_mc);
2236		}
2237#ifdef CONFIG_IP_MROUTE
2238		if (res->type == RTN_MULTICAST) {
2239			if (IN_DEV_MFORWARD(in_dev) &&
2240			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2241				rth->u.dst.input = ip_mr_input;
2242				rth->u.dst.output = ip_mc_output;
2243			}
2244		}
2245#endif
2246	}
2247
2248	rt_set_nexthop(rth, res, 0);
2249
2250	rth->rt_flags = flags;
2251
2252	*result = rth;
2253 cleanup:
2254	/* release work reference to inet device */
2255	in_dev_put(in_dev);
2256
2257	return err;
2258}
2259
2260static int ip_mkroute_output(struct rtable **rp,
2261			     struct fib_result *res,
2262			     const struct flowi *fl,
2263			     const struct flowi *oldflp,
2264			     struct net_device *dev_out,
2265			     unsigned flags)
2266{
2267	struct rtable *rth = NULL;
2268	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2269	unsigned hash;
2270	if (err == 0) {
2271		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2272		err = rt_intern_hash(hash, rth, rp);
2273	}
2274
2275	return err;
2276}
2277
2278/*
2279 * Major route resolver routine.
2280 */
2281
/*
 * Slow path of the output route lookup.
 *
 * Refines a local copy (fl) of the caller's flow key, picks the output
 * device either from one of the special cases below or from fib_lookup(),
 * and then builds the cache entry through ip_mkroute_output().
 *
 * @net:    namespace used for device lookups, fib_lookup() and loopback_dev
 * @rp:     passed through to ip_mkroute_output() to receive the route
 * @oldflp: flow key supplied by the caller; it is only read here
 *
 * Returns the result of ip_mkroute_output(), or, when resolution fails
 * before a route can be built:
 *   -EINVAL       source is multicast/lbcast/zeronet, or ip_dev_find()
 *                 finds no device for it
 *   -ENODEV       oif does not name a device, or the device has no
 *                 in_device
 *   -ENETUNREACH  fib_lookup() failed and no oif was given
 */
2282static int ip_route_output_slow(struct net *net, struct rtable **rp,
2283				const struct flowi *oldflp)
2284{
2285	u32 tos	= RT_FL_TOS(oldflp);
2286	struct flowi fl = { .nl_u = { .ip4_u =
2287				      { .daddr = oldflp->fl4_dst,
2288					.saddr = oldflp->fl4_src,
2289					.tos = tos & IPTOS_RT_MASK,
2290					.scope = ((tos & RTO_ONLINK) ?
2291						  RT_SCOPE_LINK :
2292						  RT_SCOPE_UNIVERSE),
2293				      } },
2294			    .mark = oldflp->mark,
2295			    .iif = net->loopback_dev->ifindex,
2296			    .oif = oldflp->oif };
2297	struct fib_result res;
2298	unsigned flags = 0;
2299	struct net_device *dev_out = NULL;
	/* Set once fib_lookup() succeeded, so res is put at make_route. */
2300	int free_res = 0;
2301	int err;
2302
2303
2304	res.fi		= NULL;
2305#ifdef CONFIG_IP_MULTIPLE_TABLES
2306	res.r		= NULL;
2307#endif
2308
	/* Step 1: an explicit source address must be valid and local. */
2309	if (oldflp->fl4_src) {
2310		err = -EINVAL;
2311		if (ipv4_is_multicast(oldflp->fl4_src) ||
2312		    ipv4_is_lbcast(oldflp->fl4_src) ||
2313		    ipv4_is_zeronet(oldflp->fl4_src))
2314			goto out;
2315
2316		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2317		dev_out = ip_dev_find(net, oldflp->fl4_src);
2318		if (dev_out == NULL)
2319			goto out;
2320
2321		/* I removed check for oif == dev_out->oif here.
2322		   It was wrong for two reasons:
2323		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2324		      is assigned to multiple interfaces.
2325		   2. Moreover, we are allowed to send packets with saddr
2326		      of another iface. --ANK
2327		 */
2328
2329		if (oldflp->oif == 0
2330		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2331			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2332			/* Special hack: user can direct multicasts
2333			   and limited broadcast via necessary interface
2334			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2335			   This hack is not just for fun, it allows
2336			   vic,vat and friends to work.
2337			   They bind socket to loopback, set ttl to zero
2338			   and expect that it will work.
2339			   From the viewpoint of routing cache they are broken,
2340			   because we are not allowed to build multicast path
2341			   with loopback source addr (look, routing cache
2342			   cannot know, that ttl is zero, so that packet
2343			   will not leave this host and route is valid).
2344			   Luckily, this hack is good workaround.
2345			 */
2346
2347			fl.oif = dev_out->ifindex;
2348			goto make_route;
2349		}
		/* The device was only needed to validate the source. */
2350		if (dev_out)
2351			dev_put(dev_out);
2352		dev_out = NULL;
2353	}
2354
2355
	/* Step 2: an explicit output interface pins dev_out. */
2356	if (oldflp->oif) {
2357		dev_out = dev_get_by_index(net, oldflp->oif);
2358		err = -ENODEV;
2359		if (dev_out == NULL)
2360			goto out;
2361
2362		/* RACE: Check return value of inet_select_addr instead. */
2363		if (__in_dev_get_rtnl(dev_out) == NULL) {
2364			dev_put(dev_out);
2365			goto out;	/* Wrong error code */
2366		}
2367
		/* Link-local multicast and limited broadcast skip the FIB. */
2368		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2369		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2370			if (!fl.fl4_src)
2371				fl.fl4_src = inet_select_addr(dev_out, 0,
2372							      RT_SCOPE_LINK);
2373			goto make_route;
2374		}
2375		if (!fl.fl4_src) {
2376			if (ipv4_is_multicast(oldflp->fl4_dst))
2377				fl.fl4_src = inet_select_addr(dev_out, 0,
2378							      fl.fl4_scope);
2379			else if (!oldflp->fl4_dst)
2380				fl.fl4_src = inet_select_addr(dev_out, 0,
2381							      RT_SCOPE_HOST);
2382		}
2383	}
2384
	/* Step 3: no destination means a local route over loopback_dev,
	   addressed to the source or, failing that, INADDR_LOOPBACK. */
2385	if (!fl.fl4_dst) {
2386		fl.fl4_dst = fl.fl4_src;
2387		if (!fl.fl4_dst)
2388			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2389		if (dev_out)
2390			dev_put(dev_out);
2391		dev_out = net->loopback_dev;
2392		dev_hold(dev_out);
2393		fl.oif = net->loopback_dev->ifindex;
2394		res.type = RTN_LOCAL;
2395		flags |= RTCF_LOCAL;
2396		goto make_route;
2397	}
2398
	/* Step 4: consult the FIB; a non-zero return is treated as failure. */
2399	if (fib_lookup(net, &fl, &res)) {
2400		res.fi = NULL;
2401		if (oldflp->oif) {
2402			/* Apparently, routing tables are wrong. Assume,
2403			   that the destination is on link.
2404
2405			   WHY? DW.
2406			   Because we are allowed to send to iface
2407			   even if it has NO routes and NO assigned
2408			   addresses. When oif is specified, routing
2409			   tables are looked up with only one purpose:
2410			   to catch if destination is gatewayed, rather than
2411			   direct. Moreover, if MSG_DONTROUTE is set,
2412			   we send packet, ignoring both routing tables
2413			   and ifaddr state. --ANK
2414
2415
2416			   We could make it even if oif is unknown,
2417			   likely IPv6, but we do not.
2418			 */
2419
2420			if (fl.fl4_src == 0)
2421				fl.fl4_src = inet_select_addr(dev_out, 0,
2422							      RT_SCOPE_LINK);
2423			res.type = RTN_UNICAST;
2424			goto make_route;
2425		}
2426		if (dev_out)
2427			dev_put(dev_out);
2428		err = -ENETUNREACH;
2429		goto out;
2430	}
2431	free_res = 1;
2432
	/* A local destination is routed over loopback_dev without FIB info. */
2433	if (res.type == RTN_LOCAL) {
2434		if (!fl.fl4_src)
2435			fl.fl4_src = fl.fl4_dst;
2436		if (dev_out)
2437			dev_put(dev_out);
2438		dev_out = net->loopback_dev;
2439		dev_hold(dev_out);
2440		fl.oif = dev_out->ifindex;
2441		if (res.fi)
2442			fib_info_put(res.fi);
2443		res.fi = NULL;
2444		flags |= RTCF_LOCAL;
2445		goto make_route;
2446	}
2447
	/* The "else" inside the #ifdef binds to the if that follows it. */
2448#ifdef CONFIG_IP_ROUTE_MULTIPATH
2449	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2450		fib_select_multipath(&fl, &res);
2451	else
2452#endif
2453	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2454		fib_select_default(net, &fl, &res);
2455
2456	if (!fl.fl4_src)
2457		fl.fl4_src = FIB_RES_PREFSRC(res);
2458
	/* Switch dev_out to the device named by the FIB result. */
2459	if (dev_out)
2460		dev_put(dev_out);
2461	dev_out = FIB_RES_DEV(res);
2462	dev_hold(dev_out);
2463	fl.oif = dev_out->ifindex;
2464
2465
2466make_route:
2467	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2468
2469
	/* Drop the references taken above, whatever the outcome. */
2470	if (free_res)
2471		fib_res_put(&res);
2472	if (dev_out)
2473		dev_put(dev_out);
2474out:	return err;
2475}
2476
2477int __ip_route_output_key(struct net *net, struct rtable **rp,
2478			  const struct flowi *flp)
2479{
2480	unsigned hash;
2481	struct rtable *rth;
2482
2483	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2484
2485	rcu_read_lock_bh();
2486	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2487		rth = rcu_dereference(rth->u.dst.rt_next)) {
2488		if (rth->fl.fl4_dst == flp->fl4_dst &&
2489		    rth->fl.fl4_src == flp->fl4_src &&
2490		    rth->fl.iif == 0 &&
2491		    rth->fl.oif == flp->oif &&
2492		    rth->fl.mark == flp->mark &&
2493		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2494			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2495		    net_eq(dev_net(rth->u.dst.dev), net) &&
2496		    rth->rt_genid == atomic_read(&rt_genid)) {
2497			dst_use(&rth->u.dst, jiffies);
2498			RT_CACHE_STAT_INC(out_hit);
2499			rcu_read_unlock_bh();
2500			*rp = rth;
2501			return 0;
2502		}
2503		RT_CACHE_STAT_INC(out_hlist_search);
2504	}
2505	rcu_read_unlock_bh();
2506
2507	return ip_route_output_slow(net, rp, flp);
2508}
2509
2510EXPORT_SYMBOL_GPL(__ip_route_output_key);
2511
2512static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2513{
2514}
2515
2516static struct dst_ops ipv4_dst_blackhole_ops = {
2517	.family			=	AF_INET,
2518	.protocol		=	__constant_htons(ETH_P_IP),
2519	.destroy		=	ipv4_dst_destroy,
2520	.check			=	ipv4_dst_check,
2521	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2522	.entry_size		=	sizeof(struct rtable),
2523	.entries		=	ATOMIC_INIT(0),
2524};
2525
2526
2527static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2528{
2529	struct rtable *ort = *rp;
2530	struct rtable *rt = (struct rtable *)
2531		dst_alloc(&ipv4_dst_blackhole_ops);
2532
2533	if (rt) {
2534		struct dst_entry *new = &rt->u.dst;
2535
2536		atomic_set(&new->__refcnt, 1);
2537		new->__use = 1;
2538		new->input = dst_discard;
2539		new->output = dst_discard;
2540		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2541
2542		new->dev = ort->u.dst.dev;
2543		if (new->dev)
2544			dev_hold(new->dev);
2545
2546		rt->fl = ort->fl;
2547
2548		rt->idev = ort->idev;
2549		if (rt->idev)
2550			in_dev_hold(rt->idev);
2551		rt->rt_genid = atomic_read(&rt_genid);
2552		rt->rt_flags = ort->rt_flags;
2553		rt->rt_type = ort->rt_type;
2554		rt->rt_dst = ort->rt_dst;
2555		rt->rt_src = ort->rt_src;
2556		rt->rt_iif = ort->rt_iif;
2557		rt->rt_gateway = ort->rt_gateway;
2558		rt->rt_spec_dst = ort->rt_spec_dst;
2559		rt->peer = ort->peer;
2560		if (rt->peer)
2561			atomic_inc(&rt->peer->refcnt);
2562
2563		dst_free(new);
2564	}
2565
2566	dst_release(&(*rp)->u.dst);
2567	*rp = rt;
2568	return (rt ? 0 : -ENOMEM);
2569}
2570
2571int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2572			 struct sock *sk, int flags)
2573{
2574	int err;
2575
2576	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2577		return err;
2578
2579	if (flp->proto) {
2580		if (!flp->fl4_src)
2581			flp->fl4_src = (*rp)->rt_src;
2582		if (!flp->fl4_dst)
2583			flp->fl4_dst = (*rp)->rt_dst;
2584		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2585				    flags ? XFRM_LOOKUP_WAIT : 0);
2586		if (err == -EREMOTE)
2587			err = ipv4_dst_blackhole(rp, flp);
2588
2589		return err;
2590	}
2591
2592	return 0;
2593}
2594
2595EXPORT_SYMBOL_GPL(ip_route_output_flow);
2596
2597int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2598{
2599	return ip_route_output_flow(net, rp, flp, NULL, 0);
2600}
2601
/*
 * Append an rtnetlink route message describing skb->rtable to skb.
 *
 * @skb:    message buffer; skb->rtable is the cache entry to describe
 * @pid:    netlink pid for the message header
 * @seq:    sequence number for the message header
 * @event:  message type for the header (callers here pass RTM_NEWROUTE)
 * @nowait: forwarded to ipmr_get_route() for multicast input routes
 * @flags:  netlink header flags
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 with !nowait, and -EMSGSIZE when nlmsg_put() fails or any
 * path reaches nla_put_failure (the partial message is then cancelled).
 * NOTE(review): the NLA_PUT* macros presumably jump to nla_put_failure
 * when the attribute does not fit -- confirm against their definition.
 */
2602static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2603			int nowait, unsigned int flags)
2604{
2605	struct rtable *rt = skb->rtable;
2606	struct rtmsg *r;
2607	struct nlmsghdr *nlh;
2608	long expires;
2609	u32 id = 0, ts = 0, tsage = 0, error;
2610
2611	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2612	if (nlh == NULL)
2613		return -EMSGSIZE;
2614
	/* Fixed rtmsg header: cache entries are reported as /32, main table. */
2615	r = nlmsg_data(nlh);
2616	r->rtm_family	 = AF_INET;
2617	r->rtm_dst_len	= 32;
2618	r->rtm_src_len	= 0;
2619	r->rtm_tos	= rt->fl.fl4_tos;
2620	r->rtm_table	= RT_TABLE_MAIN;
2621	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2622	r->rtm_type	= rt->rt_type;
2623	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2624	r->rtm_protocol = RTPROT_UNSPEC;
	/* Low 16 bits of rt_flags are masked out of rtm_flags. */
2625	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2626	if (rt->rt_flags & RTCF_NOTIFY)
2627		r->rtm_flags |= RTM_F_NOTIFY;
2628
2629	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2630
2631	if (rt->fl.fl4_src) {
2632		r->rtm_src_len = 32;
2633		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2634	}
2635	if (rt->u.dst.dev)
2636		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2637#ifdef CONFIG_NET_CLS_ROUTE
2638	if (rt->u.dst.tclassid)
2639		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2640#endif
	/* Input routes (fl.iif set) report rt_spec_dst; output routes report
	   rt_src, and only when it differs from the key's source. */
2641	if (rt->fl.iif)
2642		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2643	else if (rt->rt_src != rt->fl.fl4_src)
2644		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2645
2646	if (rt->rt_dst != rt->rt_gateway)
2647		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2648
2649	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2650		goto nla_put_failure;
2651
	/* Collect the values later passed to rtnl_put_cacheinfo(). */
2652	error = rt->u.dst.error;
2653	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2654	if (rt->peer) {
2655		id = rt->peer->ip_id_count;
2656		if (rt->peer->tcp_ts_stamp) {
2657			ts = rt->peer->tcp_ts;
2658			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2659		}
2660	}
2661
2662	if (rt->fl.iif) {
2663#ifdef CONFIG_IP_MROUTE
2664		__be32 dst = rt->rt_dst;
2665
		/* Forwarded (non link-local) multicast is described by
		   ipmr_get_route() instead of a plain RTA_IIF attribute. */
2666		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2667		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2668			int err = ipmr_get_route(skb, r, nowait);
2669			if (err <= 0) {
2670				if (!nowait) {
2671					if (err == 0)
2672						return 0;
2673					goto nla_put_failure;
2674				} else {
2675					if (err == -EMSGSIZE)
2676						goto nla_put_failure;
					/* Other results are reported via
					   the cacheinfo error field. */
2677					error = err;
2678				}
2679			}
2680		} else
2681#endif
2682			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2683	}
2684
2685	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2686			       expires, error) < 0)
2687		goto nla_put_failure;
2688
2689	return nlmsg_end(skb, nlh);
2690
2691nla_put_failure:
2692	nlmsg_cancel(skb, nlh);
2693	return -EMSGSIZE;
2694}
2695
/*
 * RTM_GETROUTE handler (registered in ip_rt_init()).
 *
 * Parses RTA_SRC, RTA_DST, RTA_IIF and RTA_OIF from the request, resolves
 * the route and unicasts an RTM_NEWROUTE reply built by rt_fill_info() to
 * the requesting pid.  With RTA_IIF set the lookup is done as an input
 * route on a dummy skb through ip_route_input(); otherwise it is an
 * output lookup through ip_route_output_key().
 *
 * Returns the result of rtnl_unicast() on success.  On failure returns a
 * negative errno; the reply skb, if it was allocated, is freed first.
 * A return of 0 from rt_fill_info() also frees the skb and yields 0.
 */
2696static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2697{
2698	struct net *net = sock_net(in_skb->sk);
2699	struct rtmsg *rtm;
2700	struct nlattr *tb[RTA_MAX+1];
2701	struct rtable *rt = NULL;
2702	__be32 dst = 0;
2703	__be32 src = 0;
2704	u32 iif;
2705	int err;
2706	struct sk_buff *skb;
2707
2708	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2709	if (err < 0)
2710		goto errout;
2711
2712	rtm = nlmsg_data(nlh);
2713
	/* This skb serves both as dummy packet for the lookup and as reply. */
2714	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2715	if (skb == NULL) {
2716		err = -ENOBUFS;
2717		goto errout;
2718	}
2719
2720	/* Reserve room for dummy headers, this skb can pass
2721	   through good chunk of routing engine.
2722	 */
2723	skb_reset_mac_header(skb);
2724	skb_reset_network_header(skb);
2725
2726	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2727	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2728	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2729
	/* Missing attributes default to 0. */
2730	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2731	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2732	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2733
2734	if (iif) {
2735		struct net_device *dev;
2736
2737		dev = __dev_get_by_index(net, iif);
2738		if (dev == NULL) {
2739			err = -ENODEV;
2740			goto errout_free;
2741		}
2742
2743		skb->protocol	= htons(ETH_P_IP);
2744		skb->dev	= dev;
		/* ip_route_input() is called with bottom halves disabled. */
2745		local_bh_disable();
2746		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2747		local_bh_enable();
2748
		/* A successful lookup may still carry an error in the dst. */
2749		rt = skb->rtable;
2750		if (err == 0 && rt->u.dst.error)
2751			err = -rt->u.dst.error;
2752	} else {
2753		struct flowi fl = {
2754			.nl_u = {
2755				.ip4_u = {
2756					.daddr = dst,
2757					.saddr = src,
2758					.tos = rtm->rtm_tos,
2759				},
2760			},
2761			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2762		};
2763		err = ip_route_output_key(net, &rt, &fl);
2764	}
2765
2766	if (err)
2767		goto errout_free;
2768
	/* Attach the route to the skb; rt_fill_info() reads skb->rtable. */
2769	skb->rtable = rt;
2770	if (rtm->rtm_flags & RTM_F_NOTIFY)
2771		rt->rt_flags |= RTCF_NOTIFY;
2772
2773	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2774			   RTM_NEWROUTE, 0, 0);
2775	if (err <= 0)
2776		goto errout_free;
2777
2778	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2779errout:
2780	return err;
2781
2782errout_free:
2783	kfree_skb(skb);
2784	goto errout;
2785}
2786
/*
 * Dump the route cache into skb for a netlink dump request.
 *
 * Walks the hash buckets starting at cb->args[0], skipping the first
 * cb->args[1] entries of that first bucket, and emits one RTM_NEWROUTE
 * message (NLM_F_MULTI) for every entry whose device belongs to the
 * namespace of skb->sk and whose rt_genid is current.  The walk stops
 * early as soon as rt_fill_info() returns <= 0.
 *
 * On exit the bucket and in-bucket index reached are stored back into
 * cb->args[0] and cb->args[1].  Returns skb->len.
 */
2787int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2788{
2789	struct rtable *rt;
2790	int h, s_h;
2791	int idx, s_idx;
2792	struct net *net;
2793
2794	net = sock_net(skb->sk);
2795
2796	s_h = cb->args[0];
2797	if (s_h < 0)
2798		s_h = 0;
2799	s_idx = idx = cb->args[1];
2800	for (h = s_h; h <= rt_hash_mask; h++) {
2801		rcu_read_lock_bh();
2802		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2803		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			/* Skip foreign namespaces and already dumped entries. */
2804			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2805				continue;
2806			if (rt->rt_genid != atomic_read(&rt_genid))
2807				continue;
			/* Attach the entry to skb for the duration of
			   rt_fill_info() and drop it again afterwards. */
2808			skb->dst = dst_clone(&rt->u.dst);
2809			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2810					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2811					 1, NLM_F_MULTI) <= 0) {
2812				dst_release(xchg(&skb->dst, NULL));
2813				rcu_read_unlock_bh();
2814				goto done;
2815			}
2816			dst_release(xchg(&skb->dst, NULL));
2817		}
2818		rcu_read_unlock_bh();
		/* The resume offset applies to the first bucket only. */
2819		s_idx = 0;
2820	}
2821
2822done:
2823	cb->args[0] = h;
2824	cb->args[1] = idx;
2825	return skb->len;
2826}
2827
/*
 * Multicast event hook: flush the route cache with a delay argument
 * of 0.  @in_dev is not consulted.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2832
2833#ifdef CONFIG_SYSCTL
2834static int flush_delay;
2835
2836static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2837					struct file *filp, void __user *buffer,
2838					size_t *lenp, loff_t *ppos)
2839{
2840	if (write) {
2841		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2842		rt_cache_flush(flush_delay);
2843		return 0;
2844	}
2845
2846	return -EINVAL;
2847}
2848
2849static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2850						int __user *name,
2851						int nlen,
2852						void __user *oldval,
2853						size_t __user *oldlenp,
2854						void __user *newval,
2855						size_t newlen)
2856{
2857	int delay;
2858	if (newlen != sizeof(int))
2859		return -EINVAL;
2860	if (get_user(delay, (int __user *)newval))
2861		return -EFAULT;
2862	rt_cache_flush(delay);
2863	return 0;
2864}
2865
/*
 * sysctl table for the IPv4 route cache tunables.
 *
 * Each entry binds a procname to one of the ip_rt_* variables (or to
 * ipv4_dst_ops.gc_thresh).  Entries using the *_jiffies handlers are
 * converted by those handlers; the others are handled as plain ints by
 * proc_dointvec.  "flush" is write-only (mode 0200) and triggers a cache
 * flush through its own handlers.  The table ends with a zero ctl_name.
 */
2866ctl_table ipv4_route_table[] = {
2867	{
2868		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
2869		.procname	= "flush",
2870		.data		= &flush_delay,
2871		.maxlen		= sizeof(int),
2872		.mode		= 0200,
2873		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2874		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
2875	},
2876	{
2877		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
2878		.procname	= "gc_thresh",
2879		.data		= &ipv4_dst_ops.gc_thresh,
2880		.maxlen		= sizeof(int),
2881		.mode		= 0644,
2882		.proc_handler	= &proc_dointvec,
2883	},
2884	{
2885		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
2886		.procname	= "max_size",
2887		.data		= &ip_rt_max_size,
2888		.maxlen		= sizeof(int),
2889		.mode		= 0644,
2890		.proc_handler	= &proc_dointvec,
2891	},
2892	{
2893		/*  Deprecated. Use gc_min_interval_ms */
2894
2895		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2896		.procname	= "gc_min_interval",
2897		.data		= &ip_rt_gc_min_interval,
2898		.maxlen		= sizeof(int),
2899		.mode		= 0644,
2900		.proc_handler	= &proc_dointvec_jiffies,
2901		.strategy	= &sysctl_jiffies,
2902	},
2903	{
		/* Same variable as gc_min_interval, via the ms handlers. */
2904		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2905		.procname	= "gc_min_interval_ms",
2906		.data		= &ip_rt_gc_min_interval,
2907		.maxlen		= sizeof(int),
2908		.mode		= 0644,
2909		.proc_handler	= &proc_dointvec_ms_jiffies,
2910		.strategy	= &sysctl_ms_jiffies,
2911	},
2912	{
2913		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
2914		.procname	= "gc_timeout",
2915		.data		= &ip_rt_gc_timeout,
2916		.maxlen		= sizeof(int),
2917		.mode		= 0644,
2918		.proc_handler	= &proc_dointvec_jiffies,
2919		.strategy	= &sysctl_jiffies,
2920	},
2921	{
2922		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
2923		.procname	= "gc_interval",
2924		.data		= &ip_rt_gc_interval,
2925		.maxlen		= sizeof(int),
2926		.mode		= 0644,
2927		.proc_handler	= &proc_dointvec_jiffies,
2928		.strategy	= &sysctl_jiffies,
2929	},
2930	{
2931		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
2932		.procname	= "redirect_load",
2933		.data		= &ip_rt_redirect_load,
2934		.maxlen		= sizeof(int),
2935		.mode		= 0644,
2936		.proc_handler	= &proc_dointvec,
2937	},
2938	{
2939		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
2940		.procname	= "redirect_number",
2941		.data		= &ip_rt_redirect_number,
2942		.maxlen		= sizeof(int),
2943		.mode		= 0644,
2944		.proc_handler	= &proc_dointvec,
2945	},
2946	{
2947		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
2948		.procname	= "redirect_silence",
2949		.data		= &ip_rt_redirect_silence,
2950		.maxlen		= sizeof(int),
2951		.mode		= 0644,
2952		.proc_handler	= &proc_dointvec,
2953	},
2954	{
2955		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
2956		.procname	= "error_cost",
2957		.data		= &ip_rt_error_cost,
2958		.maxlen		= sizeof(int),
2959		.mode		= 0644,
2960		.proc_handler	= &proc_dointvec,
2961	},
2962	{
2963		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
2964		.procname	= "error_burst",
2965		.data		= &ip_rt_error_burst,
2966		.maxlen		= sizeof(int),
2967		.mode		= 0644,
2968		.proc_handler	= &proc_dointvec,
2969	},
2970	{
2971		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
2972		.procname	= "gc_elasticity",
2973		.data		= &ip_rt_gc_elasticity,
2974		.maxlen		= sizeof(int),
2975		.mode		= 0644,
2976		.proc_handler	= &proc_dointvec,
2977	},
2978	{
2979		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
2980		.procname	= "mtu_expires",
2981		.data		= &ip_rt_mtu_expires,
2982		.maxlen		= sizeof(int),
2983		.mode		= 0644,
2984		.proc_handler	= &proc_dointvec_jiffies,
2985		.strategy	= &sysctl_jiffies,
2986	},
2987	{
2988		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
2989		.procname	= "min_pmtu",
2990		.data		= &ip_rt_min_pmtu,
2991		.maxlen		= sizeof(int),
2992		.mode		= 0644,
2993		.proc_handler	= &proc_dointvec,
2994	},
2995	{
2996		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
2997		.procname	= "min_adv_mss",
2998		.data		= &ip_rt_min_advmss,
2999		.maxlen		= sizeof(int),
3000		.mode		= 0644,
3001		.proc_handler	= &proc_dointvec,
3002	},
3003	{
3004		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
3005		.procname	= "secret_interval",
3006		.data		= &ip_rt_secret_interval,
3007		.maxlen		= sizeof(int),
3008		.mode		= 0644,
3009		.proc_handler	= &proc_dointvec_jiffies,
3010		.strategy	= &sysctl_jiffies,
3011	},
	/* Terminator. */
3012	{ .ctl_name = 0 }
3013};
3014#endif
3015
3016#ifdef CONFIG_NET_CLS_ROUTE
3017struct ip_rt_acct *ip_rt_acct __read_mostly;
3018#endif /* CONFIG_NET_CLS_ROUTE */
3019
3020static __initdata unsigned long rhash_entries;
3021static int __init set_rhash_entries(char *str)
3022{
3023	if (!str)
3024		return 0;
3025	rhash_entries = simple_strtoul(str, &str, 0);
3026	return 1;
3027}
3028__setup("rhash_entries=", set_rhash_entries);
3029
/*
 * Boot-time initialisation of the IPv4 routing cache.
 *
 * In order: seeds rt_genid, allocates the per-cpu accounting area (if
 * CONFIG_NET_CLS_ROUTE), creates the dst slab cache shared by the normal
 * and blackhole dst_ops, allocates and zeroes the route hash table,
 * derives gc_thresh and ip_rt_max_size from the table size, initialises
 * devinet and the FIB, starts the periodic expiry work and the
 * secret-rebuild timer, creates the proc files, initialises xfrm (if
 * CONFIG_XFRM) and registers the RTM_GETROUTE handler.
 *
 * Always returns 0: rc is never changed, and a failure to create the
 * proc files is only logged.
 */
3030int __init ip_rt_init(void)
3031{
3032	int rc = 0;
3033
	/* Initial rt_genid is mixed from num_physpages and jiffies. */
3034	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3035			     (jiffies ^ (jiffies >> 7))));
3036
3037#ifdef CONFIG_NET_CLS_ROUTE
3038	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3039	if (!ip_rt_acct)
3040		panic("IP: failed to allocate ip_rt_acct\n");
3041#endif
3042
3043	ipv4_dst_ops.kmem_cachep =
3044		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3045				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3046
	/* Blackhole entries come from the same slab cache. */
3047	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3048
	/* rhash_entries comes from the "rhash_entries=" boot parameter. */
3049	rt_hash_table = (struct rt_hash_bucket *)
3050		alloc_large_system_hash("IP route cache",
3051					sizeof(struct rt_hash_bucket),
3052					rhash_entries,
3053					(num_physpages >= 128 * 1024) ?
3054					15 : 17,
3055					0,
3056					&rt_hash_log,
3057					&rt_hash_mask,
3058					0);
3059	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3060	rt_hash_lock_init();
3061
	/* GC threshold is one entry per bucket, hard limit 16 per bucket. */
3062	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3063	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3064
3065	devinet_init();
3066	ip_fib_init();
3067
3068	rt_secret_timer.function = rt_secret_rebuild;
3069	rt_secret_timer.data = 0;
3070	init_timer_deferrable(&rt_secret_timer);
3071
3072	/* All the timers, started at system startup tend
3073	   to synchronize. Perturb it a bit.
3074	 */
3075	schedule_delayed_work(&expires_work,
3076		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3077
3078	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3079		ip_rt_secret_interval;
3080	add_timer(&rt_secret_timer);
3081
3082	if (ip_rt_proc_init())
3083		printk(KERN_ERR "Unable to create route proc files\n");
3084#ifdef CONFIG_XFRM
3085	xfrm_init();
3086	xfrm4_init();
3087#endif
3088	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3089
3090	return rc;
3091}
3092
3093EXPORT_SYMBOL(__ip_select_ident);
3094EXPORT_SYMBOL(ip_route_input);
3095EXPORT_SYMBOL(ip_route_output_key);
Configure Feed

Configure Feed