File: /usr/src/linux/net/ipv4/route.c

1     /*
2      * INET		An implementation of the TCP/IP protocol suite for the LINUX
3      *		operating system.  INET is implemented using the  BSD Socket
4      *		interface as the means of communication with the user level.
5      *
6      *		ROUTE - implementation of the IP router.
7      *
8      * Version:	$Id: route.c,v 1.99 2001/09/18 22:29:09 davem Exp $
9      *
10      * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
11      *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12      *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13      *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14      *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15      *
16      * Fixes:
17      *		Alan Cox	:	Verify area fixes.
18      *		Alan Cox	:	cli() protects routing changes
19      *		Rui Oliveira	:	ICMP routing table updates
20      *		(rco@di.uminho.pt)	Routing table insertion and update
21      *		Linus Torvalds	:	Rewrote bits to be sensible
22      *		Alan Cox	:	Added BSD route gw semantics
23      *		Alan Cox	:	Super /proc >4K 
24      *		Alan Cox	:	MTU in route table
25      *		Alan Cox	: 	MSS actually. Also added the window
26      *					clamper.
27      *		Sam Lantinga	:	Fixed route matching in rt_del()
28      *		Alan Cox	:	Routing cache support.
29      *		Alan Cox	:	Removed compatibility cruft.
30      *		Alan Cox	:	RTF_REJECT support.
31      *		Alan Cox	:	TCP irtt support.
32      *		Jonathan Naylor	:	Added Metric support.
33      *	Miquel van Smoorenburg	:	BSD API fixes.
34      *	Miquel van Smoorenburg	:	Metrics.
35      *		Alan Cox	:	Use __u32 properly
36      *		Alan Cox	:	Aligned routing errors more closely with BSD
37      *					our system is still very different.
38      *		Alan Cox	:	Faster /proc handling
39      *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40      *					routing caches and better behaviour.
41      *		
42      *		Olaf Erb	:	irtt wasn't being copied right.
43      *		Bjorn Ekwall	:	Kerneld route support.
44      *		Alan Cox	:	Multicast fixed (I hope)
45      * 		Pavel Krauz	:	Limited broadcast fixed
46      *		Mike McLagan	:	Routing by source
47      *	Alexey Kuznetsov	:	End of old history. Splitted to fib.c and
48      *					route.c and rewritten from scratch.
49      *		Andi Kleen	:	Load-limit warning messages.
50      *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51      *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52      *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53      *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54      *		Marc Boucher	:	routing by fwmark
55      *	Robert Olsson		:	Added rt_cache statistics
56      *
57      *		This program is free software; you can redistribute it and/or
58      *		modify it under the terms of the GNU General Public License
59      *		as published by the Free Software Foundation; either version
60      *		2 of the License, or (at your option) any later version.
61      */
62     
63     #include <linux/config.h>
64     #include <asm/uaccess.h>
65     #include <asm/system.h>
66     #include <asm/bitops.h>
67     #include <linux/types.h>
68     #include <linux/kernel.h>
69     #include <linux/sched.h>
70     #include <linux/mm.h>
71     #include <linux/string.h>
72     #include <linux/socket.h>
73     #include <linux/sockios.h>
74     #include <linux/errno.h>
75     #include <linux/in.h>
76     #include <linux/inet.h>
77     #include <linux/netdevice.h>
78     #include <linux/proc_fs.h>
79     #include <linux/init.h>
80     #include <linux/skbuff.h>
81     #include <linux/rtnetlink.h>
82     #include <linux/inetdevice.h>
83     #include <linux/igmp.h>
84     #include <linux/pkt_sched.h>
85     #include <linux/mroute.h>
86     #include <linux/netfilter_ipv4.h>
87     #include <linux/random.h>
88     #include <net/protocol.h>
89     #include <net/ip.h>
90     #include <net/route.h>
91     #include <net/inetpeer.h>
92     #include <net/sock.h>
93     #include <net/ip_fib.h>
94     #include <net/arp.h>
95     #include <net/tcp.h>
96     #include <net/icmp.h>
97     #ifdef CONFIG_SYSCTL
98     #include <linux/sysctl.h>
99     #endif
100     
101     #define IP_MAX_MTU	0xFFF0
102     
103     #define RT_GC_TIMEOUT (300*HZ)
104     
105     int ip_rt_min_delay		= 2 * HZ;
106     int ip_rt_max_delay		= 10 * HZ;
107     int ip_rt_max_size;
108     int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
109     int ip_rt_gc_interval		= 60 * HZ;
110     int ip_rt_gc_min_interval	= 5 * HZ;
111     int ip_rt_redirect_number	= 9;
112     int ip_rt_redirect_load		= HZ / 50;
113     int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
114     int ip_rt_error_cost		= HZ;
115     int ip_rt_error_burst		= 5 * HZ;
116     int ip_rt_gc_elasticity		= 8;
117     int ip_rt_mtu_expires		= 10 * 60 * HZ;
118     int ip_rt_min_pmtu		= 512 + 20 + 20;
119     int ip_rt_min_advmss		= 256;
120     
121     static unsigned long rt_deadline;
122     
123     #define RTprint(a...)	printk(KERN_DEBUG a)
124     
125     static struct timer_list rt_flush_timer;
126     static struct timer_list rt_periodic_timer;
127     
128     /*
129      *	Interface to generic destination cache.
130      */
131     
132     static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
133     static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
134     					   struct sk_buff *skb);
135     static void		 ipv4_dst_destroy(struct dst_entry *dst);
136     static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
137     static void		 ipv4_link_failure(struct sk_buff *skb);
138     static int rt_garbage_collect(void);
139     
140     
141     struct dst_ops ipv4_dst_ops = {
142     	family:			AF_INET,
143     	protocol:		__constant_htons(ETH_P_IP),
144     	gc:			rt_garbage_collect,
145     	check:			ipv4_dst_check,
146     	reroute:		ipv4_dst_reroute,
147     	destroy:		ipv4_dst_destroy,
148     	negative_advice:	ipv4_negative_advice,
149     	link_failure:		ipv4_link_failure,
150     	entry_size:		sizeof(struct rtable),
151     };
152     
153     #ifdef CONFIG_INET_ECN
154     #define ECN_OR_COST(class)	TC_PRIO_##class
155     #else
156     #define ECN_OR_COST(class)	TC_PRIO_FILLER
157     #endif
158     
159     __u8 ip_tos2prio[16] = {
160     	TC_PRIO_BESTEFFORT,
161     	ECN_OR_COST(FILLER),
162     	TC_PRIO_BESTEFFORT,
163     	ECN_OR_COST(BESTEFFORT),
164     	TC_PRIO_BULK,
165     	ECN_OR_COST(BULK),
166     	TC_PRIO_BULK,
167     	ECN_OR_COST(BULK),
168     	TC_PRIO_INTERACTIVE,
169     	ECN_OR_COST(INTERACTIVE),
170     	TC_PRIO_INTERACTIVE,
171     	ECN_OR_COST(INTERACTIVE),
172     	TC_PRIO_INTERACTIVE_BULK,
173     	ECN_OR_COST(INTERACTIVE_BULK),
174     	TC_PRIO_INTERACTIVE_BULK,
175     	ECN_OR_COST(INTERACTIVE_BULK)
176     };
177     
178     
179     /*
180      * Route cache.
181      */
182     
/* The locking scheme is rather straightforward:
 *
 * 1) BH-protected rwlocks protect the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
192     
193     struct rt_hash_bucket {
194     	struct rtable	*chain;
195     	rwlock_t	lock;
196     } __attribute__((__aligned__(8)));
197     
198     static struct rt_hash_bucket 	*rt_hash_table;
199     static unsigned			rt_hash_mask;
200     static int			rt_hash_log;
201     
202     struct rt_cache_stat rt_cache_stat[NR_CPUS];
203     
204     static int rt_intern_hash(unsigned hash, struct rtable *rth,
205     				struct rtable **res);
206     
207     static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
208     {
209     	unsigned hash = ((daddr & 0xF0F0F0F0) >> 4) |
210     			((daddr & 0x0F0F0F0F) << 4);
211     	hash ^= saddr ^ tos;
212     	hash ^= (hash >> 16);
213     	return (hash ^ (hash >> 8)) & rt_hash_mask;
214     }
215     
216     static int rt_cache_get_info(char *buffer, char **start, off_t offset,
217     				int length)
218     {
219     	int len = 0;
220     	off_t pos = 128;
221     	char temp[129];
222     	struct rtable *r;
223     	int i;
224     
225     	if (offset < 128) {
226     		sprintf(buffer, "%-127s\n",
227     			"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228     			"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229     			"HHUptod\tSpecDst");
230     		len = 128;
231       	}
232     	
233     	for (i = rt_hash_mask; i >= 0; i--) {
234     		read_lock_bh(&rt_hash_table[i].lock);
235     		for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
236     			/*
237     			 *	Spin through entries until we are ready
238     			 */
239     			pos += 128;
240     
241     			if (pos <= offset) {
242     				len = 0;
243     				continue;
244     			}
245     			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
246     				"%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
247     				r->u.dst.dev ? r->u.dst.dev->name : "*",
248     				(unsigned long)r->rt_dst,
249     				(unsigned long)r->rt_gateway,
250     				r->rt_flags,
251     				atomic_read(&r->u.dst.__refcnt),
252     				r->u.dst.__use,
253     				0,
254     				(unsigned long)r->rt_src,
255     				(int)r->u.dst.advmss + 40,
256     				r->u.dst.window,
257     				(int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
258     				r->key.tos,
259     				r->u.dst.hh ?
260     					atomic_read(&r->u.dst.hh->hh_refcnt) :
261     					-1,
262     				r->u.dst.hh ?
263     			       		(r->u.dst.hh->hh_output ==
264     					 dev_queue_xmit) : 0,
265     				r->rt_spec_dst);
266     			sprintf(buffer + len, "%-127s\n", temp);
267     			len += 128;
268     			if (pos >= offset+length) {
269     				read_unlock_bh(&rt_hash_table[i].lock);
270     				goto done;
271     			}
272     		}
273     		read_unlock_bh(&rt_hash_table[i].lock);
274             }
275     
276     done:
277       	*start = buffer + len - (pos - offset);
278       	len = pos - offset;
279       	if (len > length)
280       		len = length;
281       	return len;
282     }
283     
284     static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
285     {
286     	unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
287     	int i, lcpu;
288     	int len = 0;
289     
290             for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
291                     i = cpu_logical_map(lcpu);
292     
293     		len += sprintf(buffer+len, "%08x  %08x %08x %08x %08x %08x %08x %08x  %08x %08x %08x\n",
294     			       dst_entries,		       
295     			       rt_cache_stat[i].in_hit,
296     			       rt_cache_stat[i].in_slow_tot,
297     			       rt_cache_stat[i].in_slow_mc,
298     			       rt_cache_stat[i].in_no_route,
299     			       rt_cache_stat[i].in_brd,
300     			       rt_cache_stat[i].in_martian_dst,
301     			       rt_cache_stat[i].in_martian_src,
302     
303     			       rt_cache_stat[i].out_hit,
304     			       rt_cache_stat[i].out_slow_tot,
305     			       rt_cache_stat[i].out_slow_mc
306     			);
307     	}
308     	len -= offset;
309     
310     	if (len > length)
311     		len = length;
312     	if (len < 0)
313     		len = 0;
314     
315     	*start = buffer + offset;
316       	return len;
317     }
318       
319     static __inline__ void rt_free(struct rtable *rt)
320     {
321     	dst_free(&rt->u.dst);
322     }
323     
324     static __inline__ void rt_drop(struct rtable *rt)
325     {
326     	ip_rt_put(rt);
327     	dst_free(&rt->u.dst);
328     }
329     
330     static __inline__ int rt_fast_clean(struct rtable *rth)
331     {
332     	/* Kill broadcast/multicast entries very aggresively, if they
333     	   collide in hash table with more useful entries */
334     	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
335     		rth->key.iif && rth->u.rt_next;
336     }
337     
338     static __inline__ int rt_valuable(struct rtable *rth)
339     {
340     	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
341     		rth->u.dst.expires;
342     }
343     
344     static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
345     {
346     	int age;
347     	int ret = 0;
348     
349     	if (atomic_read(&rth->u.dst.__refcnt))
350     		goto out;
351     
352     	ret = 1;
353     	if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
354     		goto out;
355     
356     	age = jiffies - rth->u.dst.lastuse;
357     	ret = 0;
358     	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
359     	    (age <= tmo2 && rt_valuable(rth)))
360     		goto out;
361     	ret = 1;
362     out:	return ret;
363     }
364     
365     /* This runs via a timer and thus is always in BH context. */
366     static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
367     {
368     	static int rover;
369     	int i = rover, t;
370     	struct rtable *rth, **rthp;
371     	unsigned long now = jiffies;
372     
373     	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
374     	     t -= ip_rt_gc_timeout) {
375     		unsigned tmo = ip_rt_gc_timeout;
376     
377     		i = (i + 1) & rt_hash_mask;
378     		rthp = &rt_hash_table[i].chain;
379     
380     		write_lock(&rt_hash_table[i].lock);
381     		while ((rth = *rthp) != NULL) {
382     			if (rth->u.dst.expires) {
383     				/* Entry is expired even if it is in use */
384     				if ((long)(now - rth->u.dst.expires) <= 0) {
385     					tmo >>= 1;
386     					rthp = &rth->u.rt_next;
387     					continue;
388     				}
389     			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
390     				tmo >>= 1;
391     				rthp = &rth->u.rt_next;
392     				continue;
393     			}
394     
395     			/* Cleanup aged off entries. */
396     			*rthp = rth->u.rt_next;
397     			rt_free(rth);
398     		}
399     		write_unlock(&rt_hash_table[i].lock);
400     
401     		/* Fallback loop breaker. */
402     		if ((jiffies - now) > 0)
403     			break;
404     	}
405     	rover = i;
406     	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
407     }
408     
409     SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
410     
411     /* This can run from both BH and non-BH contexts, the latter
412      * in the case of a forced flush event.
413      */
414     static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
415     {
416     	int i;
417     	struct rtable *rth, *next;
418     
419     	rt_deadline = 0;
420     
421     	for (i = rt_hash_mask; i >= 0; i--) {
422     		write_lock_bh(&rt_hash_table[i].lock);
423     		rth = rt_hash_table[i].chain;
424     		if (rth)
425     			rt_hash_table[i].chain = NULL;
426     		write_unlock_bh(&rt_hash_table[i].lock);
427     
428     		for (; rth; rth = next) {
429     			next = rth->u.rt_next;
430     			rt_free(rth);
431     		}
432     	}
433     }
434     
435     SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);
436       
437     static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
438     
439     void rt_cache_flush(int delay)
440     {
441     	unsigned long now = jiffies;
442     	int user_mode = !in_softirq();
443     
444     	if (delay < 0)
445     		delay = ip_rt_min_delay;
446     
447     	spin_lock_bh(&rt_flush_lock);
448     
449     	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
450     		long tmo = (long)(rt_deadline - now);
451     
452     		/* If flush timer is already running
453     		   and flush request is not immediate (delay > 0):
454     
455     		   if deadline is not achieved, prolongate timer to "delay",
456     		   otherwise fire it at deadline time.
457     		 */
458     
459     		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
460     			tmo = 0;
461     		
462     		if (delay > tmo)
463     			delay = tmo;
464     	}
465     
466     	if (delay <= 0) {
467     		spin_unlock_bh(&rt_flush_lock);
468     		SMP_TIMER_NAME(rt_run_flush)(0);
469     		return;
470     	}
471     
472     	if (rt_deadline == 0)
473     		rt_deadline = now + ip_rt_max_delay;
474     
475     	mod_timer(&rt_flush_timer, now+delay);
476     	spin_unlock_bh(&rt_flush_lock);
477     }
478     
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
491     
492     static int rt_garbage_collect(void)
493     {
494     	static unsigned expire = RT_GC_TIMEOUT;
495     	static unsigned long last_gc;
496     	static int rover;
497     	static int equilibrium;
498     	struct rtable *rth, **rthp;
499     	unsigned long now = jiffies;
500     	int goal;
501     
502     	/*
503     	 * Garbage collection is pretty expensive,
504     	 * do not make it too frequently.
505     	 */
506     	if (now - last_gc < ip_rt_gc_min_interval &&
507     	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
508     		goto out;
509     
510     	/* Calculate number of entries, which we want to expire now. */
511     	goal = atomic_read(&ipv4_dst_ops.entries) -
512     		(ip_rt_gc_elasticity << rt_hash_log);
513     	if (goal <= 0) {
514     		if (equilibrium < ipv4_dst_ops.gc_thresh)
515     			equilibrium = ipv4_dst_ops.gc_thresh;
516     		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
517     		if (goal > 0) {
518     			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
519     			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
520     		}
521     	} else {
522     		/* We are in dangerous area. Try to reduce cache really
523     		 * aggressively.
524     		 */
525     		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
526     		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
527     	}
528     
529     	if (now - last_gc >= ip_rt_gc_min_interval)
530     		last_gc = now;
531     
532     	if (goal <= 0) {
533     		equilibrium += goal;
534     		goto work_done;
535     	}
536     
537     	do {
538     		int i, k;
539     
540     		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
541     			unsigned tmo = expire;
542     
543     			k = (k + 1) & rt_hash_mask;
544     			rthp = &rt_hash_table[k].chain;
545     			write_lock_bh(&rt_hash_table[k].lock);
546     			while ((rth = *rthp) != NULL) {
547     				if (!rt_may_expire(rth, tmo, expire)) {
548     					tmo >>= 1;
549     					rthp = &rth->u.rt_next;
550     					continue;
551     				}
552     				*rthp = rth->u.rt_next;
553     				rt_free(rth);
554     				goal--;
555     			}
556     			write_unlock_bh(&rt_hash_table[k].lock);
557     			if (goal <= 0)
558     				break;
559     		}
560     		rover = k;
561     
562     		if (goal <= 0)
563     			goto work_done;
564     
565     		/* Goal is not achieved. We stop process if:
566     
567     		   - if expire reduced to zero. Otherwise, expire is halfed.
568     		   - if table is not full.
569     		   - if we are called from interrupt.
570     		   - jiffies check is just fallback/debug loop breaker.
571     		     We will not spin here for long time in any case.
572     		 */
573     
574     		if (expire == 0)
575     			break;
576     
577     		expire >>= 1;
578     #if RT_CACHE_DEBUG >= 2
579     		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
580     				atomic_read(&ipv4_dst_ops.entries), goal, i);
581     #endif
582     
583     		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
584     			goto out;
585     	} while (!in_softirq() && jiffies - now < 1);
586     
587     	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
588     		goto out;
589     	if (net_ratelimit())
590     		printk("dst cache overflow\n");
591     	return 1;
592     
593     work_done:
594     	expire += ip_rt_gc_min_interval;
595     	if (expire > ip_rt_gc_timeout ||
596     	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
597     		expire = ip_rt_gc_timeout;
598     #if RT_CACHE_DEBUG >= 2
599     	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
600     			atomic_read(&ipv4_dst_ops.entries), goal, rover);
601     #endif
602     out:	return 0;
603     }
604     
605     static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
606     {
607     	struct rtable	*rth, **rthp;
608     	unsigned long	now = jiffies;
609     	int attempts = !in_softirq();
610     
611     restart:
612     	rthp = &rt_hash_table[hash].chain;
613     
614     	write_lock_bh(&rt_hash_table[hash].lock);
615     	while ((rth = *rthp) != NULL) {
616     		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
617     			/* Put it first */
618     			*rthp = rth->u.rt_next;
619     			rth->u.rt_next = rt_hash_table[hash].chain;
620     			rt_hash_table[hash].chain = rth;
621     
622     			rth->u.dst.__use++;
623     			dst_hold(&rth->u.dst);
624     			rth->u.dst.lastuse = now;
625     			write_unlock_bh(&rt_hash_table[hash].lock);
626     
627     			rt_drop(rt);
628     			*rp = rth;
629     			return 0;
630     		}
631     
632     		rthp = &rth->u.rt_next;
633     	}
634     
635     	/* Try to bind route to arp only if it is output
636     	   route or unicast forwarding path.
637     	 */
638     	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
639     		int err = arp_bind_neighbour(&rt->u.dst);
640     		if (err) {
641     			write_unlock_bh(&rt_hash_table[hash].lock);
642     
643     			if (err != -ENOBUFS) {
644     				rt_drop(rt);
645     				return err;
646     			}
647     
648     			/* Neighbour tables are full and nothing
649     			   can be released. Try to shrink route cache,
650     			   it is most likely it holds some neighbour records.
651     			 */
652     			if (attempts-- > 0) {
653     				int saved_elasticity = ip_rt_gc_elasticity;
654     				int saved_int = ip_rt_gc_min_interval;
655     				ip_rt_gc_elasticity	= 1;
656     				ip_rt_gc_min_interval	= 0;
657     				rt_garbage_collect();
658     				ip_rt_gc_min_interval	= saved_int;
659     				ip_rt_gc_elasticity	= saved_elasticity;
660     				goto restart;
661     			}
662     
663     			if (net_ratelimit())
664     				printk("Neighbour table overflow.\n");
665     			rt_drop(rt);
666     			return -ENOBUFS;
667     		}
668     	}
669     
670     	rt->u.rt_next = rt_hash_table[hash].chain;
671     #if RT_CACHE_DEBUG >= 2
672     	if (rt->u.rt_next) {
673     		struct rtable *trt;
674     		printk("rt_cache @%02x: %u.%u.%u.%u", hash,
675     				NIPQUAD(rt->rt_dst));
676     		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
677     			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
678     		printk("\n");
679     	}
680     #endif
681     	rt_hash_table[hash].chain = rt;
682     	write_unlock_bh(&rt_hash_table[hash].lock);
683     	*rp = rt;
684     	return 0;
685     }
686     
687     void rt_bind_peer(struct rtable *rt, int create)
688     {
689     	static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
690     	struct inet_peer *peer;
691     
692     	peer = inet_getpeer(rt->rt_dst, create);
693     
694     	spin_lock_bh(&rt_peer_lock);
695     	if (rt->peer == NULL) {
696     		rt->peer = peer;
697     		peer = NULL;
698     	}
699     	spin_unlock_bh(&rt_peer_lock);
700     	if (peer)
701     		inet_putpeer(peer);
702     }
703     
704     /*
705      * Peer allocation may fail only in serious out-of-memory conditions.  However
706      * we still can generate some output.
707      * Random ID selection looks a bit dangerous because we have no chances to
708      * select ID being unique in a reasonable period of time.
709      * But broken packet identifier may be better than no packet at all.
710      */
711     static void ip_select_fb_ident(struct iphdr *iph)
712     {
713     	static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
714     	static u32 ip_fallback_id;
715     	u32 salt;
716     
717     	spin_lock_bh(&ip_fb_id_lock);
718     	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
719     	iph->id = htons(salt & 0xFFFF);
720     	ip_fallback_id = salt;
721     	spin_unlock_bh(&ip_fb_id_lock);
722     }
723     
724     void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
725     {
726     	struct rtable *rt = (struct rtable *) dst;
727     
728     	if (rt) {
729     		if (rt->peer == NULL)
730     			rt_bind_peer(rt, 1);
731     
732     		/* If peer is attached to destination, it is never detached,
733     		   so that we need not to grab a lock to dereference it.
734     		 */
735     		if (rt->peer) {
736     			iph->id = htons(inet_getid(rt->peer));
737     			return;
738     		}
739     	} else
740     		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
741     
742     	ip_select_fb_ident(iph);
743     }
744     
745     static void rt_del(unsigned hash, struct rtable *rt)
746     {
747     	struct rtable **rthp;
748     
749     	write_lock_bh(&rt_hash_table[hash].lock);
750     	ip_rt_put(rt);
751     	for (rthp = &rt_hash_table[hash].chain; *rthp;
752     	     rthp = &(*rthp)->u.rt_next)
753     		if (*rthp == rt) {
754     			*rthp = rt->u.rt_next;
755     			rt_free(rt);
756     			break;
757     		}
758     	write_unlock_bh(&rt_hash_table[hash].lock);
759     }
760     
761     void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
762     		    u32 saddr, u8 tos, struct net_device *dev)
763     {
764     	int i, k;
765     	struct in_device *in_dev = in_dev_get(dev);
766     	struct rtable *rth, **rthp;
767     	u32  skeys[2] = { saddr, 0 };
768     	int  ikeys[2] = { dev->ifindex, 0 };
769     
770     	tos &= IPTOS_RT_MASK;
771     
772     	if (!in_dev)
773     		return;
774     
775     	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
776     	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
777     		goto reject_redirect;
778     
779     	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
780     		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
781     			goto reject_redirect;
782     		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
783     			goto reject_redirect;
784     	} else {
785     		if (inet_addr_type(new_gw) != RTN_UNICAST)
786     			goto reject_redirect;
787     	}
788     
789     	for (i = 0; i < 2; i++) {
790     		for (k = 0; k < 2; k++) {
791     			unsigned hash = rt_hash_code(daddr,
792     						     skeys[i] ^ (ikeys[k] << 5),
793     						     tos);
794     
795     			rthp=&rt_hash_table[hash].chain;
796     
797     			read_lock(&rt_hash_table[hash].lock);
798     			while ((rth = *rthp) != NULL) {
799     				struct rtable *rt;
800     
801     				if (rth->key.dst != daddr ||
802     				    rth->key.src != skeys[i] ||
803     				    rth->key.tos != tos ||
804     				    rth->key.oif != ikeys[k] ||
805     				    rth->key.iif != 0) {
806     					rthp = &rth->u.rt_next;
807     					continue;
808     				}
809     
810     				if (rth->rt_dst != daddr ||
811     				    rth->rt_src != saddr ||
812     				    rth->u.dst.error ||
813     				    rth->rt_gateway != old_gw ||
814     				    rth->u.dst.dev != dev)
815     					break;
816     
817     				dst_clone(&rth->u.dst);
818     				read_unlock(&rt_hash_table[hash].lock);
819     
820     				rt = dst_alloc(&ipv4_dst_ops);
821     				if (rt == NULL) {
822     					ip_rt_put(rth);
823     					in_dev_put(in_dev);
824     					return;
825     				}
826     
827     				/* Copy all the information. */
828     				*rt = *rth;
829     				rt->u.dst.__use		= 1;
830     				atomic_set(&rt->u.dst.__refcnt, 1);
831     				if (rt->u.dst.dev)
832     					dev_hold(rt->u.dst.dev);
833     				rt->u.dst.lastuse	= jiffies;
834     				rt->u.dst.neighbour	= NULL;
835     				rt->u.dst.hh		= NULL;
836     				rt->u.dst.obsolete	= 0;
837     
838     				rt->rt_flags		|= RTCF_REDIRECTED;
839     
840     				/* Gateway is different ... */
841     				rt->rt_gateway		= new_gw;
842     
843     				/* Redirect received -> path was valid */
844     				dst_confirm(&rth->u.dst);
845     
846     				if (rt->peer)
847     					atomic_inc(&rt->peer->refcnt);
848     
849     				if (arp_bind_neighbour(&rt->u.dst) ||
850     				    !(rt->u.dst.neighbour->nud_state &
851     					    NUD_VALID)) {
852     					if (rt->u.dst.neighbour)
853     						neigh_event_send(rt->u.dst.neighbour, NULL);
854     					ip_rt_put(rth);
855     					rt_drop(rt);
856     					goto do_next;
857     				}
858     
859     				rt_del(hash, rth);
860     				if (!rt_intern_hash(hash, rt, &rt))
861     					ip_rt_put(rt);
862     				goto do_next;
863     			}
864     			read_unlock(&rt_hash_table[hash].lock);
865     		do_next:
866     			;
867     		}
868     	}
869     	in_dev_put(in_dev);
870     	return;
871     
872     reject_redirect:
873     #ifdef CONFIG_IP_ROUTE_VERBOSE
874     	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
875     		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
876     			"%u.%u.%u.%u ignored.\n"
877     			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
878     			"tos %02x\n",
879     		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
880     		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
881     #endif
882     	in_dev_put(in_dev);
883     }
884     
885     static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
886     {
887     	struct rtable *rt = (struct rtable*)dst;
888     	struct dst_entry *ret = dst;
889     
890     	if (rt) {
891     		if (dst->obsolete) {
892     			ip_rt_put(rt);
893     			ret = NULL;
894     		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
895     			   rt->u.dst.expires) {
896     			unsigned hash = rt_hash_code(rt->key.dst,
897     						     rt->key.src ^
898     							(rt->key.oif << 5),
899     						     rt->key.tos);
900     #if RT_CACHE_DEBUG >= 1
901     			printk(KERN_DEBUG "ip_rt_advice: redirect to "
902     					  "%u.%u.%u.%u/%02x dropped\n",
903     				NIPQUAD(rt->rt_dst), rt->key.tos);
904     #endif
905     			rt_del(hash, rt);
906     			ret = NULL;
907     		}
908     	}
909     	return ret;
910     }
911     
912     /*
913      * Algorithm:
914      *	1. The first ip_rt_redirect_number redirects are sent
915      *	   with exponential backoff, then we stop sending them at all,
916      *	   assuming that the host ignores our redirects.
917      *	2. If we did not see packets requiring redirects
918      *	   during ip_rt_redirect_silence, we assume that the host
919      *	   forgot redirected route and start to send redirects again.
920      *
921      * This algorithm is much cheaper and more intelligent than dumb load limiting
922      * in icmp.c.
923      *
924      * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
925      * and "frag. need" (breaks PMTU discovery) in icmp.c.
926      */
927     
928     void ip_rt_send_redirect(struct sk_buff *skb)
929     {
930     	struct rtable *rt = (struct rtable*)skb->dst;
931     	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
932     
933     	if (!in_dev)
934     		return;
935     
936     	if (!IN_DEV_TX_REDIRECTS(in_dev))
937     		goto out;
938     
939     	/* No redirected packets during ip_rt_redirect_silence;
940     	 * reset the algorithm.
941     	 */
942     	if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
943     		rt->u.dst.rate_tokens = 0;
944     
945     	/* Too many ignored redirects; do not send anything
946     	 * set u.dst.rate_last to the last seen redirected packet.
947     	 */
948     	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
949     		rt->u.dst.rate_last = jiffies;
950     		goto out;
951     	}
952     
953     	/* Check for load limit; set rate_last to the latest sent
954     	 * redirect.
955     	 */
956     	if (jiffies - rt->u.dst.rate_last >
957     	    (ip_rt_redirect_load << rt->u.dst.rate_tokens)) {
958     		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
959     		rt->u.dst.rate_last = jiffies;
960     		++rt->u.dst.rate_tokens;
961     #ifdef CONFIG_IP_ROUTE_VERBOSE
962     		if (IN_DEV_LOG_MARTIANS(in_dev) &&
963     		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
964     		    net_ratelimit())
965     			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
966     				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
967     				NIPQUAD(rt->rt_src), rt->rt_iif,
968     				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
969     #endif
970     	}
971     out:
972             in_dev_put(in_dev);
973     }
974     
975     static int ip_error(struct sk_buff *skb)
976     {
977     	struct rtable *rt = (struct rtable*)skb->dst;
978     	unsigned long now;
979     	int code;
980     
981     	switch (rt->u.dst.error) {
982     		case EINVAL:
983     		default:
984     			goto out;
985     		case EHOSTUNREACH:
986     			code = ICMP_HOST_UNREACH;
987     			break;
988     		case ENETUNREACH:
989     			code = ICMP_NET_UNREACH;
990     			break;
991     		case EACCES:
992     			code = ICMP_PKT_FILTERED;
993     			break;
994     	}
995     
996     	now = jiffies;
997     	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
998     	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
999     		rt->u.dst.rate_tokens = ip_rt_error_burst;
1000     	rt->u.dst.rate_last = now;
1001     	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1002     		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1003     		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1004     	}
1005     
1006     out:	kfree_skb(skb);
1007     	return 0;
1008     } 
1009     
1010     /*
1011      *	The last two values are not from the RFC but
1012      *	are needed for AMPRnet AX.25 paths.
1013      */
1014     
/* Candidate MTU values, sorted in strictly descending order. Read-only. */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Pick the next MTU to try below @old_mtu.
 *
 * Returns the largest plateau value that is strictly smaller than
 * @old_mtu, or 68 when @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
1027     
1028     unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1029     {
1030     	int i;
1031     	unsigned short old_mtu = ntohs(iph->tot_len);
1032     	struct rtable *rth;
1033     	u32  skeys[2] = { iph->saddr, 0, };
1034     	u32  daddr = iph->daddr;
1035     	u8   tos = iph->tos & IPTOS_RT_MASK;
1036     	unsigned short est_mtu = 0;
1037     
1038     	if (ipv4_config.no_pmtu_disc)
1039     		return 0;
1040     
1041     	for (i = 0; i < 2; i++) {
1042     		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1043     
1044     		read_lock(&rt_hash_table[hash].lock);
1045     		for (rth = rt_hash_table[hash].chain; rth;
1046     		     rth = rth->u.rt_next) {
1047     			if (rth->key.dst == daddr &&
1048     			    rth->key.src == skeys[i] &&
1049     			    rth->rt_dst  == daddr &&
1050     			    rth->rt_src  == iph->saddr &&
1051     			    rth->key.tos == tos &&
1052     			    rth->key.iif == 0 &&
1053     			    !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
1054     				unsigned short mtu = new_mtu;
1055     
1056     				if (new_mtu < 68 || new_mtu >= old_mtu) {
1057     
1058     					/* BSD 4.2 compatibility hack :-( */
1059     					if (mtu == 0 &&
1060     					    old_mtu >= rth->u.dst.pmtu &&
1061     					    old_mtu >= 68 + (iph->ihl << 2))
1062     						old_mtu -= iph->ihl << 2;
1063     
1064     					mtu = guess_mtu(old_mtu);
1065     				}
1066     				if (mtu <= rth->u.dst.pmtu) {
1067     					if (mtu < rth->u.dst.pmtu) { 
1068     						dst_confirm(&rth->u.dst);
1069     						if (mtu < ip_rt_min_pmtu) {
1070     							mtu = ip_rt_min_pmtu;
1071     							rth->u.dst.mxlock |=
1072     								(1 << RTAX_MTU);
1073     						}
1074     						rth->u.dst.pmtu = mtu;
1075     						dst_set_expires(&rth->u.dst,
1076     							ip_rt_mtu_expires);
1077     					}
1078     					est_mtu = mtu;
1079     				}
1080     			}
1081     		}
1082     		read_unlock(&rt_hash_table[hash].lock);
1083     	}
1084     	return est_mtu ? : new_mtu;
1085     }
1086     
1087     void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
1088     {
1089     	if (dst->pmtu > mtu && mtu >= 68 &&
1090     	    !(dst->mxlock & (1 << RTAX_MTU))) {
1091     		if (mtu < ip_rt_min_pmtu) {
1092     			mtu = ip_rt_min_pmtu;
1093     			dst->mxlock |= (1 << RTAX_MTU);
1094     		}
1095     		dst->pmtu = mtu;
1096     		dst_set_expires(dst, ip_rt_mtu_expires);
1097     	}
1098     }
1099     
/*
 * Releases the reference on @dst with dst_release() and always returns
 * NULL.  @cookie is not used.
 * NOTE(review): presumably installed as a callback in ipv4_dst_ops; the
 * ops table is not visible here -- confirm.
 */
1100     static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1101     {
1102     	dst_release(dst);
1103     	return NULL;
1104     }
1105     
/*
 * Always returns NULL; neither @dst nor @skb is touched.
 * NOTE(review): presumably installed as a callback in ipv4_dst_ops; the
 * ops table is not visible here -- confirm.
 */
1106     static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
1107     					  struct sk_buff *skb)
1108     {
1109     	return NULL;
1110     }
1111     
1112     static void ipv4_dst_destroy(struct dst_entry *dst)
1113     {
1114     	struct rtable *rt = (struct rtable *) dst;
1115     	struct inet_peer *peer = rt->peer;
1116     
1117     	if (peer) {
1118     		rt->peer = NULL;
1119     		inet_putpeer(peer);
1120     	}
1121     }
1122     
1123     static void ipv4_link_failure(struct sk_buff *skb)
1124     {
1125     	struct rtable *rt;
1126     
1127     	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1128     
1129     	rt = (struct rtable *) skb->dst;
1130     	if (rt)
1131     		dst_set_expires(&rt->u.dst, 0);
1132     }
1133     
1134     static int ip_rt_bug(struct sk_buff *skb)
1135     {
1136     	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1137     		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1138     		skb->dev ? skb->dev->name : "?");
1139     	kfree_skb(skb);
1140     	return 0;
1141     }
1142     
1143     /*
1144        We do not cache the source address of the outgoing interface,
1145        because it is used only by the IP RR, TS and SRR options,
1146        so it is off the fast path.
1147     
1148        BTW remember: "addr" is allowed to be unaligned
1149        in IP options!
1150      */
1151     
1152     void ip_rt_get_source(u8 *addr, struct rtable *rt)
1153     {
1154     	u32 src;
1155     	struct fib_result res;
1156     
1157     	if (rt->key.iif == 0)
1158     		src = rt->rt_src;
1159     	else if (fib_lookup(&rt->key, &res) == 0) {
1160     #ifdef CONFIG_IP_ROUTE_NAT
1161     		if (res.type == RTN_NAT)
1162     			src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1163     						RT_SCOPE_UNIVERSE);
1164     		else
1165     #endif
1166     			src = FIB_RES_PREFSRC(res);
1167     		fib_res_put(&res);
1168     	} else
1169     		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1170     					RT_SCOPE_UNIVERSE);
1171     	memcpy(addr, &src, 4);
1172     }
1173     
1174     #ifdef CONFIG_NET_CLS_ROUTE
1175     static void set_class_tag(struct rtable *rt, u32 tag)
1176     {
1177     	if (!(rt->u.dst.tclassid & 0xFFFF))
1178     		rt->u.dst.tclassid |= tag & 0xFFFF;
1179     	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1180     		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1181     }
1182     #endif
1183     
1184     static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1185     {
1186     	struct fib_info *fi = res->fi;
1187     
1188     	if (fi) {
1189     		if (FIB_RES_GW(*res) &&
1190     		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1191     			rt->rt_gateway = FIB_RES_GW(*res);
1192     		memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
1193     			sizeof(fi->fib_metrics));
1194     		if (fi->fib_mtu == 0) {
1195     			rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1196     			if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
1197     			    rt->rt_gateway != rt->rt_dst &&
1198     			    rt->u.dst.pmtu > 576)
1199     				rt->u.dst.pmtu = 576;
1200     		}
1201     #ifdef CONFIG_NET_CLS_ROUTE
1202     		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1203     #endif
1204     	} else
1205     		rt->u.dst.pmtu	= rt->u.dst.dev->mtu;
1206     
1207     	if (rt->u.dst.pmtu > IP_MAX_MTU)
1208     		rt->u.dst.pmtu = IP_MAX_MTU;
1209     	if (rt->u.dst.advmss == 0)
1210     		rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1211     				       ip_rt_min_advmss);
1212     	if (rt->u.dst.advmss > 65535 - 40)
1213     		rt->u.dst.advmss = 65535 - 40;
1214     
1215     #ifdef CONFIG_NET_CLS_ROUTE
1216     #ifdef CONFIG_IP_MULTIPLE_TABLES
1217     	set_class_tag(rt, fib_rules_tclass(res));
1218     #endif
1219     	set_class_tag(rt, itag);
1220     #endif
1221             rt->rt_type = res->type;
1222     }
1223     
1224     static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1225     				u8 tos, struct net_device *dev, int our)
1226     {
1227     	unsigned hash;
1228     	struct rtable *rth;
1229     	u32 spec_dst;
1230     	struct in_device *in_dev = in_dev_get(dev);
1231     	u32 itag = 0;
1232     
1233     	/* Primary sanity checks. */
1234     
1235     	if (in_dev == NULL)
1236     		return -EINVAL;
1237     
1238     	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1239     	    skb->protocol != __constant_htons(ETH_P_IP))
1240     		goto e_inval;
1241     
1242     	if (ZERONET(saddr)) {
1243     		if (!LOCAL_MCAST(daddr))
1244     			goto e_inval;
1245     		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1246     	} else if (fib_validate_source(saddr, 0, tos, 0,
1247     					dev, &spec_dst, &itag) < 0)
1248     		goto e_inval;
1249     
1250     	rth = dst_alloc(&ipv4_dst_ops);
1251     	if (!rth)
1252     		goto e_nobufs;
1253     
1254     	rth->u.dst.output= ip_rt_bug;
1255     
1256     	atomic_set(&rth->u.dst.__refcnt, 1);
1257     	rth->u.dst.flags= DST_HOST;
1258     	rth->key.dst	= daddr;
1259     	rth->rt_dst	= daddr;
1260     	rth->key.tos	= tos;
1261     #ifdef CONFIG_IP_ROUTE_FWMARK
1262     	rth->key.fwmark	= skb->nfmark;
1263     #endif
1264     	rth->key.src	= saddr;
1265     	rth->rt_src	= saddr;
1266     #ifdef CONFIG_IP_ROUTE_NAT
1267     	rth->rt_dst_map	= daddr;
1268     	rth->rt_src_map	= saddr;
1269     #endif
1270     #ifdef CONFIG_NET_CLS_ROUTE
1271     	rth->u.dst.tclassid = itag;
1272     #endif
1273     	rth->rt_iif	=
1274     	rth->key.iif	= dev->ifindex;
1275     	rth->u.dst.dev	= &loopback_dev;
1276     	dev_hold(rth->u.dst.dev);
1277     	rth->key.oif	= 0;
1278     	rth->rt_gateway	= daddr;
1279     	rth->rt_spec_dst= spec_dst;
1280     	rth->rt_type	= RTN_MULTICAST;
1281     	rth->rt_flags	= RTCF_MULTICAST;
1282     	if (our) {
1283     		rth->u.dst.input= ip_local_deliver;
1284     		rth->rt_flags |= RTCF_LOCAL;
1285     	}
1286     
1287     #ifdef CONFIG_IP_MROUTE
1288     	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1289     		rth->u.dst.input = ip_mr_input;
1290     #endif
1291     	rt_cache_stat[smp_processor_id()].in_slow_mc++;
1292     
1293     	in_dev_put(in_dev);
1294     	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1295     	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1296     
1297     e_nobufs:
1298     	in_dev_put(in_dev);
1299     	return -ENOBUFS;
1300     
1301     e_inval:
1302     	in_dev_put(in_dev);
1303     	return -EINVAL;
1304     }
1305     
1306     /*
1307      *	NOTE. We drop all packets that have local source
1308      *	addresses, because every properly looped back packet
1309      *	must already have the correct destination attached by the output routine.
1310      *
1311      *	This approach solves two big problems:
1312      *	1. Non-simplex devices are handled properly.
1313      *	2. IP spoofing attempts are filtered with a 100% guarantee.
1314      */
1315     
/*
 * Slow path of input routing: resolve a route for a packet received on
 * @dev with the given @daddr, @saddr and @tos, build a new route cache
 * entry and insert it with rt_intern_hash(), which receives &skb->dst
 * as its output location.
 *
 * Three kinds of entries are built here:
 *   - forwarding entries (input = ip_forward, output = ip_output),
 *   - local/broadcast entries (input = ip_local_deliver), and
 *   - unreachable entries (input = ip_error) when the FIB lookup fails
 *     on a device with forwarding enabled.
 *
 * Returns the result of rt_intern_hash() when an entry was built,
 * -EINVAL when @dev has no in_device, for martian addresses and for
 * other rejected packets (martians are not cached), and -ENOBUFS when
 * dst_alloc() fails.
 */
1316     int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1317     			u8 tos, struct net_device *dev)
1318     {
1319     	struct rt_key	key;
1320     	struct fib_result res;
1321     	struct in_device *in_dev = in_dev_get(dev);
1322     	struct in_device *out_dev = NULL;
1323     	unsigned	flags = 0;
1324     	u32		itag = 0;
1325     	struct rtable * rth;
1326     	unsigned	hash;
1327     	u32		spec_dst;
1328     	int		err = -EINVAL;
1329     	int		free_res = 0;	/* set while res must be released with fib_res_put() */
1330     
1331     	/* IP on this device is disabled. */
1332     
1333     	if (!in_dev)
1334     		goto out;
1335     
1336     	key.dst		= daddr;
1337     	key.src		= saddr;
1338     	key.tos		= tos;
1339     #ifdef CONFIG_IP_ROUTE_FWMARK
1340     	key.fwmark	= skb->nfmark;
1341     #endif
1342     	key.iif		= dev->ifindex;
1343     	key.oif		= 0;
1344     	key.scope	= RT_SCOPE_UNIVERSE;
1345     
1346     	hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
1347     
1348     	/* Check for the most weird martians, which can be not detected
1349     	   by fib_lookup.
1350     	 */
1351     
1352     	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1353     		goto martian_source;
1354     
1355     	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1356     		goto brd_input;
1357     
1358     	/* Accept zero addresses only to limited broadcast;
1359     	 * I even do not know to fix it or not. Waiting for complains :-)
1360     	 */
1361     	if (ZERONET(saddr))
1362     		goto martian_source;
1363     
1364     	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1365     		goto martian_destination;
1366     
1367     	/*
1368     	 *	Now we are ready to route packet.
1369     	 */
1370     	if ((err = fib_lookup(&key, &res)) != 0) {
1371     		if (!IN_DEV_FORWARD(in_dev))
1372     			goto e_inval;
1373     		goto no_route;
1374     	}
1375     	free_res = 1;
1376     
1377     	rt_cache_stat[smp_processor_id()].in_slow_tot++;
1378     
1379     #ifdef CONFIG_IP_ROUTE_NAT
1380     	/* Policy is applied before mapping destination,
1381     	   but rerouting after map should be made with old source.
1382     	 */
1383     
1384     	if (1) {
1385     		u32 src_map = saddr;
1386     		if (res.r)
1387     			src_map = fib_rules_policy(saddr, &res, &flags);
1388     
1389     		if (res.type == RTN_NAT) {
1390     			key.dst = fib_rules_map_destination(daddr, &res);
1391     			fib_res_put(&res);
1392     			free_res = 0;
1393     			if (fib_lookup(&key, &res))
1394     				goto e_inval;
1395     			free_res = 1;
1396     			if (res.type != RTN_UNICAST)
1397     				goto e_inval;
1398     			flags |= RTCF_DNAT;
1399     		}
1400     		key.src = src_map;
1401     	}
1402     #endif
1403     
1404     	if (res.type == RTN_BROADCAST)
1405     		goto brd_input;
1406     
1407     	if (res.type == RTN_LOCAL) {
1408     		int result;
1409     		result = fib_validate_source(saddr, daddr, tos,
1410     					     loopback_dev.ifindex,
1411     					     dev, &spec_dst, &itag);
1412     		if (result < 0)
1413     			goto martian_source;
1414     		if (result)
1415     			flags |= RTCF_DIRECTSRC;
1416     		spec_dst = daddr;
1417     		goto local_input;
1418     	}
1419     
         	/* From here on the packet would have to be forwarded. */
1420     	if (!IN_DEV_FORWARD(in_dev))
1421     		goto e_inval;
1422     	if (res.type != RTN_UNICAST)
1423     		goto martian_destination;
1424     
1425     #ifdef CONFIG_IP_ROUTE_MULTIPATH
1426     	if (res.fi->fib_nhs > 1 && key.oif == 0)
1427     		fib_select_multipath(&key, &res);
1428     #endif
1429     	out_dev = in_dev_get(FIB_RES_DEV(res));
1430     	if (out_dev == NULL) {
1431     		if (net_ratelimit())
1432     			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1433     					 "Please, report\n");
1434     		goto e_inval;
1435     	}
1436     
1437     	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1438     				  &spec_dst, &itag);
1439     	if (err < 0)
1440     		goto martian_source;
1441     
1442     	if (err)
1443     		flags |= RTCF_DIRECTSRC;
1444     
         	/* The packet would leave through the in_device it arrived on:
         	 * mark the route so that a redirect can be sent.
         	 */
1445     	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1446     	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1447     	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1448     		flags |= RTCF_DOREDIRECT;
1449     
1450     	if (skb->protocol != __constant_htons(ETH_P_IP)) {
1451     		/* Not IP (i.e. ARP). Do not create route, if it is
1452     		 * invalid for proxy arp. DNAT routes are always valid.
1453     		 */
1454     		if (out_dev == in_dev && !(flags & RTCF_DNAT))
1455     			goto e_inval;
1456     	}
1457     
1458     	rth = dst_alloc(&ipv4_dst_ops);
1459     	if (!rth)
1460     		goto e_nobufs;
1461     
1462     	atomic_set(&rth->u.dst.__refcnt, 1);
1463     	rth->u.dst.flags= DST_HOST;
1464     	rth->key.dst	= daddr;
1465     	rth->rt_dst	= daddr;
1466     	rth->key.tos	= tos;
1467     #ifdef CONFIG_IP_ROUTE_FWMARK
1468     	rth->key.fwmark	= skb->nfmark;
1469     #endif
1470     	rth->key.src	= saddr;
1471     	rth->rt_src	= saddr;
1472     	rth->rt_gateway	= daddr;
1473     #ifdef CONFIG_IP_ROUTE_NAT
1474     	rth->rt_src_map	= key.src;
1475     	rth->rt_dst_map	= key.dst;
1476     	if (flags&RTCF_DNAT)
1477     		rth->rt_gateway	= key.dst;
1478     #endif
1479     	rth->rt_iif 	=
1480     	rth->key.iif	= dev->ifindex;
1481     	rth->u.dst.dev	= out_dev->dev;
1482     	dev_hold(rth->u.dst.dev);
1483     	rth->key.oif 	= 0;
1484     	rth->rt_spec_dst= spec_dst;
1485     
1486     	rth->u.dst.input = ip_forward;
1487     	rth->u.dst.output = ip_output;
1488     
1489     	rt_set_nexthop(rth, &res, itag);
1490     
1491     	rth->rt_flags = flags;
1492     
1493     #ifdef CONFIG_NET_FASTROUTE
1494     	if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1495     		struct net_device *odev = rth->u.dst.dev;
1496     		if (odev != dev &&
1497     		    dev->accept_fastpath &&
1498     		    odev->mtu >= dev->mtu &&
1499     		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
1500     			rth->rt_flags |= RTCF_FAST;
1501     	}
1502     #endif
1503     
         /* Insert the finished entry into the cache; shared by all three kinds. */
1504     intern:
1505     	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
         /* Common exit: drop the in_device references and the FIB result, if held. */
1506     done:
1507     	in_dev_put(in_dev);
1508     	if (out_dev)
1509     		in_dev_put(out_dev);
1510     	if (free_res)
1511     		fib_res_put(&res);
1512     out:	return err;
1513     
         /* Broadcast input: only IP packets are accepted. */
1514     brd_input:
1515     	if (skb->protocol != __constant_htons(ETH_P_IP))
1516     		goto e_inval;
1517     
1518     	if (ZERONET(saddr))
1519     		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1520     	else {
1521     		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1522     					  &itag);
1523     		if (err < 0)
1524     			goto martian_source;
1525     		if (err)
1526     			flags |= RTCF_DIRECTSRC;
1527     	}
1528     	flags |= RTCF_BROADCAST;
1529     	res.type = RTN_BROADCAST;
1530     	rt_cache_stat[smp_processor_id()].in_brd++;
1531     
         /* Build an entry on loopback_dev; also reached from brd_input and no_route. */
1532     local_input:
1533     	rth = dst_alloc(&ipv4_dst_ops);
1534     	if (!rth)
1535     		goto e_nobufs;
1536     
1537     	rth->u.dst.output= ip_rt_bug;
1538     
1539     	atomic_set(&rth->u.dst.__refcnt, 1);
1540     	rth->u.dst.flags= DST_HOST;
1541     	rth->key.dst	= daddr;
1542     	rth->rt_dst	= daddr;
1543     	rth->key.tos	= tos;
1544     #ifdef CONFIG_IP_ROUTE_FWMARK
1545     	rth->key.fwmark	= skb->nfmark;
1546     #endif
1547     	rth->key.src	= saddr;
1548     	rth->rt_src	= saddr;
1549     #ifdef CONFIG_IP_ROUTE_NAT
1550     	rth->rt_dst_map	= key.dst;
1551     	rth->rt_src_map	= key.src;
1552     #endif
1553     #ifdef CONFIG_NET_CLS_ROUTE
1554     	rth->u.dst.tclassid = itag;
1555     #endif
1556     	rth->rt_iif	=
1557     	rth->key.iif	= dev->ifindex;
1558     	rth->u.dst.dev	= &loopback_dev;
1559     	dev_hold(rth->u.dst.dev);
1560     	rth->key.oif 	= 0;
1561     	rth->rt_gateway	= daddr;
1562     	rth->rt_spec_dst= spec_dst;
1563     	rth->u.dst.input= ip_local_deliver;
1564     	rth->rt_flags 	= flags|RTCF_LOCAL;
         	/* Only the no_route path sets RTN_UNREACHABLE; err then still
         	 * holds the failed fib_lookup() result (presumably a negative
         	 * errno, negated here for ip_error() -- confirm).
         	 */
1565     	if (res.type == RTN_UNREACHABLE) {
1566     		rth->u.dst.input= ip_error;
1567     		rth->u.dst.error= -err;
1568     		rth->rt_flags 	&= ~RTCF_LOCAL;
1569     	}
1570     	rth->rt_type	= res.type;
1571     	goto intern;
1572     
         /* fib_lookup() failed on a forwarding device: cache an unreachable entry. */
1573     no_route:
1574     	rt_cache_stat[smp_processor_id()].in_no_route++;
1575     	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1576     	res.type = RTN_UNREACHABLE;
1577     	goto local_input;
1578     
1579     	/*
1580     	 *	Do not cache martian addresses: they should be logged (RFC1812)
1581     	 */
1582     martian_destination:
1583     	rt_cache_stat[smp_processor_id()].in_martian_dst++;
1584     #ifdef CONFIG_IP_ROUTE_VERBOSE
1585     	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1586     		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1587     			"%u.%u.%u.%u, dev %s\n",
1588     			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1589     #endif
1590     e_inval:
1591     	err = -EINVAL;
1592     	goto done;
1593     
1594     e_nobufs:
1595     	err = -ENOBUFS;
1596     	goto done;
1597     
1598     martian_source:
1599     
1600     	rt_cache_stat[smp_processor_id()].in_martian_src++;
1601     #ifdef CONFIG_IP_ROUTE_VERBOSE
1602     	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1603     		/*
1604     		 *	RFC1812 recommendation, if source is martian,
1605     		 *	the only hint is MAC header.
1606     		 */
         		/* Note the argument order: daddr is printed first, then saddr. */
1607     		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1608     			"%u.%u.%u.%u, on dev %s\n",
1609     			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1610     		if (dev->hard_header_len) {
1611     			int i;
1612     			unsigned char *p = skb->mac.raw;
1613     			printk(KERN_WARNING "ll header: ");
1614     			for (i = 0; i < dev->hard_header_len; i++, p++) {
1615     				printk("%02x", *p);
1616     				if (i < (dev->hard_header_len - 1))
1617     					printk(":");
1618     			}
1619     			printk("\n");
1620     		}
1621     	}
1622     #endif
1623     	goto e_inval;
1624     }
1625     
1626     int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1627     		   u8 tos, struct net_device *dev)
1628     {
1629     	struct rtable * rth;
1630     	unsigned	hash;
1631     	int iif = dev->ifindex;
1632     
1633     	tos &= IPTOS_RT_MASK;
1634     	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1635     
1636     	read_lock(&rt_hash_table[hash].lock);
1637     	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1638     		if (rth->key.dst == daddr &&
1639     		    rth->key.src == saddr &&
1640     		    rth->key.iif == iif &&
1641     		    rth->key.oif == 0 &&
1642     #ifdef CONFIG_IP_ROUTE_FWMARK
1643     		    rth->key.fwmark == skb->nfmark &&
1644     #endif
1645     		    rth->key.tos == tos) {
1646     			rth->u.dst.lastuse = jiffies;
1647     			dst_hold(&rth->u.dst);
1648     			rth->u.dst.__use++;
1649     			rt_cache_stat[smp_processor_id()].in_hit++;
1650     			read_unlock(&rt_hash_table[hash].lock);
1651     			skb->dst = (struct dst_entry*)rth;
1652     			return 0;
1653     		}
1654     	}
1655     	read_unlock(&rt_hash_table[hash].lock);
1656     
1657     	/* Multicast recognition logic is moved from route cache to here.
1658     	   The problem was that too many Ethernet cards have broken/missing
1659     	   hardware multicast filters :-( As result the host on multicasting
1660     	   network acquires a lot of useless route cache entries, sort of
1661     	   SDR messages from all the world. Now we try to get rid of them.
1662     	   Really, provided software IP multicast filter is organized
1663     	   reasonably (at least, hashed), it does not result in a slowdown
1664     	   comparing with route cache reject entries.
1665     	   Note, that multicast routers are not affected, because
1666     	   route cache entry is created eventually.
1667     	 */
1668     	if (MULTICAST(daddr)) {
1669     		struct in_device *in_dev;
1670     
1671     		read_lock(&inetdev_lock);
1672     		if ((in_dev = __in_dev_get(dev)) != NULL) {
1673     			int our = ip_check_mc(in_dev, daddr);
1674     			if (our
1675     #ifdef CONFIG_IP_MROUTE
1676     			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1677     #endif
1678     			    ) {
1679     				read_unlock(&inetdev_lock);
1680     				return ip_route_input_mc(skb, daddr, saddr,
1681     							 tos, dev, our);
1682     			}
1683     		}
1684     		read_unlock(&inetdev_lock);
1685     		return -EINVAL;
1686     	}
1687     	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1688     }
1689     
1690     /*
1691      * Major route resolver routine.
1692      */
1693     
1694     int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
1695     {
1696     	struct rt_key key;
1697     	struct fib_result res;
1698     	unsigned flags = 0;
1699     	struct rtable *rth;
1700     	struct net_device *dev_out = NULL;
1701     	unsigned hash;
1702     	int free_res = 0;
1703     	int err;
1704     	u32 tos;
1705     
1706     	tos		= oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
1707     	key.dst		= oldkey->dst;
1708     	key.src		= oldkey->src;
1709     	key.tos		= tos & IPTOS_RT_MASK;
1710     	key.iif		= loopback_dev.ifindex;
1711     	key.oif		= oldkey->oif;
1712     #ifdef CONFIG_IP_ROUTE_FWMARK
1713     	key.fwmark	= oldkey->fwmark;
1714     #endif
1715     	key.scope	= (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
1716     						RT_SCOPE_UNIVERSE;
1717     	res.fi		= NULL;
1718     #ifdef CONFIG_IP_MULTIPLE_TABLES
1719     	res.r		= NULL;
1720     #endif
1721     
1722     	if (oldkey->src) {
1723     		err = -EINVAL;
1724     		if (MULTICAST(oldkey->src) ||
1725     		    BADCLASS(oldkey->src) ||
1726     		    ZERONET(oldkey->src))
1727     			goto out;
1728     
1729     		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1730     		dev_out = ip_dev_find(oldkey->src);
1731     		if (dev_out == NULL)
1732     			goto out;
1733     
1734     		/* I removed check for oif == dev_out->oif here.
1735     		   It was wrong by three reasons:
1736     		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
1737     		      assigned to multiple interfaces.
1738     		   2. Moreover, we are allowed to send packets with saddr
1739     		      of another iface. --ANK
1740     		 */
1741     
1742     		if (oldkey->oif == 0
1743     		    && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
1744     			/* Special hack: user can direct multicasts
1745     			   and limited broadcast via necessary interface
1746     			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1747     			   This hack is not just for fun, it allows
1748     			   vic,vat and friends to work.
1749     			   They bind socket to loopback, set ttl to zero
1750     			   and expect that it will work.
1751     			   From the viewpoint of routing cache they are broken,
1752     			   because we are not allowed to build multicast path
1753     			   with loopback source addr (look, routing cache
1754     			   cannot know, that ttl is zero, so that packet
1755     			   will not leave this host and route is valid).
1756     			   Luckily, this hack is good workaround.
1757     			 */
1758     
1759     			key.oif = dev_out->ifindex;
1760     			goto make_route;
1761     		}
1762     		if (dev_out)
1763     			dev_put(dev_out);
1764     		dev_out = NULL;
1765     	}
1766     	if (oldkey->oif) {
1767     		dev_out = dev_get_by_index(oldkey->oif);
1768     		err = -ENODEV;
1769     		if (dev_out == NULL)
1770     			goto out;
1771     		if (__in_dev_get(dev_out) == NULL) {
1772     			dev_put(dev_out);
1773     			goto out;	/* Wrong error code */
1774     		}
1775     
1776     		if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
1777     			if (!key.src)
1778     				key.src = inet_select_addr(dev_out, 0,
1779     								RT_SCOPE_LINK);
1780     			goto make_route;
1781     		}
1782     		if (!key.src) {
1783     			if (MULTICAST(oldkey->dst))
1784     				key.src = inet_select_addr(dev_out, 0,
1785     								key.scope);
1786     			else if (!oldkey->dst)
1787     				key.src = inet_select_addr(dev_out, 0,
1788     								RT_SCOPE_HOST);
1789     		}
1790     	}
1791     
1792     	if (!key.dst) {
1793     		key.dst = key.src;
1794     		if (!key.dst)
1795     			key.dst = key.src = htonl(INADDR_LOOPBACK);
1796     		if (dev_out)
1797     			dev_put(dev_out);
1798     		dev_out = &loopback_dev;
1799     		dev_hold(dev_out);
1800     		key.oif = loopback_dev.ifindex;
1801     		res.type = RTN_LOCAL;
1802     		flags |= RTCF_LOCAL;
1803     		goto make_route;
1804     	}
1805     
1806     	if (fib_lookup(&key, &res)) {
1807     		res.fi = NULL;
1808     		if (oldkey->oif) {
1809     			/* Apparently, routing tables are wrong. Assume,
1810     			   that the destination is on link.
1811     
1812     			   WHY? DW.
1813     			   Because we are allowed to send to iface
1814     			   even if it has NO routes and NO assigned
1815     			   addresses. When oif is specified, routing
1816     			   tables are looked up with only one purpose:
1817     			   to catch if destination is gatewayed, rather than
1818     			   direct. Moreover, if MSG_DONTROUTE is set,
1819     			   we send packet, ignoring both routing tables
1820     			   and ifaddr state. --ANK
1821     
1822     
1823     			   We could make it even if oif is unknown,
1824     			   likely IPv6, but we do not.
1825     			 */
1826     
1827     			if (key.src == 0)
1828     				key.src = inet_select_addr(dev_out, 0,
1829     							   RT_SCOPE_LINK);
1830     			res.type = RTN_UNICAST;
1831     			goto make_route;
1832     		}
1833     		if (dev_out)
1834     			dev_put(dev_out);
1835     		err = -ENETUNREACH;
1836     		goto out;
1837     	}
1838     	free_res = 1;
1839     
1840     	if (res.type == RTN_NAT)
1841     		goto e_inval;
1842     
1843     	if (res.type == RTN_LOCAL) {
1844     		if (!key.src)
1845     			key.src = key.dst;
1846     		if (dev_out)
1847     			dev_put(dev_out);
1848     		dev_out = &loopback_dev;
1849     		dev_hold(dev_out);
1850     		key.oif = dev_out->ifindex;
1851     		if (res.fi)
1852     			fib_info_put(res.fi);
1853     		res.fi = NULL;
1854     		flags |= RTCF_LOCAL;
1855     		goto make_route;
1856     	}
1857     
1858     #ifdef CONFIG_IP_ROUTE_MULTIPATH
1859     	if (res.fi->fib_nhs > 1 && key.oif == 0)
1860     		fib_select_multipath(&key, &res);
1861     	else
1862     #endif
1863     	if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
1864     		fib_select_default(&key, &res);
1865     
1866     	if (!key.src)
1867     		key.src = FIB_RES_PREFSRC(res);
1868     
1869     	if (dev_out)
1870     		dev_put(dev_out);
1871     	dev_out = FIB_RES_DEV(res);
1872     	dev_hold(dev_out);
1873     	key.oif = dev_out->ifindex;
1874     
1875     make_route:
1876     	if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1877     		goto e_inval;
1878     
1879     	if (key.dst == 0xFFFFFFFF)
1880     		res.type = RTN_BROADCAST;
1881     	else if (MULTICAST(key.dst))
1882     		res.type = RTN_MULTICAST;
1883     	else if (BADCLASS(key.dst) || ZERONET(key.dst))
1884     		goto e_inval;
1885     
1886     	if (dev_out->flags & IFF_LOOPBACK)
1887     		flags |= RTCF_LOCAL;
1888     
1889     	if (res.type == RTN_BROADCAST) {
1890     		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1891     		if (res.fi) {
1892     			fib_info_put(res.fi);
1893     			res.fi = NULL;
1894     		}
1895     	} else if (res.type == RTN_MULTICAST) {
1896     		flags |= RTCF_MULTICAST|RTCF_LOCAL;
1897     		read_lock(&inetdev_lock);
1898     		if (!__in_dev_get(dev_out) ||
1899     		    !ip_check_mc(__in_dev_get(dev_out), oldkey->dst))
1900     			flags &= ~RTCF_LOCAL;
1901     		read_unlock(&inetdev_lock);
1902     		/* If multicast route do not exist use
1903     		   default one, but do not gateway in this case.
1904     		   Yes, it is hack.
1905     		 */
1906     		if (res.fi && res.prefixlen < 4) {
1907     			fib_info_put(res.fi);
1908     			res.fi = NULL;
1909     		}
1910     	}
1911     
1912     	rth = dst_alloc(&ipv4_dst_ops);
1913     	if (!rth)
1914     		goto e_nobufs;
1915     
1916     	atomic_set(&rth->u.dst.__refcnt, 1);
1917     	rth->u.dst.flags= DST_HOST;
1918     	rth->key.dst	= oldkey->dst;
1919     	rth->key.tos	= tos;
1920     	rth->key.src	= oldkey->src;
1921     	rth->key.iif	= 0;
1922     	rth->key.oif	= oldkey->oif;
1923     #ifdef CONFIG_IP_ROUTE_FWMARK
1924     	rth->key.fwmark	= oldkey->fwmark;
1925     #endif
1926     	rth->rt_dst	= key.dst;
1927     	rth->rt_src	= key.src;
1928     #ifdef CONFIG_IP_ROUTE_NAT
1929     	rth->rt_dst_map	= key.dst;
1930     	rth->rt_src_map	= key.src;
1931     #endif
1932     	rth->rt_iif	= oldkey->oif ? : dev_out->ifindex;
1933     	rth->u.dst.dev	= dev_out;
1934     	dev_hold(dev_out);
1935     	rth->rt_gateway = key.dst;
1936     	rth->rt_spec_dst= key.src;
1937     
1938     	rth->u.dst.output=ip_output;
1939     
1940     	rt_cache_stat[smp_processor_id()].out_slow_tot++;
1941     
1942     	if (flags & RTCF_LOCAL) {
1943     		rth->u.dst.input = ip_local_deliver;
1944     		rth->rt_spec_dst = key.dst;
1945     	}
1946     	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1947     		rth->rt_spec_dst = key.src;
1948     		if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
1949     			rth->u.dst.output = ip_mc_output;
1950     			rt_cache_stat[smp_processor_id()].out_slow_mc++;
1951     		}
1952     #ifdef CONFIG_IP_MROUTE
1953     		if (res.type == RTN_MULTICAST) {
1954     			struct in_device *in_dev = in_dev_get(dev_out);
1955     			if (in_dev) {
1956     				if (IN_DEV_MFORWARD(in_dev) &&
1957     				    !LOCAL_MCAST(oldkey->dst)) {
1958     					rth->u.dst.input = ip_mr_input;
1959     					rth->u.dst.output = ip_mc_output;
1960     				}
1961     				in_dev_put(in_dev);
1962     			}
1963     		}
1964     #endif
1965     	}
1966     
1967     	rt_set_nexthop(rth, &res, 0);
1968     
1969     	rth->rt_flags = flags;
1970     
1971     	hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
1972     	err = rt_intern_hash(hash, rth, rp);
1973     done:
1974     	if (free_res)
1975     		fib_res_put(&res);
1976     	if (dev_out)
1977     		dev_put(dev_out);
1978     out:	return err;
1979     
1980     e_inval:
1981     	err = -EINVAL;
1982     	goto done;
1983     e_nobufs:
1984     	err = -ENOBUFS;
1985     	goto done;
1986     }
1987     
1988     int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
1989     {
1990     	unsigned hash;
1991     	struct rtable *rth;
1992     
1993     	hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
1994     
1995     	read_lock_bh(&rt_hash_table[hash].lock);
1996     	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1997     		if (rth->key.dst == key->dst &&
1998     		    rth->key.src == key->src &&
1999     		    rth->key.iif == 0 &&
2000     		    rth->key.oif == key->oif &&
2001     #ifdef CONFIG_IP_ROUTE_FWMARK
2002     		    rth->key.fwmark == key->fwmark &&
2003     #endif
2004     		    !((rth->key.tos ^ key->tos) &
2005     			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2006     		    ((key->tos & RTO_TPROXY) ||
2007     		     !(rth->rt_flags & RTCF_TPROXY))) {
2008     			rth->u.dst.lastuse = jiffies;
2009     			dst_hold(&rth->u.dst);
2010     			rth->u.dst.__use++;
2011     			rt_cache_stat[smp_processor_id()].out_hit++;
2012     			read_unlock_bh(&rt_hash_table[hash].lock);
2013     			*rp = rth;
2014     			return 0;
2015     		}
2016     	}
2017     	read_unlock_bh(&rt_hash_table[hash].lock);
2018     
2019     	return ip_route_output_slow(rp, key);
2020     }	
2021     
2022     #ifdef CONFIG_RTNETLINK
2023     static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2024     			int nowait)
2025     {
         	/*
         	 * Append one rtnetlink route message describing the cached route
         	 * attached to skb->dst at the tail of skb.
         	 *
         	 * pid, seq and event are handed to NLMSG_PUT for the message
         	 * header.  nowait is forwarded to ipmr_get_route(); when both
         	 * nowait and pid are non-zero the message is flagged NLM_F_MULTI.
         	 *
         	 * Returns skb->len once the message is complete, 0 when
         	 * ipmr_get_route() returned 0 with nowait clear, and -1 after
         	 * trimming skb back to where its tail was on entry.
         	 *
         	 * NOTE(review): NLMSG_PUT and RTA_PUT are macros defined elsewhere;
         	 * presumably they jump to nlmsg_failure / rtattr_failure when skb
         	 * has no room left -- confirm against their definitions.
         	 */
2026     	struct rtable *rt = (struct rtable*)skb->dst;
2027     	struct rtmsg *r;
2028     	struct nlmsghdr  *nlh;
         	/* Tail on entry: start of this message, used for length and rollback. */
2029     	unsigned char	 *b = skb->tail;
2030     	struct rta_cacheinfo ci;
2031     #ifdef CONFIG_IP_MROUTE
2032     	struct rtattr *eptr;
2033     #endif
2034     	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2035     	r = NLMSG_DATA(nlh);
2036     	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2037     	r->rtm_family	 = AF_INET;
2038     	r->rtm_dst_len	= 32;
2039     	r->rtm_src_len	= 0;
2040     	r->rtm_tos	= rt->key.tos;
2041     	r->rtm_table	= RT_TABLE_MAIN;
2042     	r->rtm_type	= rt->rt_type;
2043     	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2044     	r->rtm_protocol = RTPROT_UNSPEC;
         	/* Drop the low 16 bits of rt_flags and mark the route as cloned. */
2045     	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2046     	if (rt->rt_flags & RTCF_NOTIFY)
2047     		r->rtm_flags |= RTM_F_NOTIFY;
2048     	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
         	/* A source selector is reported only when the cache key has one. */
2049     	if (rt->key.src) {
2050     		r->rtm_src_len = 32;
2051     		RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
2052     	}
2053     	if (rt->u.dst.dev)
2054     		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2055     #ifdef CONFIG_NET_CLS_ROUTE
2056     	if (rt->u.dst.tclassid)
2057     		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2058     #endif
         	/*
         	 * Preferred source: entries with key.iif set report rt_spec_dst;
         	 * other entries report rt_src, but only when it differs from the
         	 * source in the key.
         	 */
2059     	if (rt->key.iif)
2060     		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2061     	else if (rt->rt_src != rt->key.src)
2062     		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
         	/* The gateway is reported only when it is not the destination itself. */
2063     	if (rt->rt_dst != rt->rt_gateway)
2064     		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2065     	if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
2066     		goto rtattr_failure;
         	/* Cache statistics; time values are relative to the current jiffies. */
2067     	ci.rta_lastuse	= jiffies - rt->u.dst.lastuse;
2068     	ci.rta_used	= rt->u.dst.__use;
2069     	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2070     	if (rt->u.dst.expires)
2071     		ci.rta_expires = rt->u.dst.expires - jiffies;
2072     	else
2073     		ci.rta_expires = 0;
2074     	ci.rta_error	= rt->u.dst.error;
2075     	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2076     	if (rt->peer) {
2077     		ci.rta_id = rt->peer->ip_id_count;
2078     		if (rt->peer->tcp_ts_stamp) {
2079     			ci.rta_ts = rt->peer->tcp_ts;
2080     			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2081     		}
2082     	}
2083     #ifdef CONFIG_IP_MROUTE
         	/* Remember where RTA_CACHEINFO starts so rta_error can be patched below. */
2084     	eptr = (struct rtattr*)skb->tail;
2085     #endif
2086     	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2087     	if (rt->key.iif) {
2088     #ifdef CONFIG_IP_MROUTE
2089     		u32 dst = rt->rt_dst;
2090     
         		/*
         		 * Multicast, non-link-local destination with multicast
         		 * forwarding enabled: let ipmr_get_route() add its data.
         		 * Any other entry with key.iif set just reports RTA_IIF.
         		 */
2091     		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2092     		    ipv4_devconf.mc_forwarding) {
2093     			int err = ipmr_get_route(skb, r, nowait);
2094     			if (err <= 0) {
2095     				if (!nowait) {
         					/* 0 is passed straight back to the caller. */
2096     					if (err == 0)
2097     						return 0;
2098     					goto nlmsg_failure;
2099     				} else {
2100     					if (err == -EMSGSIZE)
2101     						goto nlmsg_failure;
         					/* Other results are recorded in the cache info instead of failing. */
2102     					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2103     				}
2104     			}
2105     		} else
2106     #endif
2107     			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
2108     	}
2109     
         	/* Final message length covers everything appended since entry. */
2110     	nlh->nlmsg_len = skb->tail - b;
2111     	return skb->len;
2112     
2113     nlmsg_failure:
2114     rtattr_failure:
         	/* Roll the skb back to its state on entry. */
2115     	skb_trim(skb, b - skb->data);
2116     	return -1;
2117     }
2118     
2119     int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2120     {
2121     	struct rtattr **rta = arg;
2122     	struct rtmsg *rtm = NLMSG_DATA(nlh);
2123     	struct rtable *rt = NULL;
2124     	u32 dst = 0;
2125     	u32 src = 0;
2126     	int iif = 0;
2127     	int err = -ENOBUFS;
2128     	struct sk_buff *skb;
2129     
2130     	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2131     	if (!skb)
2132     		goto out;
2133     
2134     	/* Reserve room for dummy headers, this skb can pass
2135     	   through good chunk of routing engine.
2136     	 */
2137     	skb->mac.raw = skb->data;
2138     	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2139     
2140     	if (rta[RTA_SRC - 1])
2141     		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2142     	if (rta[RTA_DST - 1])
2143     		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2144     	if (rta[RTA_IIF - 1])
2145     		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2146     
2147     	if (iif) {
2148     		struct net_device *dev = __dev_get_by_index(iif);
2149     		err = -ENODEV;
2150     		if (!dev)
2151     			goto out;
2152     		skb->protocol	= __constant_htons(ETH_P_IP);
2153     		skb->dev	= dev;
2154     		local_bh_disable();
2155     		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2156     		local_bh_enable();
2157     		rt = (struct rtable*)skb->dst;
2158     		if (!err && rt->u.dst.error)
2159     			err = -rt->u.dst.error;
2160     	} else {
2161     		int oif = 0;
2162     		if (rta[RTA_OIF - 1])
2163     			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2164     		err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
2165     	}
2166     	if (err) {
2167     		kfree_skb(skb);
2168     		goto out;
2169     	}
2170     
2171     	skb->dst = &rt->u.dst;
2172     	if (rtm->rtm_flags & RTM_F_NOTIFY)
2173     		rt->rt_flags |= RTCF_NOTIFY;
2174     
2175     	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2176     
2177     	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2178     				RTM_NEWROUTE, 0);
2179     	if (!err)
2180     		goto out;
2181     	if (err < 0) {
2182     		err = -EMSGSIZE;
2183     		goto out;
2184     	}
2185     
2186     	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2187     	if (err > 0)
2188     		err = 0;
2189     out:	return err;
2190     }
2191     
2192     int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2193     {
2194     	struct rtable *rt;
2195     	int h, s_h;
2196     	int idx, s_idx;
2197     
2198     	s_h = cb->args[0];
2199     	s_idx = idx = cb->args[1];
2200     	for (h = 0; h <= rt_hash_mask; h++) {
2201     		if (h < s_h) continue;
2202     		if (h > s_h)
2203     			s_idx = 0;
2204     		read_lock_bh(&rt_hash_table[h].lock);
2205     		for (rt = rt_hash_table[h].chain, idx = 0; rt;
2206     		     rt = rt->u.rt_next, idx++) {
2207     			if (idx < s_idx)
2208     				continue;
2209     			skb->dst = dst_clone(&rt->u.dst);
2210     			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2211     					 cb->nlh->nlmsg_seq,
2212     					 RTM_NEWROUTE, 1) <= 0) {
2213     				dst_release(xchg(&skb->dst, NULL));
2214     				read_unlock_bh(&rt_hash_table[h].lock);
2215     				goto done;
2216     			}
2217     			dst_release(xchg(&skb->dst, NULL));
2218     		}
2219     		read_unlock_bh(&rt_hash_table[h].lock);
2220     	}
2221     
2222     done:
2223     	cb->args[0] = h;
2224     	cb->args[1] = idx;
2225     	return skb->len;
2226     }
2227     
2228     #endif /* CONFIG_RTNETLINK */
2229     
/*
 *	Multicast state of a device changed: flush the route cache with a
 *	delay argument of 0.  in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2234     
2235     #ifdef CONFIG_SYSCTL
2236     static int flush_delay;
2237     
2238     static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2239     					struct file *filp, void *buffer,
2240     					size_t *lenp)
2241     {
2242     	if (write) {
2243     		proc_dointvec(ctl, write, filp, buffer, lenp);
2244     		rt_cache_flush(flush_delay);
2245     		return 0;
2246     	} 
2247     
2248     	return -EINVAL;
2249     }
2250     
2251     static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2252     						int nlen, void *oldval,
2253     						size_t *oldlenp, void *newval,
2254     						size_t newlen, void **context)
2255     {
2256     	int delay;
2257     	if (newlen != sizeof(int))
2258     		return -EINVAL;
2259     	if (get_user(delay, (int *)newval))
2260     		return -EFAULT; 
2261     	rt_cache_flush(delay); 
2262     	return 0;
2263     }
2264     
2265     ctl_table ipv4_route_table[] = {
2266             {
2267     		ctl_name:	NET_IPV4_ROUTE_FLUSH,
2268     		procname:	"flush",
2269     		data:		&flush_delay,
2270     		maxlen:		sizeof(int),
2271     		mode:		0644,
2272     		proc_handler:	&ipv4_sysctl_rtcache_flush,
2273     		strategy:	&ipv4_sysctl_rtcache_flush_strategy,
2274     	},
2275     	{
2276     		ctl_name:	NET_IPV4_ROUTE_MIN_DELAY,
2277     		procname:	"min_delay",
2278     		data:		&ip_rt_min_delay,
2279     		maxlen:		sizeof(int),
2280     		mode:		0644,
2281     		proc_handler:	&proc_dointvec_jiffies,
2282     		strategy:	&sysctl_jiffies,
2283     	},
2284     	{
2285     		ctl_name:	NET_IPV4_ROUTE_MAX_DELAY,
2286     		procname:	"max_delay",
2287     		data:		&ip_rt_max_delay,
2288     		maxlen:		sizeof(int),
2289     		mode:		0644,
2290     		proc_handler:	&proc_dointvec_jiffies,
2291     		strategy:	&sysctl_jiffies,
2292     	},
2293     	{
2294     		ctl_name:	NET_IPV4_ROUTE_GC_THRESH,
2295     		procname:	"gc_thresh",
2296     		data:		&ipv4_dst_ops.gc_thresh,
2297     		maxlen:		sizeof(int),
2298     		mode:		0644,
2299     		proc_handler:	&proc_dointvec,
2300     	},
2301     	{
2302     		ctl_name:	NET_IPV4_ROUTE_MAX_SIZE,
2303     		procname:	"max_size",
2304     		data:		&ip_rt_max_size,
2305     		maxlen:		sizeof(int),
2306     		mode:		0644,
2307     		proc_handler:	&proc_dointvec,
2308     	},
2309     	{
2310     		ctl_name:	NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2311     		procname:	"gc_min_interval",
2312     		data:		&ip_rt_gc_min_interval,
2313     		maxlen:		sizeof(int),
2314     		mode:		0644,
2315     		proc_handler:	&proc_dointvec_jiffies,
2316     		strategy:	&sysctl_jiffies,
2317     	},
2318     	{
2319     		ctl_name:	NET_IPV4_ROUTE_GC_TIMEOUT,
2320     		procname:	"gc_timeout",
2321     		data:		&ip_rt_gc_timeout,
2322     		maxlen:		sizeof(int),
2323     		mode:		0644,
2324     		proc_handler:	&proc_dointvec_jiffies,
2325     		strategy:	&sysctl_jiffies,
2326     	},
2327     	{
2328     		ctl_name:	NET_IPV4_ROUTE_GC_INTERVAL,
2329     		procname:	"gc_interval",
2330     		data:		&ip_rt_gc_interval,
2331     		maxlen:		sizeof(int),
2332     		mode:		0644,
2333     		proc_handler:	&proc_dointvec_jiffies,
2334     		strategy:	&sysctl_jiffies,
2335     	},
2336     	{
2337     		ctl_name:	NET_IPV4_ROUTE_REDIRECT_LOAD,
2338     		procname:	"redirect_load",
2339     		data:		&ip_rt_redirect_load,
2340     		maxlen:		sizeof(int),
2341     		mode:		0644,
2342     		proc_handler:	&proc_dointvec,
2343     	},
2344     	{
2345     		ctl_name:	NET_IPV4_ROUTE_REDIRECT_NUMBER,
2346     		procname:	"redirect_number",
2347     		data:		&ip_rt_redirect_number,
2348     		maxlen:		sizeof(int),
2349     		mode:		0644,
2350     		proc_handler:	&proc_dointvec,
2351     	},
2352     	{
2353     		ctl_name:	NET_IPV4_ROUTE_REDIRECT_SILENCE,
2354     		procname:	"redirect_silence",
2355     		data:		&ip_rt_redirect_silence,
2356     		maxlen:		sizeof(int),
2357     		mode:		0644,
2358     		proc_handler:	&proc_dointvec,
2359     	},
2360     	{
2361     		ctl_name:	NET_IPV4_ROUTE_ERROR_COST,
2362     		procname:	"error_cost",
2363     		data:		&ip_rt_error_cost,
2364     		maxlen:		sizeof(int),
2365     		mode:		0644,
2366     		proc_handler:	&proc_dointvec,
2367     	},
2368     	{
2369     		ctl_name:	NET_IPV4_ROUTE_ERROR_BURST,
2370     		procname:	"error_burst",
2371     		data:		&ip_rt_error_burst,
2372     		maxlen:		sizeof(int),
2373     		mode:		0644,
2374     		proc_handler:	&proc_dointvec,
2375     	},
2376     	{
2377     		ctl_name:	NET_IPV4_ROUTE_GC_ELASTICITY,
2378     		procname:	"gc_elasticity",
2379     		data:		&ip_rt_gc_elasticity,
2380     		maxlen:		sizeof(int),
2381     		mode:		0644,
2382     		proc_handler:	&proc_dointvec,
2383     	},
2384     	{
2385     		ctl_name:	NET_IPV4_ROUTE_MTU_EXPIRES,
2386     		procname:	"mtu_expires",
2387     		data:		&ip_rt_mtu_expires,
2388     		maxlen:		sizeof(int),
2389     		mode:		0644,
2390     		proc_handler:	&proc_dointvec_jiffies,
2391     		strategy:	&sysctl_jiffies,
2392     	},
2393     	{
2394     		ctl_name:	NET_IPV4_ROUTE_MIN_PMTU,
2395     		procname:	"min_pmtu",
2396     		data:		&ip_rt_min_pmtu,
2397     		maxlen:		sizeof(int),
2398     		mode:		0644,
2399     		proc_handler:	&proc_dointvec,
2400     	},
2401     	{
2402     		ctl_name:	NET_IPV4_ROUTE_MIN_ADVMSS,
2403     		procname:	"min_adv_mss",
2404     		data:		&ip_rt_min_advmss,
2405     		maxlen:		sizeof(int),
2406     		mode:		0644,
2407     		proc_handler:	&proc_dointvec,
2408     	},
2409     	 { 0 }
2410     };
2411     #endif
2412     
2413     #ifdef CONFIG_NET_CLS_ROUTE
2414     struct ip_rt_acct *ip_rt_acct;
2415     
2416     static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2417     			   int length, int *eof, void *data)
2418     {
2419     	*start = buffer;
2420     
2421     	if ((offset & 3) || (length & 3))
2422     		return -EIO;
2423     
2424     	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2425     		length = sizeof(struct ip_rt_acct) * 256 - offset;
2426     		*eof = 1;
2427     	}
2428     	if (length > 0) {
2429     		u32 *dst = (u32*)buffer;
2430     		u32 *src = (u32*)(((u8*)ip_rt_acct) + offset);
2431     
2432     		memcpy(dst, src, length);
2433     
2434     #ifdef CONFIG_SMP
2435     		if (smp_num_cpus > 1 || cpu_logical_map(0) != 0) {
2436     			int i;
2437     			int cnt = length / 4;
2438     
2439     			for (i = 0; i < smp_num_cpus; i++) {
2440     				int cpu = cpu_logical_map(i);
2441     				int k;
2442     
2443     				if (cpu == 0)
2444     					continue;
2445     
2446     				src = (u32*)(((u8*)ip_rt_acct) + offset +
2447     					cpu * 256 * sizeof(struct ip_rt_acct));
2448     
2449     				for (k = 0; k < cnt; k++)
2450     					dst[k] += src[k];
2451     			}
2452     		}
2453     #endif
2454     		return length;
2455     	}
2456     	return 0;
2457     }
2458     #endif
2459     
2460     void __init ip_rt_init(void)
2461     {
2462     	int i, order, goal;
2463     
2464     #ifdef CONFIG_NET_CLS_ROUTE
2465     	for (order = 0;
2466     	     (PAGE_SIZE << order) < 256 * sizeof(ip_rt_acct) * NR_CPUS; order++)
2467     		/* NOTHING */;
2468     	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2469     	if (!ip_rt_acct)
2470     		panic("IP: failed to allocate ip_rt_acct\n");
2471     	memset(ip_rt_acct, 0, PAGE_SIZE << order);
2472     #endif
2473     
2474     	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2475     						     sizeof(struct rtable),
2476     						     0, SLAB_HWCACHE_ALIGN,
2477     						     NULL, NULL);
2478     
2479     	if (!ipv4_dst_ops.kmem_cachep)
2480     		panic("IP: failed to allocate ip_dst_cache\n");
2481     
2482     	goal = num_physpages >> (26 - PAGE_SHIFT);
2483     
2484     	for (order = 0; (1UL << order) < goal; order++)
2485     		/* NOTHING */;
2486     
2487     	do {
2488     		rt_hash_mask = (1UL << order) * PAGE_SIZE /
2489     			sizeof(struct rt_hash_bucket);
2490     		while (rt_hash_mask & (rt_hash_mask - 1))
2491     			rt_hash_mask--;
2492     		rt_hash_table = (struct rt_hash_bucket *)
2493     			__get_free_pages(GFP_ATOMIC, order);
2494     	} while (rt_hash_table == NULL && --order > 0);
2495     
2496     	if (!rt_hash_table)
2497     		panic("Failed to allocate IP route cache hash table\n");
2498     
2499     	printk("IP: routing cache hash table of %u buckets, %ldKbytes\n",
2500     	       rt_hash_mask,
2501     	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2502     
2503     	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2504     		/* NOTHING */;
2505     
2506     	rt_hash_mask--;
2507     	for (i = 0; i <= rt_hash_mask; i++) {
2508     		rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
2509     		rt_hash_table[i].chain = NULL;
2510     	}
2511     
2512     	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2513     	ip_rt_max_size = (rt_hash_mask + 1) * 16;
2514     
2515     	devinet_init();
2516     	ip_fib_init();
2517     
2518     	rt_flush_timer.function = rt_run_flush;
2519     	rt_periodic_timer.function = rt_check_expire;
2520     
2521     	/* All the timers, started at system startup tend
2522     	   to synchronize. Perturb it a bit.
2523     	 */
2524     	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2525     					ip_rt_gc_interval;
2526     	add_timer(&rt_periodic_timer);
2527     
2528     	proc_net_create ("rt_cache", 0, rt_cache_get_info);
2529     	proc_net_create ("rt_cache_stat", 0, rt_cache_stat_get_info);
2530     #ifdef CONFIG_NET_CLS_ROUTE
2531     	create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
2532     #endif
2533     }
2534