File: /usr/src/linux/net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.99 2001/09/18 22:29:09 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Splitted to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 *
57 * This program is free software; you can redistribute it and/or
58 * modify it under the terms of the GNU General Public License
59 * as published by the Free Software Foundation; either version
60 * 2 of the License, or (at your option) any later version.
61 */
62
63 #include <linux/config.h>
64 #include <asm/uaccess.h>
65 #include <asm/system.h>
66 #include <asm/bitops.h>
67 #include <linux/types.h>
68 #include <linux/kernel.h>
69 #include <linux/sched.h>
70 #include <linux/mm.h>
71 #include <linux/string.h>
72 #include <linux/socket.h>
73 #include <linux/sockios.h>
74 #include <linux/errno.h>
75 #include <linux/in.h>
76 #include <linux/inet.h>
77 #include <linux/netdevice.h>
78 #include <linux/proc_fs.h>
79 #include <linux/init.h>
80 #include <linux/skbuff.h>
81 #include <linux/rtnetlink.h>
82 #include <linux/inetdevice.h>
83 #include <linux/igmp.h>
84 #include <linux/pkt_sched.h>
85 #include <linux/mroute.h>
86 #include <linux/netfilter_ipv4.h>
87 #include <linux/random.h>
88 #include <net/protocol.h>
89 #include <net/ip.h>
90 #include <net/route.h>
91 #include <net/inetpeer.h>
92 #include <net/sock.h>
93 #include <net/ip_fib.h>
94 #include <net/arp.h>
95 #include <net/tcp.h>
96 #include <net/icmp.h>
97 #ifdef CONFIG_SYSCTL
98 #include <linux/sysctl.h>
99 #endif
100
101 #define IP_MAX_MTU 0xFFF0
102
103 #define RT_GC_TIMEOUT (300*HZ)
104
105 int ip_rt_min_delay = 2 * HZ;
106 int ip_rt_max_delay = 10 * HZ;
107 int ip_rt_max_size;
108 int ip_rt_gc_timeout = RT_GC_TIMEOUT;
109 int ip_rt_gc_interval = 60 * HZ;
110 int ip_rt_gc_min_interval = 5 * HZ;
111 int ip_rt_redirect_number = 9;
112 int ip_rt_redirect_load = HZ / 50;
113 int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
114 int ip_rt_error_cost = HZ;
115 int ip_rt_error_burst = 5 * HZ;
116 int ip_rt_gc_elasticity = 8;
117 int ip_rt_mtu_expires = 10 * 60 * HZ;
118 int ip_rt_min_pmtu = 512 + 20 + 20;
119 int ip_rt_min_advmss = 256;
120
121 static unsigned long rt_deadline;
122
123 #define RTprint(a...) printk(KERN_DEBUG a)
124
125 static struct timer_list rt_flush_timer;
126 static struct timer_list rt_periodic_timer;
127
128 /*
129 * Interface to generic destination cache.
130 */
131
132 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
133 static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
134 struct sk_buff *skb);
135 static void ipv4_dst_destroy(struct dst_entry *dst);
136 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
137 static void ipv4_link_failure(struct sk_buff *skb);
138 static int rt_garbage_collect(void);
139
140
141 struct dst_ops ipv4_dst_ops = {
142 family: AF_INET,
143 protocol: __constant_htons(ETH_P_IP),
144 gc: rt_garbage_collect,
145 check: ipv4_dst_check,
146 reroute: ipv4_dst_reroute,
147 destroy: ipv4_dst_destroy,
148 negative_advice: ipv4_negative_advice,
149 link_failure: ipv4_link_failure,
150 entry_size: sizeof(struct rtable),
151 };
152
153 #ifdef CONFIG_INET_ECN
154 #define ECN_OR_COST(class) TC_PRIO_##class
155 #else
156 #define ECN_OR_COST(class) TC_PRIO_FILLER
157 #endif
158
159 __u8 ip_tos2prio[16] = {
160 TC_PRIO_BESTEFFORT,
161 ECN_OR_COST(FILLER),
162 TC_PRIO_BESTEFFORT,
163 ECN_OR_COST(BESTEFFORT),
164 TC_PRIO_BULK,
165 ECN_OR_COST(BULK),
166 TC_PRIO_BULK,
167 ECN_OR_COST(BULK),
168 TC_PRIO_INTERACTIVE,
169 ECN_OR_COST(INTERACTIVE),
170 TC_PRIO_INTERACTIVE,
171 ECN_OR_COST(INTERACTIVE),
172 TC_PRIO_INTERACTIVE_BULK,
173 ECN_OR_COST(INTERACTIVE_BULK),
174 TC_PRIO_INTERACTIVE_BULK,
175 ECN_OR_COST(INTERACTIVE_BULK)
176 };
177
178
179 /*
180 * Route cache.
181 */
182
183 /* The locking scheme is rather straightforward:
184 *
185 * 1) BH-protected rwlocks protect the buckets of the central route hash.
186 * 2) Only writers remove entries, and they hold the lock
187 * while they look at rtable reference counts.
188 * 3) Only readers acquire references to rtable entries;
189 * they do so with atomic increments and with the
190 * lock held.
191 */
192
193 struct rt_hash_bucket {
194 struct rtable *chain;
195 rwlock_t lock;
196 } __attribute__((__aligned__(8)));
197
198 static struct rt_hash_bucket *rt_hash_table;
199 static unsigned rt_hash_mask;
200 static int rt_hash_log;
201
202 struct rt_cache_stat rt_cache_stat[NR_CPUS];
203
204 static int rt_intern_hash(unsigned hash, struct rtable *rth,
205 struct rtable **res);
206
207 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
208 {
209 unsigned hash = ((daddr & 0xF0F0F0F0) >> 4) |
210 ((daddr & 0x0F0F0F0F) << 4);
211 hash ^= saddr ^ tos;
212 hash ^= (hash >> 16);
213 return (hash ^ (hash >> 8)) & rt_hash_mask;
214 }
215
216 static int rt_cache_get_info(char *buffer, char **start, off_t offset,
217 int length)
218 {
219 int len = 0;
220 off_t pos = 128;
221 char temp[129];
222 struct rtable *r;
223 int i;
224
225 if (offset < 128) {
226 sprintf(buffer, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
230 len = 128;
231 }
232
233 for (i = rt_hash_mask; i >= 0; i--) {
234 read_lock_bh(&rt_hash_table[i].lock);
235 for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
236 /*
237 * Spin through entries until we are ready
238 */
239 pos += 128;
240
241 if (pos <= offset) {
242 len = 0;
243 continue;
244 }
245 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
246 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
247 r->u.dst.dev ? r->u.dst.dev->name : "*",
248 (unsigned long)r->rt_dst,
249 (unsigned long)r->rt_gateway,
250 r->rt_flags,
251 atomic_read(&r->u.dst.__refcnt),
252 r->u.dst.__use,
253 0,
254 (unsigned long)r->rt_src,
255 (int)r->u.dst.advmss + 40,
256 r->u.dst.window,
257 (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
258 r->key.tos,
259 r->u.dst.hh ?
260 atomic_read(&r->u.dst.hh->hh_refcnt) :
261 -1,
262 r->u.dst.hh ?
263 (r->u.dst.hh->hh_output ==
264 dev_queue_xmit) : 0,
265 r->rt_spec_dst);
266 sprintf(buffer + len, "%-127s\n", temp);
267 len += 128;
268 if (pos >= offset+length) {
269 read_unlock_bh(&rt_hash_table[i].lock);
270 goto done;
271 }
272 }
273 read_unlock_bh(&rt_hash_table[i].lock);
274 }
275
276 done:
277 *start = buffer + len - (pos - offset);
278 len = pos - offset;
279 if (len > length)
280 len = length;
281 return len;
282 }
283
284 static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
285 {
286 unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
287 int i, lcpu;
288 int len = 0;
289
290 for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
291 i = cpu_logical_map(lcpu);
292
293 len += sprintf(buffer+len, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
294 dst_entries,
295 rt_cache_stat[i].in_hit,
296 rt_cache_stat[i].in_slow_tot,
297 rt_cache_stat[i].in_slow_mc,
298 rt_cache_stat[i].in_no_route,
299 rt_cache_stat[i].in_brd,
300 rt_cache_stat[i].in_martian_dst,
301 rt_cache_stat[i].in_martian_src,
302
303 rt_cache_stat[i].out_hit,
304 rt_cache_stat[i].out_slow_tot,
305 rt_cache_stat[i].out_slow_mc
306 );
307 }
308 len -= offset;
309
310 if (len > length)
311 len = length;
312 if (len < 0)
313 len = 0;
314
315 *start = buffer + offset;
316 return len;
317 }
318
319 static __inline__ void rt_free(struct rtable *rt)
320 {
321 dst_free(&rt->u.dst);
322 }
323
324 static __inline__ void rt_drop(struct rtable *rt)
325 {
326 ip_rt_put(rt);
327 dst_free(&rt->u.dst);
328 }
329
330 static __inline__ int rt_fast_clean(struct rtable *rth)
331 {
332 /* Kill broadcast/multicast entries very aggresively, if they
333 collide in hash table with more useful entries */
334 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
335 rth->key.iif && rth->u.rt_next;
336 }
337
338 static __inline__ int rt_valuable(struct rtable *rth)
339 {
340 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
341 rth->u.dst.expires;
342 }
343
344 static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
345 {
346 int age;
347 int ret = 0;
348
349 if (atomic_read(&rth->u.dst.__refcnt))
350 goto out;
351
352 ret = 1;
353 if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
354 goto out;
355
356 age = jiffies - rth->u.dst.lastuse;
357 ret = 0;
358 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
359 (age <= tmo2 && rt_valuable(rth)))
360 goto out;
361 ret = 1;
362 out: return ret;
363 }
364
365 /* This runs via a timer and thus is always in BH context. */
366 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
367 {
368 static int rover;
369 int i = rover, t;
370 struct rtable *rth, **rthp;
371 unsigned long now = jiffies;
372
373 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
374 t -= ip_rt_gc_timeout) {
375 unsigned tmo = ip_rt_gc_timeout;
376
377 i = (i + 1) & rt_hash_mask;
378 rthp = &rt_hash_table[i].chain;
379
380 write_lock(&rt_hash_table[i].lock);
381 while ((rth = *rthp) != NULL) {
382 if (rth->u.dst.expires) {
383 /* Entry is expired even if it is in use */
384 if ((long)(now - rth->u.dst.expires) <= 0) {
385 tmo >>= 1;
386 rthp = &rth->u.rt_next;
387 continue;
388 }
389 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
390 tmo >>= 1;
391 rthp = &rth->u.rt_next;
392 continue;
393 }
394
395 /* Cleanup aged off entries. */
396 *rthp = rth->u.rt_next;
397 rt_free(rth);
398 }
399 write_unlock(&rt_hash_table[i].lock);
400
401 /* Fallback loop breaker. */
402 if ((jiffies - now) > 0)
403 break;
404 }
405 rover = i;
406 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
407 }
408
409 SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
410
411 /* This can run from both BH and non-BH contexts, the latter
412 * in the case of a forced flush event.
413 */
414 static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
415 {
416 int i;
417 struct rtable *rth, *next;
418
419 rt_deadline = 0;
420
421 for (i = rt_hash_mask; i >= 0; i--) {
422 write_lock_bh(&rt_hash_table[i].lock);
423 rth = rt_hash_table[i].chain;
424 if (rth)
425 rt_hash_table[i].chain = NULL;
426 write_unlock_bh(&rt_hash_table[i].lock);
427
428 for (; rth; rth = next) {
429 next = rth->u.rt_next;
430 rt_free(rth);
431 }
432 }
433 }
434
435 SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);
436
437 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
438
439 void rt_cache_flush(int delay)
440 {
441 unsigned long now = jiffies;
442 int user_mode = !in_softirq();
443
444 if (delay < 0)
445 delay = ip_rt_min_delay;
446
447 spin_lock_bh(&rt_flush_lock);
448
449 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
450 long tmo = (long)(rt_deadline - now);
451
452 /* If flush timer is already running
453 and flush request is not immediate (delay > 0):
454
455 if deadline is not achieved, prolongate timer to "delay",
456 otherwise fire it at deadline time.
457 */
458
459 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
460 tmo = 0;
461
462 if (delay > tmo)
463 delay = tmo;
464 }
465
466 if (delay <= 0) {
467 spin_unlock_bh(&rt_flush_lock);
468 SMP_TIMER_NAME(rt_run_flush)(0);
469 return;
470 }
471
472 if (rt_deadline == 0)
473 rt_deadline = now + ip_rt_max_delay;
474
475 mod_timer(&rt_flush_timer, now+delay);
476 spin_unlock_bh(&rt_flush_lock);
477 }
478
479 /*
480 Short description of GC goals.
481
482 We want to build an algorithm which keeps the routing cache
483 at some equilibrium point, where the number of aged-off entries
484 is kept approximately equal to the number of newly generated ones.
485
486 The current expiration strength is the variable "expire".
487 We try to adjust it dynamically, so that if networking
488 is idle, "expire" is large enough to keep enough warm entries,
489 and when load increases, it is reduced to limit the cache size.
490 */
491
492 static int rt_garbage_collect(void)
493 {
494 static unsigned expire = RT_GC_TIMEOUT;
495 static unsigned long last_gc;
496 static int rover;
497 static int equilibrium;
498 struct rtable *rth, **rthp;
499 unsigned long now = jiffies;
500 int goal;
501
502 /*
503 * Garbage collection is pretty expensive,
504 * do not make it too frequently.
505 */
506 if (now - last_gc < ip_rt_gc_min_interval &&
507 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
508 goto out;
509
510 /* Calculate number of entries, which we want to expire now. */
511 goal = atomic_read(&ipv4_dst_ops.entries) -
512 (ip_rt_gc_elasticity << rt_hash_log);
513 if (goal <= 0) {
514 if (equilibrium < ipv4_dst_ops.gc_thresh)
515 equilibrium = ipv4_dst_ops.gc_thresh;
516 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
517 if (goal > 0) {
518 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
519 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
520 }
521 } else {
522 /* We are in dangerous area. Try to reduce cache really
523 * aggressively.
524 */
525 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
526 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
527 }
528
529 if (now - last_gc >= ip_rt_gc_min_interval)
530 last_gc = now;
531
532 if (goal <= 0) {
533 equilibrium += goal;
534 goto work_done;
535 }
536
537 do {
538 int i, k;
539
540 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
541 unsigned tmo = expire;
542
543 k = (k + 1) & rt_hash_mask;
544 rthp = &rt_hash_table[k].chain;
545 write_lock_bh(&rt_hash_table[k].lock);
546 while ((rth = *rthp) != NULL) {
547 if (!rt_may_expire(rth, tmo, expire)) {
548 tmo >>= 1;
549 rthp = &rth->u.rt_next;
550 continue;
551 }
552 *rthp = rth->u.rt_next;
553 rt_free(rth);
554 goal--;
555 }
556 write_unlock_bh(&rt_hash_table[k].lock);
557 if (goal <= 0)
558 break;
559 }
560 rover = k;
561
562 if (goal <= 0)
563 goto work_done;
564
565 /* Goal is not achieved. We stop process if:
566
567 - if expire reduced to zero. Otherwise, expire is halfed.
568 - if table is not full.
569 - if we are called from interrupt.
570 - jiffies check is just fallback/debug loop breaker.
571 We will not spin here for long time in any case.
572 */
573
574 if (expire == 0)
575 break;
576
577 expire >>= 1;
578 #if RT_CACHE_DEBUG >= 2
579 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
580 atomic_read(&ipv4_dst_ops.entries), goal, i);
581 #endif
582
583 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
584 goto out;
585 } while (!in_softirq() && jiffies - now < 1);
586
587 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
588 goto out;
589 if (net_ratelimit())
590 printk("dst cache overflow\n");
591 return 1;
592
593 work_done:
594 expire += ip_rt_gc_min_interval;
595 if (expire > ip_rt_gc_timeout ||
596 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
597 expire = ip_rt_gc_timeout;
598 #if RT_CACHE_DEBUG >= 2
599 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
600 atomic_read(&ipv4_dst_ops.entries), goal, rover);
601 #endif
602 out: return 0;
603 }
604
605 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
606 {
607 struct rtable *rth, **rthp;
608 unsigned long now = jiffies;
609 int attempts = !in_softirq();
610
611 restart:
612 rthp = &rt_hash_table[hash].chain;
613
614 write_lock_bh(&rt_hash_table[hash].lock);
615 while ((rth = *rthp) != NULL) {
616 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
617 /* Put it first */
618 *rthp = rth->u.rt_next;
619 rth->u.rt_next = rt_hash_table[hash].chain;
620 rt_hash_table[hash].chain = rth;
621
622 rth->u.dst.__use++;
623 dst_hold(&rth->u.dst);
624 rth->u.dst.lastuse = now;
625 write_unlock_bh(&rt_hash_table[hash].lock);
626
627 rt_drop(rt);
628 *rp = rth;
629 return 0;
630 }
631
632 rthp = &rth->u.rt_next;
633 }
634
635 /* Try to bind route to arp only if it is output
636 route or unicast forwarding path.
637 */
638 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
639 int err = arp_bind_neighbour(&rt->u.dst);
640 if (err) {
641 write_unlock_bh(&rt_hash_table[hash].lock);
642
643 if (err != -ENOBUFS) {
644 rt_drop(rt);
645 return err;
646 }
647
648 /* Neighbour tables are full and nothing
649 can be released. Try to shrink route cache,
650 it is most likely it holds some neighbour records.
651 */
652 if (attempts-- > 0) {
653 int saved_elasticity = ip_rt_gc_elasticity;
654 int saved_int = ip_rt_gc_min_interval;
655 ip_rt_gc_elasticity = 1;
656 ip_rt_gc_min_interval = 0;
657 rt_garbage_collect();
658 ip_rt_gc_min_interval = saved_int;
659 ip_rt_gc_elasticity = saved_elasticity;
660 goto restart;
661 }
662
663 if (net_ratelimit())
664 printk("Neighbour table overflow.\n");
665 rt_drop(rt);
666 return -ENOBUFS;
667 }
668 }
669
670 rt->u.rt_next = rt_hash_table[hash].chain;
671 #if RT_CACHE_DEBUG >= 2
672 if (rt->u.rt_next) {
673 struct rtable *trt;
674 printk("rt_cache @%02x: %u.%u.%u.%u", hash,
675 NIPQUAD(rt->rt_dst));
676 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
677 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
678 printk("\n");
679 }
680 #endif
681 rt_hash_table[hash].chain = rt;
682 write_unlock_bh(&rt_hash_table[hash].lock);
683 *rp = rt;
684 return 0;
685 }
686
687 void rt_bind_peer(struct rtable *rt, int create)
688 {
689 static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
690 struct inet_peer *peer;
691
692 peer = inet_getpeer(rt->rt_dst, create);
693
694 spin_lock_bh(&rt_peer_lock);
695 if (rt->peer == NULL) {
696 rt->peer = peer;
697 peer = NULL;
698 }
699 spin_unlock_bh(&rt_peer_lock);
700 if (peer)
701 inet_putpeer(peer);
702 }
703
704 /*
705 * Peer allocation may fail only in serious out-of-memory conditions. However
706 * we still can generate some output.
707 * Random ID selection looks a bit dangerous because we have no chances to
708 * select ID being unique in a reasonable period of time.
709 * But broken packet identifier may be better than no packet at all.
710 */
711 static void ip_select_fb_ident(struct iphdr *iph)
712 {
713 static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
714 static u32 ip_fallback_id;
715 u32 salt;
716
717 spin_lock_bh(&ip_fb_id_lock);
718 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
719 iph->id = htons(salt & 0xFFFF);
720 ip_fallback_id = salt;
721 spin_unlock_bh(&ip_fb_id_lock);
722 }
723
724 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
725 {
726 struct rtable *rt = (struct rtable *) dst;
727
728 if (rt) {
729 if (rt->peer == NULL)
730 rt_bind_peer(rt, 1);
731
732 /* If peer is attached to destination, it is never detached,
733 so that we need not to grab a lock to dereference it.
734 */
735 if (rt->peer) {
736 iph->id = htons(inet_getid(rt->peer));
737 return;
738 }
739 } else
740 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
741
742 ip_select_fb_ident(iph);
743 }
744
745 static void rt_del(unsigned hash, struct rtable *rt)
746 {
747 struct rtable **rthp;
748
749 write_lock_bh(&rt_hash_table[hash].lock);
750 ip_rt_put(rt);
751 for (rthp = &rt_hash_table[hash].chain; *rthp;
752 rthp = &(*rthp)->u.rt_next)
753 if (*rthp == rt) {
754 *rthp = rt->u.rt_next;
755 rt_free(rt);
756 break;
757 }
758 write_unlock_bh(&rt_hash_table[hash].lock);
759 }
760
761 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
762 u32 saddr, u8 tos, struct net_device *dev)
763 {
764 int i, k;
765 struct in_device *in_dev = in_dev_get(dev);
766 struct rtable *rth, **rthp;
767 u32 skeys[2] = { saddr, 0 };
768 int ikeys[2] = { dev->ifindex, 0 };
769
770 tos &= IPTOS_RT_MASK;
771
772 if (!in_dev)
773 return;
774
775 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
776 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
777 goto reject_redirect;
778
779 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
780 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
781 goto reject_redirect;
782 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
783 goto reject_redirect;
784 } else {
785 if (inet_addr_type(new_gw) != RTN_UNICAST)
786 goto reject_redirect;
787 }
788
789 for (i = 0; i < 2; i++) {
790 for (k = 0; k < 2; k++) {
791 unsigned hash = rt_hash_code(daddr,
792 skeys[i] ^ (ikeys[k] << 5),
793 tos);
794
795 rthp=&rt_hash_table[hash].chain;
796
797 read_lock(&rt_hash_table[hash].lock);
798 while ((rth = *rthp) != NULL) {
799 struct rtable *rt;
800
801 if (rth->key.dst != daddr ||
802 rth->key.src != skeys[i] ||
803 rth->key.tos != tos ||
804 rth->key.oif != ikeys[k] ||
805 rth->key.iif != 0) {
806 rthp = &rth->u.rt_next;
807 continue;
808 }
809
810 if (rth->rt_dst != daddr ||
811 rth->rt_src != saddr ||
812 rth->u.dst.error ||
813 rth->rt_gateway != old_gw ||
814 rth->u.dst.dev != dev)
815 break;
816
817 dst_clone(&rth->u.dst);
818 read_unlock(&rt_hash_table[hash].lock);
819
820 rt = dst_alloc(&ipv4_dst_ops);
821 if (rt == NULL) {
822 ip_rt_put(rth);
823 in_dev_put(in_dev);
824 return;
825 }
826
827 /* Copy all the information. */
828 *rt = *rth;
829 rt->u.dst.__use = 1;
830 atomic_set(&rt->u.dst.__refcnt, 1);
831 if (rt->u.dst.dev)
832 dev_hold(rt->u.dst.dev);
833 rt->u.dst.lastuse = jiffies;
834 rt->u.dst.neighbour = NULL;
835 rt->u.dst.hh = NULL;
836 rt->u.dst.obsolete = 0;
837
838 rt->rt_flags |= RTCF_REDIRECTED;
839
840 /* Gateway is different ... */
841 rt->rt_gateway = new_gw;
842
843 /* Redirect received -> path was valid */
844 dst_confirm(&rth->u.dst);
845
846 if (rt->peer)
847 atomic_inc(&rt->peer->refcnt);
848
849 if (arp_bind_neighbour(&rt->u.dst) ||
850 !(rt->u.dst.neighbour->nud_state &
851 NUD_VALID)) {
852 if (rt->u.dst.neighbour)
853 neigh_event_send(rt->u.dst.neighbour, NULL);
854 ip_rt_put(rth);
855 rt_drop(rt);
856 goto do_next;
857 }
858
859 rt_del(hash, rth);
860 if (!rt_intern_hash(hash, rt, &rt))
861 ip_rt_put(rt);
862 goto do_next;
863 }
864 read_unlock(&rt_hash_table[hash].lock);
865 do_next:
866 ;
867 }
868 }
869 in_dev_put(in_dev);
870 return;
871
872 reject_redirect:
873 #ifdef CONFIG_IP_ROUTE_VERBOSE
874 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
875 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
876 "%u.%u.%u.%u ignored.\n"
877 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
878 "tos %02x\n",
879 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
880 NIPQUAD(saddr), NIPQUAD(daddr), tos);
881 #endif
882 in_dev_put(in_dev);
883 }
884
885 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
886 {
887 struct rtable *rt = (struct rtable*)dst;
888 struct dst_entry *ret = dst;
889
890 if (rt) {
891 if (dst->obsolete) {
892 ip_rt_put(rt);
893 ret = NULL;
894 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
895 rt->u.dst.expires) {
896 unsigned hash = rt_hash_code(rt->key.dst,
897 rt->key.src ^
898 (rt->key.oif << 5),
899 rt->key.tos);
900 #if RT_CACHE_DEBUG >= 1
901 printk(KERN_DEBUG "ip_rt_advice: redirect to "
902 "%u.%u.%u.%u/%02x dropped\n",
903 NIPQUAD(rt->rt_dst), rt->key.tos);
904 #endif
905 rt_del(hash, rt);
906 ret = NULL;
907 }
908 }
909 return ret;
910 }
911
912 /*
913 * Algorithm:
914 * 1. The first ip_rt_redirect_number redirects are sent
915 * with exponential backoff, then we stop sending them at all,
916 * assuming that the host ignores our redirects.
917 * 2. If we did not see packets requiring redirects
918 * during ip_rt_redirect_silence, we assume that the host
919 * forgot redirected route and start to send redirects again.
920 *
921 * This algorithm is much cheaper and more intelligent than dumb load limiting
922 * in icmp.c.
923 *
924 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
925 * and "frag. need" (breaks PMTU discovery) in icmp.c.
926 */
927
928 void ip_rt_send_redirect(struct sk_buff *skb)
929 {
930 struct rtable *rt = (struct rtable*)skb->dst;
931 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
932
933 if (!in_dev)
934 return;
935
936 if (!IN_DEV_TX_REDIRECTS(in_dev))
937 goto out;
938
939 /* No redirected packets during ip_rt_redirect_silence;
940 * reset the algorithm.
941 */
942 if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
943 rt->u.dst.rate_tokens = 0;
944
945 /* Too many ignored redirects; do not send anything
946 * set u.dst.rate_last to the last seen redirected packet.
947 */
948 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
949 rt->u.dst.rate_last = jiffies;
950 goto out;
951 }
952
953 /* Check for load limit; set rate_last to the latest sent
954 * redirect.
955 */
956 if (jiffies - rt->u.dst.rate_last >
957 (ip_rt_redirect_load << rt->u.dst.rate_tokens)) {
958 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
959 rt->u.dst.rate_last = jiffies;
960 ++rt->u.dst.rate_tokens;
961 #ifdef CONFIG_IP_ROUTE_VERBOSE
962 if (IN_DEV_LOG_MARTIANS(in_dev) &&
963 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
964 net_ratelimit())
965 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
966 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
967 NIPQUAD(rt->rt_src), rt->rt_iif,
968 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
969 #endif
970 }
971 out:
972 in_dev_put(in_dev);
973 }
974
975 static int ip_error(struct sk_buff *skb)
976 {
977 struct rtable *rt = (struct rtable*)skb->dst;
978 unsigned long now;
979 int code;
980
981 switch (rt->u.dst.error) {
982 case EINVAL:
983 default:
984 goto out;
985 case EHOSTUNREACH:
986 code = ICMP_HOST_UNREACH;
987 break;
988 case ENETUNREACH:
989 code = ICMP_NET_UNREACH;
990 break;
991 case EACCES:
992 code = ICMP_PKT_FILTERED;
993 break;
994 }
995
996 now = jiffies;
997 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
998 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
999 rt->u.dst.rate_tokens = ip_rt_error_burst;
1000 rt->u.dst.rate_last = now;
1001 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1002 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1003 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1004 }
1005
1006 out: kfree_skb(skb);
1007 return 0;
1008 }
1009
1010 /*
1011 * The last two values are not from the RFC but
1012 * are needed for AMPRnet AX.25 paths.
1013 */
1014
/* Candidate MTU values in descending order; the table is never written. */
static const unsigned short mtu_plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu does not exceed the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	/* Unsigned index: the loop bound is an unsigned sizeof expression. */
	unsigned int i;

	for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
1027
1028 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1029 {
1030 int i;
1031 unsigned short old_mtu = ntohs(iph->tot_len);
1032 struct rtable *rth;
1033 u32 skeys[2] = { iph->saddr, 0, };
1034 u32 daddr = iph->daddr;
1035 u8 tos = iph->tos & IPTOS_RT_MASK;
1036 unsigned short est_mtu = 0;
1037
1038 if (ipv4_config.no_pmtu_disc)
1039 return 0;
1040
1041 for (i = 0; i < 2; i++) {
1042 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1043
1044 read_lock(&rt_hash_table[hash].lock);
1045 for (rth = rt_hash_table[hash].chain; rth;
1046 rth = rth->u.rt_next) {
1047 if (rth->key.dst == daddr &&
1048 rth->key.src == skeys[i] &&
1049 rth->rt_dst == daddr &&
1050 rth->rt_src == iph->saddr &&
1051 rth->key.tos == tos &&
1052 rth->key.iif == 0 &&
1053 !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
1054 unsigned short mtu = new_mtu;
1055
1056 if (new_mtu < 68 || new_mtu >= old_mtu) {
1057
1058 /* BSD 4.2 compatibility hack :-( */
1059 if (mtu == 0 &&
1060 old_mtu >= rth->u.dst.pmtu &&
1061 old_mtu >= 68 + (iph->ihl << 2))
1062 old_mtu -= iph->ihl << 2;
1063
1064 mtu = guess_mtu(old_mtu);
1065 }
1066 if (mtu <= rth->u.dst.pmtu) {
1067 if (mtu < rth->u.dst.pmtu) {
1068 dst_confirm(&rth->u.dst);
1069 if (mtu < ip_rt_min_pmtu) {
1070 mtu = ip_rt_min_pmtu;
1071 rth->u.dst.mxlock |=
1072 (1 << RTAX_MTU);
1073 }
1074 rth->u.dst.pmtu = mtu;
1075 dst_set_expires(&rth->u.dst,
1076 ip_rt_mtu_expires);
1077 }
1078 est_mtu = mtu;
1079 }
1080 }
1081 }
1082 read_unlock(&rt_hash_table[hash].lock);
1083 }
1084 return est_mtu ? : new_mtu;
1085 }
1086
1087 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
1088 {
1089 if (dst->pmtu > mtu && mtu >= 68 &&
1090 !(dst->mxlock & (1 << RTAX_MTU))) {
1091 if (mtu < ip_rt_min_pmtu) {
1092 mtu = ip_rt_min_pmtu;
1093 dst->mxlock |= (1 << RTAX_MTU);
1094 }
1095 dst->pmtu = mtu;
1096 dst_set_expires(dst, ip_rt_mtu_expires);
1097 }
1098 }
1099
1100 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1101 {
1102 dst_release(dst);
1103 return NULL;
1104 }
1105
1106 static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
1107 struct sk_buff *skb)
1108 {
1109 return NULL;
1110 }
1111
1112 static void ipv4_dst_destroy(struct dst_entry *dst)
1113 {
1114 struct rtable *rt = (struct rtable *) dst;
1115 struct inet_peer *peer = rt->peer;
1116
1117 if (peer) {
1118 rt->peer = NULL;
1119 inet_putpeer(peer);
1120 }
1121 }
1122
1123 static void ipv4_link_failure(struct sk_buff *skb)
1124 {
1125 struct rtable *rt;
1126
1127 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1128
1129 rt = (struct rtable *) skb->dst;
1130 if (rt)
1131 dst_set_expires(&rt->u.dst, 0);
1132 }
1133
1134 static int ip_rt_bug(struct sk_buff *skb)
1135 {
1136 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1137 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1138 skb->dev ? skb->dev->name : "?");
1139 kfree_skb(skb);
1140 return 0;
1141 }
1142
1143 /*
1144 We do not cache the source address of the outgoing interface,
1145 because it is used only by the IP RR, TS and SRR options,
1146 so it is out of the fast path.
1147
1148 BTW remember: "addr" is allowed to be unaligned
1149 in IP options!
1150 */
1151
1152 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1153 {
1154 u32 src;
1155 struct fib_result res;
1156
1157 if (rt->key.iif == 0)
1158 src = rt->rt_src;
1159 else if (fib_lookup(&rt->key, &res) == 0) {
1160 #ifdef CONFIG_IP_ROUTE_NAT
1161 if (res.type == RTN_NAT)
1162 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1163 RT_SCOPE_UNIVERSE);
1164 else
1165 #endif
1166 src = FIB_RES_PREFSRC(res);
1167 fib_res_put(&res);
1168 } else
1169 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1170 RT_SCOPE_UNIVERSE);
1171 memcpy(addr, &src, 4);
1172 }
1173
1174 #ifdef CONFIG_NET_CLS_ROUTE
1175 static void set_class_tag(struct rtable *rt, u32 tag)
1176 {
1177 if (!(rt->u.dst.tclassid & 0xFFFF))
1178 rt->u.dst.tclassid |= tag & 0xFFFF;
1179 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1180 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1181 }
1182 #endif
1183
1184 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1185 {
1186 struct fib_info *fi = res->fi;
1187
1188 if (fi) {
1189 if (FIB_RES_GW(*res) &&
1190 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1191 rt->rt_gateway = FIB_RES_GW(*res);
1192 memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
1193 sizeof(fi->fib_metrics));
1194 if (fi->fib_mtu == 0) {
1195 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1196 if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
1197 rt->rt_gateway != rt->rt_dst &&
1198 rt->u.dst.pmtu > 576)
1199 rt->u.dst.pmtu = 576;
1200 }
1201 #ifdef CONFIG_NET_CLS_ROUTE
1202 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1203 #endif
1204 } else
1205 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1206
1207 if (rt->u.dst.pmtu > IP_MAX_MTU)
1208 rt->u.dst.pmtu = IP_MAX_MTU;
1209 if (rt->u.dst.advmss == 0)
1210 rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1211 ip_rt_min_advmss);
1212 if (rt->u.dst.advmss > 65535 - 40)
1213 rt->u.dst.advmss = 65535 - 40;
1214
1215 #ifdef CONFIG_NET_CLS_ROUTE
1216 #ifdef CONFIG_IP_MULTIPLE_TABLES
1217 set_class_tag(rt, fib_rules_tclass(res));
1218 #endif
1219 set_class_tag(rt, itag);
1220 #endif
1221 rt->rt_type = res->type;
1222 }
1223
1224 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1225 u8 tos, struct net_device *dev, int our)
1226 {
1227 unsigned hash;
1228 struct rtable *rth;
1229 u32 spec_dst;
1230 struct in_device *in_dev = in_dev_get(dev);
1231 u32 itag = 0;
1232
1233 /* Primary sanity checks. */
1234
1235 if (in_dev == NULL)
1236 return -EINVAL;
1237
1238 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1239 skb->protocol != __constant_htons(ETH_P_IP))
1240 goto e_inval;
1241
1242 if (ZERONET(saddr)) {
1243 if (!LOCAL_MCAST(daddr))
1244 goto e_inval;
1245 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1246 } else if (fib_validate_source(saddr, 0, tos, 0,
1247 dev, &spec_dst, &itag) < 0)
1248 goto e_inval;
1249
1250 rth = dst_alloc(&ipv4_dst_ops);
1251 if (!rth)
1252 goto e_nobufs;
1253
1254 rth->u.dst.output= ip_rt_bug;
1255
1256 atomic_set(&rth->u.dst.__refcnt, 1);
1257 rth->u.dst.flags= DST_HOST;
1258 rth->key.dst = daddr;
1259 rth->rt_dst = daddr;
1260 rth->key.tos = tos;
1261 #ifdef CONFIG_IP_ROUTE_FWMARK
1262 rth->key.fwmark = skb->nfmark;
1263 #endif
1264 rth->key.src = saddr;
1265 rth->rt_src = saddr;
1266 #ifdef CONFIG_IP_ROUTE_NAT
1267 rth->rt_dst_map = daddr;
1268 rth->rt_src_map = saddr;
1269 #endif
1270 #ifdef CONFIG_NET_CLS_ROUTE
1271 rth->u.dst.tclassid = itag;
1272 #endif
1273 rth->rt_iif =
1274 rth->key.iif = dev->ifindex;
1275 rth->u.dst.dev = &loopback_dev;
1276 dev_hold(rth->u.dst.dev);
1277 rth->key.oif = 0;
1278 rth->rt_gateway = daddr;
1279 rth->rt_spec_dst= spec_dst;
1280 rth->rt_type = RTN_MULTICAST;
1281 rth->rt_flags = RTCF_MULTICAST;
1282 if (our) {
1283 rth->u.dst.input= ip_local_deliver;
1284 rth->rt_flags |= RTCF_LOCAL;
1285 }
1286
1287 #ifdef CONFIG_IP_MROUTE
1288 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1289 rth->u.dst.input = ip_mr_input;
1290 #endif
1291 rt_cache_stat[smp_processor_id()].in_slow_mc++;
1292
1293 in_dev_put(in_dev);
1294 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1295 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1296
1297 e_nobufs:
1298 in_dev_put(in_dev);
1299 return -ENOBUFS;
1300
1301 e_inval:
1302 in_dev_put(in_dev);
1303 return -EINVAL;
1304 }
1305
1306 /*
1307 * NOTE. We drop all packets that have a local source
1308 * address, because every properly looped back packet
1309 * must have the correct destination already attached by the output routine.
1310 *
1311 * This approach solves two big problems:
1312 * 1. Non-simplex devices are handled properly.
1313 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1314 */
1315
/*
 * ip_route_input_slow - resolve an input route that was not found in the
 * route cache (ip_route_input() calls this after its hash-chain lookup
 * misses and the destination is not multicast).
 *
 * skb:   received packet; a successfully built route is attached to
 *        skb->dst by rt_intern_hash()
 * daddr: destination address, saddr: source address
 * tos:   type of service
 * dev:   receiving device
 *
 * Three kinds of cache entries can be produced: a forwarding entry
 * (input = ip_forward), a local/broadcast entry bound to loopback_dev
 * (input = ip_local_deliver), or an unreachable entry (input = ip_error).
 * Martian sources/destinations are counted, optionally logged, and are
 * not cached.
 *
 * Returns the value of rt_intern_hash() when an entry was built,
 * -EINVAL for rejected packets (and when IP is disabled on dev),
 * -ENOBUFS when dst_alloc() fails.
 */
1316 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1317 u8 tos, struct net_device *dev)
1318 {
1319 struct rt_key key;
1320 struct fib_result res;
/* Reference taken here is dropped at the "done" label. */
1321 struct in_device *in_dev = in_dev_get(dev);
1322 struct in_device *out_dev = NULL;
1323 unsigned flags = 0;
1324 u32 itag = 0;
1325 struct rtable * rth;
1326 unsigned hash;
1327 u32 spec_dst;
1328 int err = -EINVAL;
/* Set to 1 while "res" holds a successful fib_lookup() result that must be released. */
1329 int free_res = 0;
1330
1331 /* IP on this device is disabled. */
1332
1333 if (!in_dev)
1334 goto out;
1335
1336 key.dst = daddr;
1337 key.src = saddr;
1338 key.tos = tos;
1339 #ifdef CONFIG_IP_ROUTE_FWMARK
1340 key.fwmark = skb->nfmark;
1341 #endif
1342 key.iif = dev->ifindex;
1343 key.oif = 0;
1344 key.scope = RT_SCOPE_UNIVERSE;
1345
/* Same hash formula as the cache lookup in ip_route_input(). */
1346 hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
1347
1348 /* Check for the most weird martians, which can be not detected
1349 by fib_lookup.
1350 */
1351
1352 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1353 goto martian_source;
1354
/* Limited broadcast, or both addresses zero: handled as broadcast input without a FIB lookup. */
1355 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1356 goto brd_input;
1357
1358 /* Accept zero addresses only to limited broadcast;
1359 * I even do not know to fix it or not. Waiting for complains :-)
1360 */
1361 if (ZERONET(saddr))
1362 goto martian_source;
1363
1364 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1365 goto martian_destination;
1366
1367 /*
1368 * Now we are ready to route packet.
1369 */
/*
 * On lookup failure "err" keeps the fib_lookup() result; the no_route
 * path stores its negation in the unreachable entry's dst.error.
 */
1370 if ((err = fib_lookup(&key, &res)) != 0) {
1371 if (!IN_DEV_FORWARD(in_dev))
1372 goto e_inval;
1373 goto no_route;
1374 }
1375 free_res = 1;
1376
1377 rt_cache_stat[smp_processor_id()].in_slow_tot++;
1378
1379 #ifdef CONFIG_IP_ROUTE_NAT
1380 /* Policy is applied before mapping destination,
1381 but rerouting after map should be made with old source.
1382 */
1383
1384 if (1) {
1385 u32 src_map = saddr;
1386 if (res.r)
1387 src_map = fib_rules_policy(saddr, &res, &flags);
1388
1389 if (res.type == RTN_NAT) {
1390 key.dst = fib_rules_map_destination(daddr, &res);
/* Drop the first result before looking up the mapped destination. */
1391 fib_res_put(&res);
1392 free_res = 0;
1393 if (fib_lookup(&key, &res))
1394 goto e_inval;
1395 free_res = 1;
1396 if (res.type != RTN_UNICAST)
1397 goto e_inval;
1398 flags |= RTCF_DNAT;
1399 }
1400 key.src = src_map;
1401 }
1402 #endif
1403
1404 if (res.type == RTN_BROADCAST)
1405 goto brd_input;
1406
1407 if (res.type == RTN_LOCAL) {
1408 int result;
/* Validate the source with the loopback ifindex passed as the interface argument. */
1409 result = fib_validate_source(saddr, daddr, tos,
1410 loopback_dev.ifindex,
1411 dev, &spec_dst, &itag);
1412 if (result < 0)
1413 goto martian_source;
1414 if (result)
1415 flags |= RTCF_DIRECTSRC;
/* For local delivery spec_dst is overwritten with the destination address. */
1416 spec_dst = daddr;
1417 goto local_input;
1418 }
1419
/* From here on the packet would have to be forwarded. */
1420 if (!IN_DEV_FORWARD(in_dev))
1421 goto e_inval;
1422 if (res.type != RTN_UNICAST)
1423 goto martian_destination;
1424
1425 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1426 if (res.fi->fib_nhs > 1 && key.oif == 0)
1427 fib_select_multipath(&key, &res);
1428 #endif
/* Reference on the output in_device is dropped at "done". */
1429 out_dev = in_dev_get(FIB_RES_DEV(res));
1430 if (out_dev == NULL) {
1431 if (net_ratelimit())
1432 printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1433 "Please, report\n");
1434 goto e_inval;
1435 }
1436
1437 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1438 &spec_dst, &itag);
1439 if (err < 0)
1440 goto martian_source;
1441
1442 if (err)
1443 flags |= RTCF_DIRECTSRC;
1444
/*
 * Packet would leave through the interface it arrived on, from a
 * directly validated source, with no NAT/masquerading flag set:
 * mark the route RTCF_DOREDIRECT.
 */
1445 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1446 (IN_DEV_SHARED_MEDIA(out_dev) ||
1447 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1448 flags |= RTCF_DOREDIRECT;
1449
1450 if (skb->protocol != __constant_htons(ETH_P_IP)) {
1451 /* Not IP (i.e. ARP). Do not create route, if it is
1452 * invalid for proxy arp. DNAT routes are always valid.
1453 */
1454 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1455 goto e_inval;
1456 }
1457
/* Build the forwarding cache entry. */
1458 rth = dst_alloc(&ipv4_dst_ops);
1459 if (!rth)
1460 goto e_nobufs;
1461
1462 atomic_set(&rth->u.dst.__refcnt, 1);
1463 rth->u.dst.flags= DST_HOST;
1464 rth->key.dst = daddr;
1465 rth->rt_dst = daddr;
1466 rth->key.tos = tos;
1467 #ifdef CONFIG_IP_ROUTE_FWMARK
1468 rth->key.fwmark = skb->nfmark;
1469 #endif
1470 rth->key.src = saddr;
1471 rth->rt_src = saddr;
1472 rth->rt_gateway = daddr;
1473 #ifdef CONFIG_IP_ROUTE_NAT
1474 rth->rt_src_map = key.src;
1475 rth->rt_dst_map = key.dst;
1476 if (flags&RTCF_DNAT)
1477 rth->rt_gateway = key.dst;
1478 #endif
1479 rth->rt_iif =
1480 rth->key.iif = dev->ifindex;
1481 rth->u.dst.dev = out_dev->dev;
/* Device reference held on behalf of the new route entry. */
1482 dev_hold(rth->u.dst.dev);
1483 rth->key.oif = 0;
1484 rth->rt_spec_dst= spec_dst;
1485
1486 rth->u.dst.input = ip_forward;
1487 rth->u.dst.output = ip_output;
1488
1489 rt_set_nexthop(rth, &res, itag);
1490
1491 rth->rt_flags = flags;
1492
1493 #ifdef CONFIG_NET_FASTROUTE
1494 if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1495 struct net_device *odev = rth->u.dst.dev;
1496 if (odev != dev &&
1497 dev->accept_fastpath &&
1498 odev->mtu >= dev->mtu &&
1499 dev->accept_fastpath(dev, &rth->u.dst) == 0)
1500 rth->rt_flags |= RTCF_FAST;
1501 }
1502 #endif
1503
/* Common tail: insert the entry into the cache, then release everything taken above. */
1504 intern:
1505 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1506 done:
1507 in_dev_put(in_dev);
1508 if (out_dev)
1509 in_dev_put(out_dev);
1510 if (free_res)
1511 fib_res_put(&res);
1512 out: return err;
1513
/* Broadcast input; only accepted for IP frames. */
1514 brd_input:
1515 if (skb->protocol != __constant_htons(ETH_P_IP))
1516 goto e_inval;
1517
1518 if (ZERONET(saddr))
1519 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1520 else {
1521 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1522 &itag);
1523 if (err < 0)
1524 goto martian_source;
1525 if (err)
1526 flags |= RTCF_DIRECTSRC;
1527 }
1528 flags |= RTCF_BROADCAST;
1529 res.type = RTN_BROADCAST;
1530 rt_cache_stat[smp_processor_id()].in_brd++;
1531
/*
 * Shared by the local, broadcast and no_route paths: the entry is bound
 * to loopback_dev and its output handler is ip_rt_bug.
 */
1532 local_input:
1533 rth = dst_alloc(&ipv4_dst_ops);
1534 if (!rth)
1535 goto e_nobufs;
1536
1537 rth->u.dst.output= ip_rt_bug;
1538
1539 atomic_set(&rth->u.dst.__refcnt, 1);
1540 rth->u.dst.flags= DST_HOST;
1541 rth->key.dst = daddr;
1542 rth->rt_dst = daddr;
1543 rth->key.tos = tos;
1544 #ifdef CONFIG_IP_ROUTE_FWMARK
1545 rth->key.fwmark = skb->nfmark;
1546 #endif
1547 rth->key.src = saddr;
1548 rth->rt_src = saddr;
1549 #ifdef CONFIG_IP_ROUTE_NAT
1550 rth->rt_dst_map = key.dst;
1551 rth->rt_src_map = key.src;
1552 #endif
1553 #ifdef CONFIG_NET_CLS_ROUTE
1554 rth->u.dst.tclassid = itag;
1555 #endif
1556 rth->rt_iif =
1557 rth->key.iif = dev->ifindex;
1558 rth->u.dst.dev = &loopback_dev;
1559 dev_hold(rth->u.dst.dev);
1560 rth->key.oif = 0;
1561 rth->rt_gateway = daddr;
1562 rth->rt_spec_dst= spec_dst;
1563 rth->u.dst.input= ip_local_deliver;
1564 rth->rt_flags = flags|RTCF_LOCAL;
/* Reached via no_route: turn the entry into an error route carrying -err. */
1565 if (res.type == RTN_UNREACHABLE) {
1566 rth->u.dst.input= ip_error;
1567 rth->u.dst.error= -err;
1568 rth->rt_flags &= ~RTCF_LOCAL;
1569 }
1570 rth->rt_type = res.type;
1571 goto intern;
1572
/* FIB lookup failed but forwarding is enabled: cache an unreachable entry. */
1573 no_route:
1574 rt_cache_stat[smp_processor_id()].in_no_route++;
1575 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1576 res.type = RTN_UNREACHABLE;
1577 goto local_input;
1578
1579 /*
1580 * Do not cache martian addresses: they should be logged (RFC1812)
1581 */
1582 martian_destination:
1583 rt_cache_stat[smp_processor_id()].in_martian_dst++;
1584 #ifdef CONFIG_IP_ROUTE_VERBOSE
1585 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1586 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1587 "%u.%u.%u.%u, dev %s\n",
1588 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1589 #endif
/* Error exits still pass through "done" so that references are released. */
1590 e_inval:
1591 err = -EINVAL;
1592 goto done;
1593
1594 e_nobufs:
1595 err = -ENOBUFS;
1596 goto done;
1597
1598 martian_source:
1599
1600 rt_cache_stat[smp_processor_id()].in_martian_src++;
1601 #ifdef CONFIG_IP_ROUTE_VERBOSE
1602 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1603 /*
1604 * RFC1812 recommendation, if source is martian,
1605 * the only hint is MAC header.
1606 */
/* Note the argument order: the first address printed is daddr, the second saddr. */
1607 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1608 "%u.%u.%u.%u, on dev %s\n",
1609 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1610 if (dev->hard_header_len) {
1611 int i;
1612 unsigned char *p = skb->mac.raw;
/* Dump hard_header_len bytes starting at skb->mac.raw, colon separated. */
1613 printk(KERN_WARNING "ll header: ");
1614 for (i = 0; i < dev->hard_header_len; i++, p++) {
1615 printk("%02x", *p);
1616 if (i < (dev->hard_header_len - 1))
1617 printk(":");
1618 }
1619 printk("\n");
1620 }
1621 }
1622 #endif
1623 goto e_inval;
1624 }
1625
1626 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1627 u8 tos, struct net_device *dev)
1628 {
1629 struct rtable * rth;
1630 unsigned hash;
1631 int iif = dev->ifindex;
1632
1633 tos &= IPTOS_RT_MASK;
1634 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1635
1636 read_lock(&rt_hash_table[hash].lock);
1637 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1638 if (rth->key.dst == daddr &&
1639 rth->key.src == saddr &&
1640 rth->key.iif == iif &&
1641 rth->key.oif == 0 &&
1642 #ifdef CONFIG_IP_ROUTE_FWMARK
1643 rth->key.fwmark == skb->nfmark &&
1644 #endif
1645 rth->key.tos == tos) {
1646 rth->u.dst.lastuse = jiffies;
1647 dst_hold(&rth->u.dst);
1648 rth->u.dst.__use++;
1649 rt_cache_stat[smp_processor_id()].in_hit++;
1650 read_unlock(&rt_hash_table[hash].lock);
1651 skb->dst = (struct dst_entry*)rth;
1652 return 0;
1653 }
1654 }
1655 read_unlock(&rt_hash_table[hash].lock);
1656
1657 /* Multicast recognition logic is moved from route cache to here.
1658 The problem was that too many Ethernet cards have broken/missing
1659 hardware multicast filters :-( As result the host on multicasting
1660 network acquires a lot of useless route cache entries, sort of
1661 SDR messages from all the world. Now we try to get rid of them.
1662 Really, provided software IP multicast filter is organized
1663 reasonably (at least, hashed), it does not result in a slowdown
1664 comparing with route cache reject entries.
1665 Note, that multicast routers are not affected, because
1666 route cache entry is created eventually.
1667 */
1668 if (MULTICAST(daddr)) {
1669 struct in_device *in_dev;
1670
1671 read_lock(&inetdev_lock);
1672 if ((in_dev = __in_dev_get(dev)) != NULL) {
1673 int our = ip_check_mc(in_dev, daddr);
1674 if (our
1675 #ifdef CONFIG_IP_MROUTE
1676 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1677 #endif
1678 ) {
1679 read_unlock(&inetdev_lock);
1680 return ip_route_input_mc(skb, daddr, saddr,
1681 tos, dev, our);
1682 }
1683 }
1684 read_unlock(&inetdev_lock);
1685 return -EINVAL;
1686 }
1687 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1688 }
1689
1690 /*
1691 * Major route resolver routine.
1692 */
1693
/*
 * ip_route_output_slow - resolve an output route that was not found in
 * the route cache (ip_route_output_key() calls this after its hash-chain
 * lookup misses).
 *
 * rp:     receives the resulting route via rt_intern_hash()
 * oldkey: the caller's lookup key; it is never modified.  A working copy
 *         ("key") is refined during resolution, while the cache entry is
 *         keyed on the original oldkey values.
 *
 * Returns the value of rt_intern_hash() when an entry was built,
 * -EINVAL for an unusable source/destination or a NAT result,
 * -ENODEV when oldkey->oif cannot be used, -ENETUNREACH when the FIB
 * lookup fails and no output interface was requested, -ENOBUFS when
 * dst_alloc() fails.
 */
1694 int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
1695 {
1696 struct rt_key key;
1697 struct fib_result res;
1698 unsigned flags = 0;
1699 struct rtable *rth;
/* Holds a device reference whenever non-NULL; dropped at "done" (or explicitly before an early "goto out"). */
1700 struct net_device *dev_out = NULL;
1701 unsigned hash;
/* Set to 1 once "res" holds a successful fib_lookup() result that must be released. */
1702 int free_res = 0;
1703 int err;
1704 u32 tos;
1705
/* "tos" keeps the RTO_ONLINK bit for the cache key; key.tos carries only the routing TOS bits. */
1706 tos = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
1707 key.dst = oldkey->dst;
1708 key.src = oldkey->src;
1709 key.tos = tos & IPTOS_RT_MASK;
1710 key.iif = loopback_dev.ifindex;
1711 key.oif = oldkey->oif;
1712 #ifdef CONFIG_IP_ROUTE_FWMARK
1713 key.fwmark = oldkey->fwmark;
1714 #endif
1715 key.scope = (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
1716 RT_SCOPE_UNIVERSE;
1717 res.fi = NULL;
1718 #ifdef CONFIG_IP_MULTIPLE_TABLES
1719 res.r = NULL;
1720 #endif
1721
/* Caller supplied a source address: it must be sane and found by ip_dev_find(). */
1722 if (oldkey->src) {
1723 err = -EINVAL;
1724 if (MULTICAST(oldkey->src) ||
1725 BADCLASS(oldkey->src) ||
1726 ZERONET(oldkey->src))
1727 goto out;
1728
1729 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1730 dev_out = ip_dev_find(oldkey->src);
1731 if (dev_out == NULL)
1732 goto out;
1733
1734 /* I removed check for oif == dev_out->oif here.
1735 It was wrong by three reasons:
1736 1. ip_dev_find(saddr) can return wrong iface, if saddr is
1737 assigned to multiple interfaces.
1738 2. Moreover, we are allowed to send packets with saddr
1739 of another iface. --ANK
1740 */
1741
1742 if (oldkey->oif == 0
1743 && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
1744 /* Special hack: user can direct multicasts
1745 and limited broadcast via necessary interface
1746 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1747 This hack is not just for fun, it allows
1748 vic,vat and friends to work.
1749 They bind socket to loopback, set ttl to zero
1750 and expect that it will work.
1751 From the viewpoint of routing cache they are broken,
1752 because we are not allowed to build multicast path
1753 with loopback source addr (look, routing cache
1754 cannot know, that ttl is zero, so that packet
1755 will not leave this host and route is valid).
1756 Luckily, this hack is good workaround.
1757 */
1758
1759 key.oif = dev_out->ifindex;
1760 goto make_route;
1761 }
/* The device was only needed to validate the source; release it. */
1762 if (dev_out)
1763 dev_put(dev_out);
1764 dev_out = NULL;
1765 }
/* Caller requested a specific output interface. */
1766 if (oldkey->oif) {
1767 dev_out = dev_get_by_index(oldkey->oif);
1768 err = -ENODEV;
1769 if (dev_out == NULL)
1770 goto out;
1771 if (__in_dev_get(dev_out) == NULL) {
1772 dev_put(dev_out);
1773 goto out; /* Wrong error code */
1774 }
1775
1776 if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
1777 if (!key.src)
1778 key.src = inet_select_addr(dev_out, 0,
1779 RT_SCOPE_LINK);
1780 goto make_route;
1781 }
1782 if (!key.src) {
1783 if (MULTICAST(oldkey->dst))
1784 key.src = inet_select_addr(dev_out, 0,
1785 key.scope);
1786 else if (!oldkey->dst)
1787 key.src = inet_select_addr(dev_out, 0,
1788 RT_SCOPE_HOST);
1789 }
1790 }
1791
/* No destination: route to the source address (or to loopback if that is unset too) via loopback_dev. */
1792 if (!key.dst) {
1793 key.dst = key.src;
1794 if (!key.dst)
1795 key.dst = key.src = htonl(INADDR_LOOPBACK);
1796 if (dev_out)
1797 dev_put(dev_out);
1798 dev_out = &loopback_dev;
1799 dev_hold(dev_out);
1800 key.oif = loopback_dev.ifindex;
1801 res.type = RTN_LOCAL;
1802 flags |= RTCF_LOCAL;
1803 goto make_route;
1804 }
1805
1806 if (fib_lookup(&key, &res)) {
1807 res.fi = NULL;
1808 if (oldkey->oif) {
1809 /* Apparently, routing tables are wrong. Assume,
1810 that the destination is on link.
1811
1812 WHY? DW.
1813 Because we are allowed to send to iface
1814 even if it has NO routes and NO assigned
1815 addresses. When oif is specified, routing
1816 tables are looked up with only one purpose:
1817 to catch if destination is gatewayed, rather than
1818 direct. Moreover, if MSG_DONTROUTE is set,
1819 we send packet, ignoring both routing tables
1820 and ifaddr state. --ANK
1821
1822
1823 We could make it even if oif is unknown,
1824 likely IPv6, but we do not.
1825 */
1826
1827 if (key.src == 0)
1828 key.src = inet_select_addr(dev_out, 0,
1829 RT_SCOPE_LINK);
1830 res.type = RTN_UNICAST;
1831 goto make_route;
1832 }
1833 if (dev_out)
1834 dev_put(dev_out);
1835 err = -ENETUNREACH;
1836 goto out;
1837 }
1838 free_res = 1;
1839
1840 if (res.type == RTN_NAT)
1841 goto e_inval;
1842
/* Destination is local: switch to loopback_dev and drop the fib_info from the result. */
1843 if (res.type == RTN_LOCAL) {
1844 if (!key.src)
1845 key.src = key.dst;
1846 if (dev_out)
1847 dev_put(dev_out);
1848 dev_out = &loopback_dev;
1849 dev_hold(dev_out);
1850 key.oif = dev_out->ifindex;
1851 if (res.fi)
1852 fib_info_put(res.fi);
1853 res.fi = NULL;
1854 flags |= RTCF_LOCAL;
1855 goto make_route;
1856 }
1857
1858 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1859 if (res.fi->fib_nhs > 1 && key.oif == 0)
1860 fib_select_multipath(&key, &res);
1861 else
1862 #endif
1863 if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
1864 fib_select_default(&key, &res);
1865
1866 if (!key.src)
1867 key.src = FIB_RES_PREFSRC(res);
1868
/* Replace any previously held device with the one from the FIB result. */
1869 if (dev_out)
1870 dev_put(dev_out);
1871 dev_out = FIB_RES_DEV(res);
1872 dev_hold(dev_out);
1873 key.oif = dev_out->ifindex;
1874
/* All paths that build a cache entry converge here with dev_out held. */
1875 make_route:
1876 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1877 goto e_inval;
1878
/* The route type for broadcast/multicast destinations is decided from the address itself. */
1879 if (key.dst == 0xFFFFFFFF)
1880 res.type = RTN_BROADCAST;
1881 else if (MULTICAST(key.dst))
1882 res.type = RTN_MULTICAST;
1883 else if (BADCLASS(key.dst) || ZERONET(key.dst))
1884 goto e_inval;
1885
1886 if (dev_out->flags & IFF_LOOPBACK)
1887 flags |= RTCF_LOCAL;
1888
1889 if (res.type == RTN_BROADCAST) {
1890 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1891 if (res.fi) {
1892 fib_info_put(res.fi);
1893 res.fi = NULL;
1894 }
1895 } else if (res.type == RTN_MULTICAST) {
1896 flags |= RTCF_MULTICAST|RTCF_LOCAL;
/* RTCF_LOCAL is kept only if ip_check_mc() reports the group on the output device. */
1897 read_lock(&inetdev_lock);
1898 if (!__in_dev_get(dev_out) ||
1899 !ip_check_mc(__in_dev_get(dev_out), oldkey->dst))
1900 flags &= ~RTCF_LOCAL;
1901 read_unlock(&inetdev_lock);
1902 /* If multicast route do not exist use
1903 default one, but do not gateway in this case.
1904 Yes, it is hack.
1905 */
1906 if (res.fi && res.prefixlen < 4) {
1907 fib_info_put(res.fi);
1908 res.fi = NULL;
1909 }
1910 }
1911
1912 rth = dst_alloc(&ipv4_dst_ops);
1913 if (!rth)
1914 goto e_nobufs;
1915
1916 atomic_set(&rth->u.dst.__refcnt, 1);
1917 rth->u.dst.flags= DST_HOST;
/* The cache key uses the caller's original values so that ip_route_output_key() can find the entry. */
1918 rth->key.dst = oldkey->dst;
1919 rth->key.tos = tos;
1920 rth->key.src = oldkey->src;
1921 rth->key.iif = 0;
1922 rth->key.oif = oldkey->oif;
1923 #ifdef CONFIG_IP_ROUTE_FWMARK
1924 rth->key.fwmark = oldkey->fwmark;
1925 #endif
/* The resolved addresses come from the refined working key. */
1926 rth->rt_dst = key.dst;
1927 rth->rt_src = key.src;
1928 #ifdef CONFIG_IP_ROUTE_NAT
1929 rth->rt_dst_map = key.dst;
1930 rth->rt_src_map = key.src;
1931 #endif
1932 rth->rt_iif = oldkey->oif ? : dev_out->ifindex;
1933 rth->u.dst.dev = dev_out;
/* Extra reference for the route entry; this function's own reference is dropped at "done". */
1934 dev_hold(dev_out);
1935 rth->rt_gateway = key.dst;
1936 rth->rt_spec_dst= key.src;
1937
1938 rth->u.dst.output=ip_output;
1939
1940 rt_cache_stat[smp_processor_id()].out_slow_tot++;
1941
1942 if (flags & RTCF_LOCAL) {
1943 rth->u.dst.input = ip_local_deliver;
1944 rth->rt_spec_dst = key.dst;
1945 }
1946 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1947 rth->rt_spec_dst = key.src;
1948 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
1949 rth->u.dst.output = ip_mc_output;
1950 rt_cache_stat[smp_processor_id()].out_slow_mc++;
1951 }
1952 #ifdef CONFIG_IP_MROUTE
1953 if (res.type == RTN_MULTICAST) {
1954 struct in_device *in_dev = in_dev_get(dev_out);
1955 if (in_dev) {
1956 if (IN_DEV_MFORWARD(in_dev) &&
1957 !LOCAL_MCAST(oldkey->dst)) {
1958 rth->u.dst.input = ip_mr_input;
1959 rth->u.dst.output = ip_mc_output;
1960 }
1961 in_dev_put(in_dev);
1962 }
1963 }
1964 #endif
1965 }
1966
1967 rt_set_nexthop(rth, &res, 0);
1968
1969 rth->rt_flags = flags;
1970
/* Same hash formula as the cache lookup in ip_route_output_key(). */
1971 hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
1972 err = rt_intern_hash(hash, rth, rp);
/* Common exit: release the FIB result and this function's device reference. */
1973 done:
1974 if (free_res)
1975 fib_res_put(&res);
1976 if (dev_out)
1977 dev_put(dev_out);
1978 out: return err;
1979
1980 e_inval:
1981 err = -EINVAL;
1982 goto done;
1983 e_nobufs:
1984 err = -ENOBUFS;
1985 goto done;
1986 }
1987
1988 int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
1989 {
1990 unsigned hash;
1991 struct rtable *rth;
1992
1993 hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
1994
1995 read_lock_bh(&rt_hash_table[hash].lock);
1996 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1997 if (rth->key.dst == key->dst &&
1998 rth->key.src == key->src &&
1999 rth->key.iif == 0 &&
2000 rth->key.oif == key->oif &&
2001 #ifdef CONFIG_IP_ROUTE_FWMARK
2002 rth->key.fwmark == key->fwmark &&
2003 #endif
2004 !((rth->key.tos ^ key->tos) &
2005 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2006 ((key->tos & RTO_TPROXY) ||
2007 !(rth->rt_flags & RTCF_TPROXY))) {
2008 rth->u.dst.lastuse = jiffies;
2009 dst_hold(&rth->u.dst);
2010 rth->u.dst.__use++;
2011 rt_cache_stat[smp_processor_id()].out_hit++;
2012 read_unlock_bh(&rt_hash_table[hash].lock);
2013 *rp = rth;
2014 return 0;
2015 }
2016 }
2017 read_unlock_bh(&rt_hash_table[hash].lock);
2018
2019 return ip_route_output_slow(rp, key);
2020 }
2021
2022 #ifdef CONFIG_RTNETLINK
/*
 * rt_fill_info - append an rtnetlink message describing the route
 * attached to skb->dst at the current tail of skb.
 *
 * skb:    message buffer; skb->dst must point at the struct rtable to dump
 * pid:    netlink pid placed in the message header
 * seq:    netlink sequence number placed in the message header
 * event:  message type (callers in this file pass RTM_NEWROUTE)
 * nowait: non-zero for dump requests; together with a non-zero pid it
 *         selects NLM_F_MULTI, and it is forwarded to ipmr_get_route()
 *
 * Returns skb->len on success.  On failure the skb is trimmed back to
 * the length it had on entry and -1 is returned.  With CONFIG_IP_MROUTE,
 * 0 is returned when nowait is zero and ipmr_get_route() returned 0.
 *
 * NOTE(review): NLMSG_PUT()/RTA_PUT() presumably branch to the
 * nlmsg_failure/rtattr_failure labels when the skb lacks room; their
 * definitions are outside this chunk -- confirm.
 */
2023 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2024 int nowait)
2025 {
2026 struct rtable *rt = (struct rtable*)skb->dst;
2027 struct rtmsg *r;
2028 struct nlmsghdr *nlh;
/* Remember where this message starts so it can be sized or undone later. */
2029 unsigned char *b = skb->tail;
2030 struct rta_cacheinfo ci;
2031 #ifdef CONFIG_IP_MROUTE
2032 struct rtattr *eptr;
2033 #endif
2034 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2035 r = NLMSG_DATA(nlh);
2036 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2037 r->rtm_family = AF_INET;
2038 r->rtm_dst_len = 32;
2039 r->rtm_src_len = 0;
2040 r->rtm_tos = rt->key.tos;
2041 r->rtm_table = RT_TABLE_MAIN;
2042 r->rtm_type = rt->rt_type;
2043 r->rtm_scope = RT_SCOPE_UNIVERSE;
2044 r->rtm_protocol = RTPROT_UNSPEC;
/* The low 16 bits of rt_flags are masked out of the exported flags. */
2045 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2046 if (rt->rt_flags & RTCF_NOTIFY)
2047 r->rtm_flags |= RTM_F_NOTIFY;
2048 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2049 if (rt->key.src) {
2050 r->rtm_src_len = 32;
2051 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
2052 }
2053 if (rt->u.dst.dev)
2054 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2055 #ifdef CONFIG_NET_CLS_ROUTE
2056 if (rt->u.dst.tclassid)
2057 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2058 #endif
/* Input routes (key.iif set) report rt_spec_dst; output routes report rt_src only when it differs from the key. */
2059 if (rt->key.iif)
2060 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2061 else if (rt->rt_src != rt->key.src)
2062 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2063 if (rt->rt_dst != rt->rt_gateway)
2064 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2065 if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
2066 goto rtattr_failure;
/* Cache statistics; times are expressed relative to the current jiffies. */
2067 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
2068 ci.rta_used = rt->u.dst.__use;
2069 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2070 if (rt->u.dst.expires)
2071 ci.rta_expires = rt->u.dst.expires - jiffies;
2072 else
2073 ci.rta_expires = 0;
2074 ci.rta_error = rt->u.dst.error;
2075 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2076 if (rt->peer) {
2077 ci.rta_id = rt->peer->ip_id_count;
2078 if (rt->peer->tcp_ts_stamp) {
2079 ci.rta_ts = rt->peer->tcp_ts;
2080 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2081 }
2082 }
2083 #ifdef CONFIG_IP_MROUTE
/* Remember where the cacheinfo attribute will land so rta_error can be patched below. */
2084 eptr = (struct rtattr*)skb->tail;
2085 #endif
2086 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2087 if (rt->key.iif) {
2088 #ifdef CONFIG_IP_MROUTE
2089 u32 dst = rt->rt_dst;
2090
2091 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2092 ipv4_devconf.mc_forwarding) {
2093 int err = ipmr_get_route(skb, r, nowait);
2094 if (err <= 0) {
2095 if (!nowait) {
2096 if (err == 0)
2097 return 0;
2098 goto nlmsg_failure;
2099 } else {
2100 if (err == -EMSGSIZE)
2101 goto nlmsg_failure;
/* Other errors during a dump are recorded in the already-emitted cacheinfo attribute. */
2102 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2103 }
2104 }
2105 } else
2106 #endif
2107 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
2108 }
2109
/* Final message length: everything appended since entry. */
2110 nlh->nlmsg_len = skb->tail - b;
2111 return skb->len;
2112
2113 nlmsg_failure:
2114 rtattr_failure:
/* Undo the partially built message. */
2115 skb_trim(skb, b - skb->data);
2116 return -1;
2117 }
2118
2119 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2120 {
2121 struct rtattr **rta = arg;
2122 struct rtmsg *rtm = NLMSG_DATA(nlh);
2123 struct rtable *rt = NULL;
2124 u32 dst = 0;
2125 u32 src = 0;
2126 int iif = 0;
2127 int err = -ENOBUFS;
2128 struct sk_buff *skb;
2129
2130 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2131 if (!skb)
2132 goto out;
2133
2134 /* Reserve room for dummy headers, this skb can pass
2135 through good chunk of routing engine.
2136 */
2137 skb->mac.raw = skb->data;
2138 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2139
2140 if (rta[RTA_SRC - 1])
2141 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2142 if (rta[RTA_DST - 1])
2143 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2144 if (rta[RTA_IIF - 1])
2145 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2146
2147 if (iif) {
2148 struct net_device *dev = __dev_get_by_index(iif);
2149 err = -ENODEV;
2150 if (!dev)
2151 goto out;
2152 skb->protocol = __constant_htons(ETH_P_IP);
2153 skb->dev = dev;
2154 local_bh_disable();
2155 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2156 local_bh_enable();
2157 rt = (struct rtable*)skb->dst;
2158 if (!err && rt->u.dst.error)
2159 err = -rt->u.dst.error;
2160 } else {
2161 int oif = 0;
2162 if (rta[RTA_OIF - 1])
2163 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2164 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
2165 }
2166 if (err) {
2167 kfree_skb(skb);
2168 goto out;
2169 }
2170
2171 skb->dst = &rt->u.dst;
2172 if (rtm->rtm_flags & RTM_F_NOTIFY)
2173 rt->rt_flags |= RTCF_NOTIFY;
2174
2175 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2176
2177 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2178 RTM_NEWROUTE, 0);
2179 if (!err)
2180 goto out;
2181 if (err < 0) {
2182 err = -EMSGSIZE;
2183 goto out;
2184 }
2185
2186 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2187 if (err > 0)
2188 err = 0;
2189 out: return err;
2190 }
2191
2192 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2193 {
2194 struct rtable *rt;
2195 int h, s_h;
2196 int idx, s_idx;
2197
2198 s_h = cb->args[0];
2199 s_idx = idx = cb->args[1];
2200 for (h = 0; h <= rt_hash_mask; h++) {
2201 if (h < s_h) continue;
2202 if (h > s_h)
2203 s_idx = 0;
2204 read_lock_bh(&rt_hash_table[h].lock);
2205 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2206 rt = rt->u.rt_next, idx++) {
2207 if (idx < s_idx)
2208 continue;
2209 skb->dst = dst_clone(&rt->u.dst);
2210 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2211 cb->nlh->nlmsg_seq,
2212 RTM_NEWROUTE, 1) <= 0) {
2213 dst_release(xchg(&skb->dst, NULL));
2214 read_unlock_bh(&rt_hash_table[h].lock);
2215 goto done;
2216 }
2217 dst_release(xchg(&skb->dst, NULL));
2218 }
2219 read_unlock_bh(&rt_hash_table[h].lock);
2220 }
2221
2222 done:
2223 cb->args[0] = h;
2224 cb->args[1] = idx;
2225 return skb->len;
2226 }
2227
2228 #endif /* CONFIG_RTNETLINK */
2229
/*
 * Multicast configuration of @in_dev changed: request a route cache
 * flush with a delay argument of 0.  @in_dev itself is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2234
2235 #ifdef CONFIG_SYSCTL
2236 static int flush_delay;
2237
2238 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2239 struct file *filp, void *buffer,
2240 size_t *lenp)
2241 {
2242 if (write) {
2243 proc_dointvec(ctl, write, filp, buffer, lenp);
2244 rt_cache_flush(flush_delay);
2245 return 0;
2246 }
2247
2248 return -EINVAL;
2249 }
2250
2251 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2252 int nlen, void *oldval,
2253 size_t *oldlenp, void *newval,
2254 size_t newlen, void **context)
2255 {
2256 int delay;
2257 if (newlen != sizeof(int))
2258 return -EINVAL;
2259 if (get_user(delay, (int *)newval))
2260 return -EFAULT;
2261 rt_cache_flush(delay);
2262 return 0;
2263 }
2264
2265 ctl_table ipv4_route_table[] = {
2266 {
2267 ctl_name: NET_IPV4_ROUTE_FLUSH,
2268 procname: "flush",
2269 data: &flush_delay,
2270 maxlen: sizeof(int),
2271 mode: 0644,
2272 proc_handler: &ipv4_sysctl_rtcache_flush,
2273 strategy: &ipv4_sysctl_rtcache_flush_strategy,
2274 },
2275 {
2276 ctl_name: NET_IPV4_ROUTE_MIN_DELAY,
2277 procname: "min_delay",
2278 data: &ip_rt_min_delay,
2279 maxlen: sizeof(int),
2280 mode: 0644,
2281 proc_handler: &proc_dointvec_jiffies,
2282 strategy: &sysctl_jiffies,
2283 },
2284 {
2285 ctl_name: NET_IPV4_ROUTE_MAX_DELAY,
2286 procname: "max_delay",
2287 data: &ip_rt_max_delay,
2288 maxlen: sizeof(int),
2289 mode: 0644,
2290 proc_handler: &proc_dointvec_jiffies,
2291 strategy: &sysctl_jiffies,
2292 },
2293 {
2294 ctl_name: NET_IPV4_ROUTE_GC_THRESH,
2295 procname: "gc_thresh",
2296 data: &ipv4_dst_ops.gc_thresh,
2297 maxlen: sizeof(int),
2298 mode: 0644,
2299 proc_handler: &proc_dointvec,
2300 },
2301 {
2302 ctl_name: NET_IPV4_ROUTE_MAX_SIZE,
2303 procname: "max_size",
2304 data: &ip_rt_max_size,
2305 maxlen: sizeof(int),
2306 mode: 0644,
2307 proc_handler: &proc_dointvec,
2308 },
2309 {
2310 ctl_name: NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2311 procname: "gc_min_interval",
2312 data: &ip_rt_gc_min_interval,
2313 maxlen: sizeof(int),
2314 mode: 0644,
2315 proc_handler: &proc_dointvec_jiffies,
2316 strategy: &sysctl_jiffies,
2317 },
2318 {
2319 ctl_name: NET_IPV4_ROUTE_GC_TIMEOUT,
2320 procname: "gc_timeout",
2321 data: &ip_rt_gc_timeout,
2322 maxlen: sizeof(int),
2323 mode: 0644,
2324 proc_handler: &proc_dointvec_jiffies,
2325 strategy: &sysctl_jiffies,
2326 },
2327 {
2328 ctl_name: NET_IPV4_ROUTE_GC_INTERVAL,
2329 procname: "gc_interval",
2330 data: &ip_rt_gc_interval,
2331 maxlen: sizeof(int),
2332 mode: 0644,
2333 proc_handler: &proc_dointvec_jiffies,
2334 strategy: &sysctl_jiffies,
2335 },
2336 {
2337 ctl_name: NET_IPV4_ROUTE_REDIRECT_LOAD,
2338 procname: "redirect_load",
2339 data: &ip_rt_redirect_load,
2340 maxlen: sizeof(int),
2341 mode: 0644,
2342 proc_handler: &proc_dointvec,
2343 },
2344 {
2345 ctl_name: NET_IPV4_ROUTE_REDIRECT_NUMBER,
2346 procname: "redirect_number",
2347 data: &ip_rt_redirect_number,
2348 maxlen: sizeof(int),
2349 mode: 0644,
2350 proc_handler: &proc_dointvec,
2351 },
2352 {
2353 ctl_name: NET_IPV4_ROUTE_REDIRECT_SILENCE,
2354 procname: "redirect_silence",
2355 data: &ip_rt_redirect_silence,
2356 maxlen: sizeof(int),
2357 mode: 0644,
2358 proc_handler: &proc_dointvec,
2359 },
2360 {
2361 ctl_name: NET_IPV4_ROUTE_ERROR_COST,
2362 procname: "error_cost",
2363 data: &ip_rt_error_cost,
2364 maxlen: sizeof(int),
2365 mode: 0644,
2366 proc_handler: &proc_dointvec,
2367 },
2368 {
2369 ctl_name: NET_IPV4_ROUTE_ERROR_BURST,
2370 procname: "error_burst",
2371 data: &ip_rt_error_burst,
2372 maxlen: sizeof(int),
2373 mode: 0644,
2374 proc_handler: &proc_dointvec,
2375 },
2376 {
2377 ctl_name: NET_IPV4_ROUTE_GC_ELASTICITY,
2378 procname: "gc_elasticity",
2379 data: &ip_rt_gc_elasticity,
2380 maxlen: sizeof(int),
2381 mode: 0644,
2382 proc_handler: &proc_dointvec,
2383 },
2384 {
2385 ctl_name: NET_IPV4_ROUTE_MTU_EXPIRES,
2386 procname: "mtu_expires",
2387 data: &ip_rt_mtu_expires,
2388 maxlen: sizeof(int),
2389 mode: 0644,
2390 proc_handler: &proc_dointvec_jiffies,
2391 strategy: &sysctl_jiffies,
2392 },
2393 {
2394 ctl_name: NET_IPV4_ROUTE_MIN_PMTU,
2395 procname: "min_pmtu",
2396 data: &ip_rt_min_pmtu,
2397 maxlen: sizeof(int),
2398 mode: 0644,
2399 proc_handler: &proc_dointvec,
2400 },
2401 {
2402 ctl_name: NET_IPV4_ROUTE_MIN_ADVMSS,
2403 procname: "min_adv_mss",
2404 data: &ip_rt_min_advmss,
2405 maxlen: sizeof(int),
2406 mode: 0644,
2407 proc_handler: &proc_dointvec,
2408 },
2409 { 0 }
2410 };
2411 #endif
2412
2413 #ifdef CONFIG_NET_CLS_ROUTE
2414 struct ip_rt_acct *ip_rt_acct;
2415
2416 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2417 int length, int *eof, void *data)
2418 {
2419 *start = buffer;
2420
2421 if ((offset & 3) || (length & 3))
2422 return -EIO;
2423
2424 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2425 length = sizeof(struct ip_rt_acct) * 256 - offset;
2426 *eof = 1;
2427 }
2428 if (length > 0) {
2429 u32 *dst = (u32*)buffer;
2430 u32 *src = (u32*)(((u8*)ip_rt_acct) + offset);
2431
2432 memcpy(dst, src, length);
2433
2434 #ifdef CONFIG_SMP
2435 if (smp_num_cpus > 1 || cpu_logical_map(0) != 0) {
2436 int i;
2437 int cnt = length / 4;
2438
2439 for (i = 0; i < smp_num_cpus; i++) {
2440 int cpu = cpu_logical_map(i);
2441 int k;
2442
2443 if (cpu == 0)
2444 continue;
2445
2446 src = (u32*)(((u8*)ip_rt_acct) + offset +
2447 cpu * 256 * sizeof(struct ip_rt_acct));
2448
2449 for (k = 0; k < cnt; k++)
2450 dst[k] += src[k];
2451 }
2452 }
2453 #endif
2454 return length;
2455 }
2456 return 0;
2457 }
2458 #endif
2459
2460 void __init ip_rt_init(void)
2461 {
2462 int i, order, goal;
2463
2464 #ifdef CONFIG_NET_CLS_ROUTE
2465 for (order = 0;
2466 (PAGE_SIZE << order) < 256 * sizeof(ip_rt_acct) * NR_CPUS; order++)
2467 /* NOTHING */;
2468 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2469 if (!ip_rt_acct)
2470 panic("IP: failed to allocate ip_rt_acct\n");
2471 memset(ip_rt_acct, 0, PAGE_SIZE << order);
2472 #endif
2473
2474 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2475 sizeof(struct rtable),
2476 0, SLAB_HWCACHE_ALIGN,
2477 NULL, NULL);
2478
2479 if (!ipv4_dst_ops.kmem_cachep)
2480 panic("IP: failed to allocate ip_dst_cache\n");
2481
2482 goal = num_physpages >> (26 - PAGE_SHIFT);
2483
2484 for (order = 0; (1UL << order) < goal; order++)
2485 /* NOTHING */;
2486
2487 do {
2488 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2489 sizeof(struct rt_hash_bucket);
2490 while (rt_hash_mask & (rt_hash_mask - 1))
2491 rt_hash_mask--;
2492 rt_hash_table = (struct rt_hash_bucket *)
2493 __get_free_pages(GFP_ATOMIC, order);
2494 } while (rt_hash_table == NULL && --order > 0);
2495
2496 if (!rt_hash_table)
2497 panic("Failed to allocate IP route cache hash table\n");
2498
2499 printk("IP: routing cache hash table of %u buckets, %ldKbytes\n",
2500 rt_hash_mask,
2501 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2502
2503 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2504 /* NOTHING */;
2505
2506 rt_hash_mask--;
2507 for (i = 0; i <= rt_hash_mask; i++) {
2508 rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
2509 rt_hash_table[i].chain = NULL;
2510 }
2511
2512 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2513 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2514
2515 devinet_init();
2516 ip_fib_init();
2517
2518 rt_flush_timer.function = rt_run_flush;
2519 rt_periodic_timer.function = rt_check_expire;
2520
2521 /* All the timers, started at system startup tend
2522 to synchronize. Perturb it a bit.
2523 */
2524 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2525 ip_rt_gc_interval;
2526 add_timer(&rt_periodic_timer);
2527
2528 proc_net_create ("rt_cache", 0, rt_cache_get_info);
2529 proc_net_create ("rt_cache_stat", 0, rt_cache_stat_get_info);
2530 #ifdef CONFIG_NET_CLS_ROUTE
2531 create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
2532 #endif
2533 }
2534