File: /usr/src/linux/net/ipv4/netfilter/ip_nat_core.c
1 /* NAT for netfilter; shared with compatibility layer. */
2
3 /* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
4 Public Licence. */
5 #ifdef MODULE
6 #define __NO_VERSION__
7 #endif
8 #include <linux/version.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/timer.h>
12 #include <linux/skbuff.h>
13 #include <linux/netfilter_ipv4.h>
14 #include <linux/brlock.h>
15 #include <linux/vmalloc.h>
16 #include <net/checksum.h>
17 #include <net/icmp.h>
18 #include <net/ip.h>
19 #include <net/tcp.h> /* For tcp_prot in getorigdst */
20
/* Lock assertions, defined ahead of the netfilter_ipv4 includes below
   (presumably for the list helpers in listhelp.h -- confirm).  The
   argument is ignored: every list in this file is covered by
   ip_nat_lock. */
#define ASSERT_READ_LOCK(x)	MUST_BE_READ_LOCKED(&ip_nat_lock)
#define ASSERT_WRITE_LOCK(x)	MUST_BE_WRITE_LOCKED(&ip_nat_lock)
23
24 #include <linux/netfilter_ipv4/ip_nat.h>
25 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
26 #include <linux/netfilter_ipv4/ip_nat_core.h>
27 #include <linux/netfilter_ipv4/ip_nat_helper.h>
28 #include <linux/netfilter_ipv4/listhelp.h>
29
/* Debug tracing: flip the condition to 1 to route DEBUGP() to printk;
   as shipped it expands to nothing. */
#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif
35
/* Reader/writer lock taken around the NAT hash tables, the protocol
   list and each conntrack's manip array throughout this file. */
DECLARE_RWLOCK(ip_nat_lock);

/* Bucket count of each hash table below; assigned in ip_nat_init(). */
static unsigned int ip_nat_htable_size;

/* Per-conntrack hash chains: bysource is indexed by hash_by_src(),
   byipsproto by hash_by_ipsproto().  ip_nat_init() carves both out
   of one vmalloc()ed region, bysource first. */
static struct list_head *bysource;
static struct list_head *byipsproto;

/* Per-protocol NAT handlers; searched by find_nat_proto(). */
LIST_HEAD(protos);
/* NAT helpers; searched by ip_nat_setup_info(). */
LIST_HEAD(helpers);

/* Returned by find_nat_proto() when nothing in protos matches. */
extern struct ip_nat_protocol unknown_nat_protocol;
47
48 /* We keep extra hashes for each conntrack, for fast searching. */
49 static inline size_t
50 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
51 {
52 /* Modified src and dst, to ensure we don't create two
53 identical streams. */
54 return (src + dst + proto) % ip_nat_htable_size;
55 }
56
57 static inline size_t
58 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
59 {
60 /* Original src, to ensure we map it consistently if poss. */
61 return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
62 }
63
64 /* Noone using conntrack by the time this called. */
65 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
66 {
67 struct ip_nat_info *info = &conn->nat.info;
68
69 if (!info->initialized)
70 return;
71
72 IP_NF_ASSERT(info->bysource.conntrack);
73 IP_NF_ASSERT(info->byipsproto.conntrack);
74
75 WRITE_LOCK(&ip_nat_lock);
76 LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
77 .tuple.src,
78 conn->tuplehash[IP_CT_DIR_ORIGINAL]
79 .tuple.dst.protonum)],
80 &info->bysource);
81
82 LIST_DELETE(&byipsproto
83 [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
84 .tuple.src.ip,
85 conn->tuplehash[IP_CT_DIR_REPLY]
86 .tuple.dst.ip,
87 conn->tuplehash[IP_CT_DIR_REPLY]
88 .tuple.dst.protonum)],
89 &info->byipsproto);
90 WRITE_UNLOCK(&ip_nat_lock);
91 }
92
/* Incrementally update a checksum after one 32-bit word changed.
   oldvalinv is the bitwise inverse of the old word, newval the
   replacement, oldcheck the checksum being adjusted.  Because we only
   mangle the existing checksum, one that was wrong before stays
   wrong; this also works for incomplete packets (eg. ICMP dest
   unreachables). */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
	u_int32_t diffs[2];

	diffs[0] = oldvalinv;
	diffs[1] = newval;

	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
				      oldcheck ^ 0xFFFF));
}
103
104 static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
105 {
106 return i->protonum == proto;
107 }
108
109 struct ip_nat_protocol *
110 find_nat_proto(u_int16_t protonum)
111 {
112 struct ip_nat_protocol *i;
113
114 MUST_BE_READ_LOCKED(&ip_nat_lock);
115 i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
116 if (!i)
117 i = &unknown_nat_protocol;
118 return i;
119 }
120
121 /* Is this tuple already taken? (not by us) */
122 int
123 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
124 const struct ip_conntrack *ignored_conntrack)
125 {
126 /* Conntrack tracking doesn't keep track of outgoing tuples; only
127 incoming ones. NAT means they don't have a fixed mapping,
128 so we invert the tuple and look for the incoming reply.
129
130 We could keep a separate hash if this proves too slow. */
131 struct ip_conntrack_tuple reply;
132
133 invert_tuplepr(&reply, tuple);
134 return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
135 }
136
137 /* Does tuple + the source manip come within the range mr */
138 static int
139 in_range(const struct ip_conntrack_tuple *tuple,
140 const struct ip_conntrack_manip *manip,
141 const struct ip_nat_multi_range *mr)
142 {
143 struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
144 unsigned int i;
145 struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
146
147 for (i = 0; i < mr->rangesize; i++) {
148 /* If we are allowed to map IPs, then we must be in the
149 range specified, otherwise we must be unchanged. */
150 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
151 if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
152 || (ntohl(newtuple.src.ip)
153 > ntohl(mr->range[i].max_ip)))
154 continue;
155 } else {
156 if (newtuple.src.ip != tuple->src.ip)
157 continue;
158 }
159
160 if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
161 && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
162 &mr->range[i].min, &mr->range[i].max))
163 return 1;
164 }
165 return 0;
166 }
167
168 static inline int
169 src_cmp(const struct ip_nat_hash *i,
170 const struct ip_conntrack_tuple *tuple,
171 const struct ip_nat_multi_range *mr)
172 {
173 return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
174 == tuple->dst.protonum
175 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
176 == tuple->src.ip
177 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
178 == tuple->src.u.all
179 && in_range(tuple,
180 &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
181 .tuple.src,
182 mr));
183 }
184
185 /* Only called for SRC manip */
186 static struct ip_conntrack_manip *
187 find_appropriate_src(const struct ip_conntrack_tuple *tuple,
188 const struct ip_nat_multi_range *mr)
189 {
190 unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
191 struct ip_nat_hash *i;
192
193 MUST_BE_READ_LOCKED(&ip_nat_lock);
194 i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
195 if (i)
196 return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
197 else
198 return NULL;
199 }
200
201 /* If it's really a local destination manip, it may need to do a
202 source manip too. */
203 static int
204 do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
205 {
206 struct rtable *rt;
207
208 /* FIXME: IPTOS_TOS(iph->tos) --RR */
209 if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
210 DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
211 NIPQUAD(var_ip));
212 return 0;
213 }
214
215 *other_ipp = rt->rt_src;
216 ip_rt_put(rt);
217 return 1;
218 }
219
220 /* Simple way to iterate through all. */
221 static inline int fake_cmp(const struct ip_nat_hash *i,
222 u_int32_t src, u_int32_t dst, u_int16_t protonum,
223 unsigned int *score,
224 const struct ip_conntrack *conntrack)
225 {
226 /* Compare backwards: we're dealing with OUTGOING tuples, and
227 inside the conntrack is the REPLY tuple. Don't count this
228 conntrack. */
229 if (i->conntrack != conntrack
230 && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
231 && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
232 && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
233 == protonum))
234 (*score)++;
235 return 0;
236 }
237
238 static inline unsigned int
239 count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
240 const struct ip_conntrack *conntrack)
241 {
242 unsigned int score = 0;
243
244 MUST_BE_READ_LOCKED(&ip_nat_lock);
245 LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
246 fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
247 conntrack);
248
249 return score;
250 }
251
252 /* For [FUTURE] fragmentation handling, we want the least-used
253 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
254 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
255 1-65535, we don't do pro-rata allocation based on ports; we choose
256 the ip with the lowest src-ip/dst-ip/proto usage.
257
258 If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
259 range), we eliminate that and try again. This is not the most
260 efficient approach, but if you're worried about that, don't hand us
261 ranges you don't really have. */
262 static struct ip_nat_range *
263 find_best_ips_proto(struct ip_conntrack_tuple *tuple,
264 const struct ip_nat_multi_range *mr,
265 const struct ip_conntrack *conntrack,
266 unsigned int hooknum)
267 {
268 unsigned int i;
269 struct {
270 const struct ip_nat_range *range;
271 unsigned int score;
272 struct ip_conntrack_tuple tuple;
273 } best = { NULL, 0xFFFFFFFF };
274 u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
275 static unsigned int randomness = 0;
276
277 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
278 var_ipp = &tuple->src.ip;
279 saved_ip = tuple->dst.ip;
280 other_ipp = &tuple->dst.ip;
281 } else {
282 var_ipp = &tuple->dst.ip;
283 saved_ip = tuple->src.ip;
284 other_ipp = &tuple->src.ip;
285 }
286 /* Don't do do_extra_mangle unless neccessary (overrides
287 explicit socket bindings, for example) */
288 orig_dstip = tuple->dst.ip;
289
290 IP_NF_ASSERT(mr->rangesize >= 1);
291 for (i = 0; i < mr->rangesize; i++) {
292 /* Host order */
293 u_int32_t minip, maxip, j;
294
295 /* Don't do ranges which are already eliminated. */
296 if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
297 continue;
298 }
299
300 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
301 minip = ntohl(mr->range[i].min_ip);
302 maxip = ntohl(mr->range[i].max_ip);
303 } else
304 minip = maxip = ntohl(*var_ipp);
305
306 randomness++;
307 for (j = 0; j < maxip - minip + 1; j++) {
308 unsigned int score;
309
310 *var_ipp = htonl(minip + (randomness + j)
311 % (maxip - minip + 1));
312
313 /* Reset the other ip in case it was mangled by
314 * do_extra_mangle last time. */
315 *other_ipp = saved_ip;
316
317 if (hooknum == NF_IP_LOCAL_OUT
318 && *var_ipp != orig_dstip
319 && !do_extra_mangle(*var_ipp, other_ipp)) {
320 DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
321 i, NIPQUAD(*var_ipp));
322 /* Can't route? This whole range part is
323 * probably screwed, but keep trying
324 * anyway. */
325 continue;
326 }
327
328 /* Count how many others map onto this. */
329 score = count_maps(tuple->src.ip, tuple->dst.ip,
330 tuple->dst.protonum, conntrack);
331 if (score < best.score) {
332 /* Optimization: doesn't get any better than
333 this. */
334 if (score == 0)
335 return (struct ip_nat_range *)
336 &mr->range[i];
337
338 best.score = score;
339 best.tuple = *tuple;
340 best.range = &mr->range[i];
341 }
342 }
343 }
344 *tuple = best.tuple;
345
346 /* Discard const. */
347 return (struct ip_nat_range *)best.range;
348 }
349
350 /* Fast version doesn't iterate through hash chains, but only handles
351 common case of single IP address (null NAT, masquerade) */
352 static struct ip_nat_range *
353 find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
354 const struct ip_nat_multi_range *mr,
355 const struct ip_conntrack *conntrack,
356 unsigned int hooknum)
357 {
358 if (mr->rangesize != 1
359 || (mr->range[0].flags & IP_NAT_RANGE_FULL)
360 || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
361 && mr->range[0].min_ip != mr->range[0].max_ip))
362 return find_best_ips_proto(tuple, mr, conntrack, hooknum);
363
364 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
365 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
366 tuple->src.ip = mr->range[0].min_ip;
367 else {
368 /* Only do extra mangle when required (breaks
369 socket binding) */
370 if (tuple->dst.ip != mr->range[0].min_ip
371 && hooknum == NF_IP_LOCAL_OUT
372 && !do_extra_mangle(mr->range[0].min_ip,
373 &tuple->src.ip))
374 return NULL;
375 tuple->dst.ip = mr->range[0].min_ip;
376 }
377 }
378
379 /* Discard const. */
380 return (struct ip_nat_range *)&mr->range[0];
381 }
382
383 static int
384 get_unique_tuple(struct ip_conntrack_tuple *tuple,
385 const struct ip_conntrack_tuple *orig_tuple,
386 const struct ip_nat_multi_range *mrr,
387 struct ip_conntrack *conntrack,
388 unsigned int hooknum)
389 {
390 struct ip_nat_protocol *proto
391 = find_nat_proto(orig_tuple->dst.protonum);
392 struct ip_nat_range *rptr;
393 unsigned int i;
394 int ret;
395
396 /* We temporarily use flags for marking full parts, but we
397 always clean up afterwards */
398 struct ip_nat_multi_range *mr = (void *)mrr;
399
400 /* 1) If this srcip/proto/src-proto-part is currently mapped,
401 and that same mapping gives a unique tuple within the given
402 range, use that.
403
404 This is only required for source (ie. NAT/masq) mappings.
405 So far, we don't do local source mappings, so multiple
406 manips not an issue. */
407 if (hooknum == NF_IP_POST_ROUTING) {
408 struct ip_conntrack_manip *manip;
409
410 manip = find_appropriate_src(orig_tuple, mr);
411 if (manip) {
412 /* Apply same source manipulation. */
413 *tuple = ((struct ip_conntrack_tuple)
414 { *manip, orig_tuple->dst });
415 DEBUGP("get_unique_tuple: Found current src map\n");
416 return 1;
417 }
418 }
419
420 /* 2) Select the least-used IP/proto combination in the given
421 range.
422 */
423 *tuple = *orig_tuple;
424 while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
425 != NULL) {
426 DEBUGP("Found best for "); DUMP_TUPLE(tuple);
427 /* 3) The per-protocol part of the manip is made to
428 map into the range to make a unique tuple. */
429
430 /* Only bother mapping if it's not already in range
431 and unique */
432 if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
433 || proto->in_range(tuple, HOOK2MANIP(hooknum),
434 &rptr->min, &rptr->max))
435 && !ip_nat_used_tuple(tuple, conntrack)) {
436 ret = 1;
437 goto clear_fulls;
438 } else {
439 if (proto->unique_tuple(tuple, rptr,
440 HOOK2MANIP(hooknum),
441 conntrack)) {
442 /* Must be unique. */
443 IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
444 conntrack));
445 ret = 1;
446 goto clear_fulls;
447 } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
448 /* Try implicit source NAT; protocol
449 may be able to play with ports to
450 make it unique. */
451 struct ip_nat_range r
452 = { IP_NAT_RANGE_MAP_IPS,
453 tuple->src.ip, tuple->src.ip,
454 { 0 }, { 0 } };
455 DEBUGP("Trying implicit mapping\n");
456 if (proto->unique_tuple(tuple, &r,
457 IP_NAT_MANIP_SRC,
458 conntrack)) {
459 /* Must be unique. */
460 IP_NF_ASSERT(!ip_nat_used_tuple
461 (tuple, conntrack));
462 ret = 1;
463 goto clear_fulls;
464 }
465 }
466 DEBUGP("Protocol can't get unique tuple %u.\n",
467 hooknum);
468 }
469
470 /* Eliminate that from range, and try again. */
471 rptr->flags |= IP_NAT_RANGE_FULL;
472 *tuple = *orig_tuple;
473 }
474
475 ret = 0;
476
477 clear_fulls:
478 /* Clear full flags. */
479 IP_NF_ASSERT(mr->rangesize >= 1);
480 for (i = 0; i < mr->rangesize; i++)
481 mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
482
483 return ret;
484 }
485
486 static inline int
487 helper_cmp(const struct ip_nat_helper *helper,
488 const struct ip_conntrack_tuple *tuple)
489 {
490 return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
491 }
492
493 /* Where to manip the reply packets (will be reverse manip). */
494 static unsigned int opposite_hook[NF_IP_NUMHOOKS]
495 = { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
496 [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
497 [NF_IP_LOCAL_OUT] = NF_IP_POST_ROUTING
498 };
499
500 unsigned int
501 ip_nat_setup_info(struct ip_conntrack *conntrack,
502 const struct ip_nat_multi_range *mr,
503 unsigned int hooknum)
504 {
505 struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
506 struct ip_conntrack_tuple orig_tp;
507 struct ip_nat_info *info = &conntrack->nat.info;
508
509 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
510 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
511 || hooknum == NF_IP_POST_ROUTING
512 || hooknum == NF_IP_LOCAL_OUT);
513 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
514
515 /* What we've got will look like inverse of reply. Normally
516 this is what is in the conntrack, except for prior
517 manipulations (future optimization: if num_manips == 0,
518 orig_tp =
519 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
520 invert_tuplepr(&orig_tp,
521 &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
522
523 #if 0
524 {
525 unsigned int i;
526
527 DEBUGP("Hook %u (%s), ", hooknum,
528 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
529 DUMP_TUPLE(&orig_tp);
530 DEBUGP("Range %p: ", mr);
531 for (i = 0; i < mr->rangesize; i++) {
532 DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
533 i,
534 (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
535 ? " MAP_IPS" : "",
536 (mr->range[i].flags
537 & IP_NAT_RANGE_PROTO_SPECIFIED)
538 ? " PROTO_SPECIFIED" : "",
539 (mr->range[i].flags & IP_NAT_RANGE_FULL)
540 ? " FULL" : "",
541 NIPQUAD(mr->range[i].min_ip),
542 NIPQUAD(mr->range[i].max_ip),
543 mr->range[i].min.all,
544 mr->range[i].max.all);
545 }
546 }
547 #endif
548
549 do {
550 if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
551 hooknum)) {
552 DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
553 conntrack);
554 return NF_DROP;
555 }
556
557 #if 0
558 DEBUGP("Hook %u (%s) %p\n", hooknum,
559 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
560 conntrack);
561 DEBUGP("Original: ");
562 DUMP_TUPLE(&orig_tp);
563 DEBUGP("New: ");
564 DUMP_TUPLE(&new_tuple);
565 #endif
566
567 /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
568 the original (A/B/C/D') and the mangled one (E/F/G/H').
569
570 We're only allowed to work with the SRC per-proto
571 part, so we create inverses of both to start, then
572 derive the other fields we need. */
573
574 /* Reply connection: simply invert the new tuple
575 (G/H/E/F') */
576 invert_tuplepr(&reply, &new_tuple);
577
578 /* Alter conntrack table so it recognizes replies.
579 If fail this race (reply tuple now used), repeat. */
580 } while (!ip_conntrack_alter_reply(conntrack, &reply));
581
582 /* FIXME: We can simply used existing conntrack reply tuple
583 here --RR */
584 /* Create inverse of original: C/D/A/B' */
585 invert_tuplepr(&inv_tuple, &orig_tp);
586
587 /* Has source changed?. */
588 if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
589 /* In this direction, a source manip. */
590 info->manips[info->num_manips++] =
591 ((struct ip_nat_info_manip)
592 { IP_CT_DIR_ORIGINAL, hooknum,
593 IP_NAT_MANIP_SRC, new_tuple.src });
594
595 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
596
597 /* In the reverse direction, a destination manip. */
598 info->manips[info->num_manips++] =
599 ((struct ip_nat_info_manip)
600 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
601 IP_NAT_MANIP_DST, orig_tp.src });
602 IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
603 }
604
605 /* Has destination changed? */
606 if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
607 /* In this direction, a destination manip */
608 info->manips[info->num_manips++] =
609 ((struct ip_nat_info_manip)
610 { IP_CT_DIR_ORIGINAL, hooknum,
611 IP_NAT_MANIP_DST, reply.src });
612
613 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
614
615 /* In the reverse direction, a source manip. */
616 info->manips[info->num_manips++] =
617 ((struct ip_nat_info_manip)
618 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
619 IP_NAT_MANIP_SRC, inv_tuple.src });
620 IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
621 }
622
623 /* If there's a helper, assign it; based on new tuple. */
624 info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
625 &reply);
626
627 /* It's done. */
628 info->initialized |= (1 << HOOK2MANIP(hooknum));
629 return NF_ACCEPT;
630 }
631
632 void replace_in_hashes(struct ip_conntrack *conntrack,
633 struct ip_nat_info *info)
634 {
635 /* Source has changed, so replace in hashes. */
636 unsigned int srchash
637 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
638 .tuple.src,
639 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
640 .tuple.dst.protonum);
641 /* We place packet as seen OUTGOUNG in byips_proto hash
642 (ie. reverse dst and src of reply packet. */
643 unsigned int ipsprotohash
644 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
645 .tuple.dst.ip,
646 conntrack->tuplehash[IP_CT_DIR_REPLY]
647 .tuple.src.ip,
648 conntrack->tuplehash[IP_CT_DIR_REPLY]
649 .tuple.dst.protonum);
650
651 IP_NF_ASSERT(info->bysource.conntrack == conntrack);
652 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
653
654 list_del(&info->bysource.list);
655 list_del(&info->byipsproto.list);
656
657 list_prepend(&bysource[srchash], &info->bysource);
658 list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
659 }
660
661 void place_in_hashes(struct ip_conntrack *conntrack,
662 struct ip_nat_info *info)
663 {
664 unsigned int srchash
665 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
666 .tuple.src,
667 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
668 .tuple.dst.protonum);
669 /* We place packet as seen OUTGOUNG in byips_proto hash
670 (ie. reverse dst and src of reply packet. */
671 unsigned int ipsprotohash
672 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
673 .tuple.dst.ip,
674 conntrack->tuplehash[IP_CT_DIR_REPLY]
675 .tuple.src.ip,
676 conntrack->tuplehash[IP_CT_DIR_REPLY]
677 .tuple.dst.protonum);
678
679 IP_NF_ASSERT(!info->bysource.conntrack);
680
681 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
682 info->byipsproto.conntrack = conntrack;
683 info->bysource.conntrack = conntrack;
684
685 list_prepend(&bysource[srchash], &info->bysource);
686 list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
687 }
688
689 static void
690 manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
691 const struct ip_conntrack_manip *manip,
692 enum ip_nat_manip_type maniptype,
693 __u32 *nfcache)
694 {
695 *nfcache |= NFC_ALTERED;
696 find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);
697
698 if (maniptype == IP_NAT_MANIP_SRC) {
699 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
700 iph->check);
701 iph->saddr = manip->ip;
702 } else {
703 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
704 iph->check);
705 iph->daddr = manip->ip;
706 }
707 #if 0
708 if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
709 DEBUGP("IP: checksum on packet bad.\n");
710
711 if (proto == IPPROTO_TCP) {
712 void *th = (u_int32_t *)iph + iph->ihl;
713 if (tcp_v4_check(th, len - 4*iph->ihl, iph->saddr, iph->daddr,
714 csum_partial((char *)th, len-4*iph->ihl, 0)))
715 DEBUGP("TCP: checksum on packet bad\n");
716 }
717 #endif
718 }
719
720 /* Do packet manipulations according to binding. */
721 unsigned int
722 do_bindings(struct ip_conntrack *ct,
723 enum ip_conntrack_info ctinfo,
724 struct ip_nat_info *info,
725 unsigned int hooknum,
726 struct sk_buff **pskb)
727 {
728 unsigned int i;
729 struct ip_nat_helper *helper;
730 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
731
732 /* Need nat lock to protect against modification, but neither
733 conntrack (referenced) and helper (deleted with
734 synchronize_bh()) can vanish. */
735 READ_LOCK(&ip_nat_lock);
736 for (i = 0; i < info->num_manips; i++) {
737 if (info->manips[i].direction == dir
738 && info->manips[i].hooknum == hooknum) {
739 DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
740 *pskb,
741 info->manips[i].maniptype == IP_NAT_MANIP_SRC
742 ? "SRC" : "DST",
743 NIPQUAD(info->manips[i].manip.ip),
744 htons(info->manips[i].manip.u.all));
745 manip_pkt((*pskb)->nh.iph->protocol,
746 (*pskb)->nh.iph,
747 (*pskb)->len,
748 &info->manips[i].manip,
749 info->manips[i].maniptype,
750 &(*pskb)->nfcache);
751 }
752 }
753 helper = info->helper;
754 READ_UNLOCK(&ip_nat_lock);
755
756 if (helper) {
757 /* Always defragged for helpers */
758 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
759 & __constant_htons(IP_MF|IP_OFFSET)));
760 return helper->help(ct, info, ctinfo, hooknum, pskb);
761 } else return NF_ACCEPT;
762 }
763
764 unsigned int
765 icmp_reply_translation(struct sk_buff *skb,
766 struct ip_conntrack *conntrack,
767 unsigned int hooknum,
768 int dir)
769 {
770 struct iphdr *iph = skb->nh.iph;
771 struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
772 struct iphdr *inner = (struct iphdr *)(hdr + 1);
773 size_t datalen = skb->len - ((void *)inner - (void *)iph);
774 unsigned int i;
775 struct ip_nat_info *info = &conntrack->nat.info;
776
777 IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
778 /* Must be RELATED */
779 IP_NF_ASSERT(skb->nfct - (struct ip_conntrack *)skb->nfct->master
780 == IP_CT_RELATED
781 || skb->nfct - (struct ip_conntrack *)skb->nfct->master
782 == IP_CT_RELATED+IP_CT_IS_REPLY);
783
784 /* Redirects on non-null nats must be dropped, else they'll
785 start talking to each other without our translation, and be
786 confused... --RR */
787 if (hdr->type == ICMP_REDIRECT) {
788 /* Don't care about races here. */
789 if (info->initialized
790 != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
791 || info->num_manips != 0)
792 return NF_DROP;
793 }
794
795 DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
796 skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
797 /* Note: May not be from a NAT'd host, but probably safest to
798 do translation always as if it came from the host itself
799 (even though a "host unreachable" coming from the host
800 itself is a bit wierd).
801
802 More explanation: some people use NAT for anonymizing.
803 Also, CERT recommends dropping all packets from private IP
804 addresses (although ICMP errors from internal links with
805 such addresses are not too uncommon, as Alan Cox points
806 out) */
807
808 READ_LOCK(&ip_nat_lock);
809 for (i = 0; i < info->num_manips; i++) {
810 DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
811 i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
812 "ORIG" : "REPLY", info->manips[i].hooknum);
813
814 if (info->manips[i].direction != dir)
815 continue;
816
817 /* Mapping the inner packet is just like a normal
818 packet, except it was never src/dst reversed, so
819 where we would normally apply a dst manip, we apply
820 a src, and vice versa. */
821 if (info->manips[i].hooknum == opposite_hook[hooknum]) {
822 DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
823 info->manips[i].maniptype == IP_NAT_MANIP_SRC
824 ? "DST" : "SRC",
825 NIPQUAD(info->manips[i].manip.ip),
826 ntohs(info->manips[i].manip.u.udp.port));
827 manip_pkt(inner->protocol, inner,
828 skb->len - ((void *)inner - (void *)iph),
829 &info->manips[i].manip,
830 !info->manips[i].maniptype,
831 &skb->nfcache);
832 /* Outer packet needs to have IP header NATed like
833 it's a reply. */
834 } else if (info->manips[i].hooknum == hooknum) {
835 /* Use mapping to map outer packet: 0 give no
836 per-proto mapping */
837 DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
838 info->manips[i].maniptype == IP_NAT_MANIP_SRC
839 ? "SRC" : "DST",
840 NIPQUAD(info->manips[i].manip.ip));
841 manip_pkt(0, iph, skb->len,
842 &info->manips[i].manip,
843 info->manips[i].maniptype,
844 &skb->nfcache);
845 }
846 }
847 READ_UNLOCK(&ip_nat_lock);
848
849 /* Since we mangled inside ICMP packet, recalculate its
850 checksum from scratch. (Hence the handling of incorrect
851 checksums in conntrack, so we don't accidentally fix one.) */
852 hdr->checksum = 0;
853 hdr->checksum = ip_compute_csum((unsigned char *)hdr,
854 sizeof(*hdr) + datalen);
855
856 return NF_ACCEPT;
857 }
858
859 int __init ip_nat_init(void)
860 {
861 size_t i;
862
863 /* Leave them the same for the moment. */
864 ip_nat_htable_size = ip_conntrack_htable_size;
865
866 /* One vmalloc for both hash tables */
867 bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
868 if (!bysource) {
869 return -ENOMEM;
870 }
871 byipsproto = bysource + ip_nat_htable_size;
872
873 /* Sew in builtin protocols. */
874 WRITE_LOCK(&ip_nat_lock);
875 list_append(&protos, &ip_nat_protocol_tcp);
876 list_append(&protos, &ip_nat_protocol_udp);
877 list_append(&protos, &ip_nat_protocol_icmp);
878 WRITE_UNLOCK(&ip_nat_lock);
879
880 for (i = 0; i < ip_nat_htable_size; i++) {
881 INIT_LIST_HEAD(&bysource[i]);
882 INIT_LIST_HEAD(&byipsproto[i]);
883 }
884
885 /* FIXME: Man, this is a hack. <SIGH> */
886 IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
887 ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
888
889 return 0;
890 }
891
892 /* Clear NAT section of all conntracks, in case we're loaded again. */
893 static int clean_nat(const struct ip_conntrack *i, void *data)
894 {
895 memset((void *)&i->nat, 0, sizeof(i->nat));
896 return 0;
897 }
898
899 /* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
900 void ip_nat_cleanup(void)
901 {
902 ip_ct_selective_cleanup(&clean_nat, NULL);
903 ip_conntrack_destroyed = NULL;
904 }
905