File: /usr/src/linux/net/ipv4/netfilter/ip_nat_core.c

1     /* NAT for netfilter; shared with compatibility layer. */
2     
3     /* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
4        Public Licence. */
5     #ifdef MODULE
6     #define __NO_VERSION__
7     #endif
8     #include <linux/version.h>
9     #include <linux/module.h>
10     #include <linux/types.h>
11     #include <linux/timer.h>
12     #include <linux/skbuff.h>
13     #include <linux/netfilter_ipv4.h>
14     #include <linux/brlock.h>
15     #include <linux/vmalloc.h>
16     #include <net/checksum.h>
17     #include <net/icmp.h>
18     #include <net/ip.h>
19     #include <net/tcp.h>  /* For tcp_prot in getorigdst */
20     
21     #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
22     #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
23     
24     #include <linux/netfilter_ipv4/ip_nat.h>
25     #include <linux/netfilter_ipv4/ip_nat_protocol.h>
26     #include <linux/netfilter_ipv4/ip_nat_core.h>
27     #include <linux/netfilter_ipv4/ip_nat_helper.h>
28     #include <linux/netfilter_ipv4/listhelp.h>
29     
30     #if 0
31     #define DEBUGP printk
32     #else
33     #define DEBUGP(format, args...)
34     #endif
35     
/* Fallback protocol handler, defined outside this file; find_nat_proto()
   returns it when nothing in protos matches. */
extern struct ip_nat_protocol unknown_nat_protocol;

/* Read/write lock taken around accesses to the hash tables and the
   lists declared below. */
DECLARE_RWLOCK(ip_nat_lock);

/* Registered per-protocol NAT handlers, and registered NAT helpers. */
LIST_HEAD(protos);
LIST_HEAD(helpers);

/* Number of buckets in each hash table; assigned in ip_nat_init(). */
static unsigned int ip_nat_htable_size;

/* The two hash tables (arrays of list heads), both set up by
   ip_nat_init(): one indexed by hash_by_src(), the other by
   hash_by_ipsproto(). */
static struct list_head *bysource;
static struct list_head *byipsproto;
47     
48     /* We keep extra hashes for each conntrack, for fast searching. */
49     static inline size_t
50     hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
51     {
52     	/* Modified src and dst, to ensure we don't create two
53                identical streams. */
54     	return (src + dst + proto) % ip_nat_htable_size;
55     }
56     
57     static inline size_t
58     hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
59     {
60     	/* Original src, to ensure we map it consistently if poss. */
61     	return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
62     }
63     
64     /* Noone using conntrack by the time this called. */
65     static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
66     {
67     	struct ip_nat_info *info = &conn->nat.info;
68     
69     	if (!info->initialized)
70     		return;
71     
72     	IP_NF_ASSERT(info->bysource.conntrack);
73     	IP_NF_ASSERT(info->byipsproto.conntrack);
74     
75     	WRITE_LOCK(&ip_nat_lock);
76     	LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
77     					  .tuple.src,
78     					  conn->tuplehash[IP_CT_DIR_ORIGINAL]
79     					  .tuple.dst.protonum)],
80     		    &info->bysource);
81     
82     	LIST_DELETE(&byipsproto
83     		    [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
84     				      .tuple.src.ip,
85     				      conn->tuplehash[IP_CT_DIR_REPLY]
86     				      .tuple.dst.ip,
87     				      conn->tuplehash[IP_CT_DIR_REPLY]
88     				      .tuple.dst.protonum)],
89     		    &info->byipsproto);
90     	WRITE_UNLOCK(&ip_nat_lock);
91     }
92     
/* We do checksum mangling rather than recomputation, so if a checksum
 * was wrong before it is still wrong afterwards.  Also works for
 * incomplete packets (eg. ICMP dest unreachables).
 *
 * oldvalinv is the bitwise inverse of the value being replaced, newval
 * the replacement, oldcheck the current checksum; the adjusted checksum
 * is returned. */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
	u_int32_t delta[2];
	unsigned int seed = oldcheck ^ 0xFFFF;

	delta[0] = oldvalinv;
	delta[1] = newval;

	return csum_fold(csum_partial((char *)delta, sizeof(delta), seed));
}
103     
104     static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
105     {
106     	return i->protonum == proto;
107     }
108     
109     struct ip_nat_protocol *
110     find_nat_proto(u_int16_t protonum)
111     {
112     	struct ip_nat_protocol *i;
113     
114     	MUST_BE_READ_LOCKED(&ip_nat_lock);
115     	i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
116     	if (!i)
117     		i = &unknown_nat_protocol;
118     	return i;
119     }
120     
121     /* Is this tuple already taken? (not by us) */
122     int
123     ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
124     		  const struct ip_conntrack *ignored_conntrack)
125     {
126     	/* Conntrack tracking doesn't keep track of outgoing tuples; only
127     	   incoming ones.  NAT means they don't have a fixed mapping,
128     	   so we invert the tuple and look for the incoming reply.
129     
130     	   We could keep a separate hash if this proves too slow. */
131     	struct ip_conntrack_tuple reply;
132     
133     	invert_tuplepr(&reply, tuple);
134     	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
135     }
136     
137     /* Does tuple + the source manip come within the range mr */
138     static int
139     in_range(const struct ip_conntrack_tuple *tuple,
140     	 const struct ip_conntrack_manip *manip,
141     	 const struct ip_nat_multi_range *mr)
142     {
143     	struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
144     	unsigned int i;
145     	struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
146     
147     	for (i = 0; i < mr->rangesize; i++) {
148     		/* If we are allowed to map IPs, then we must be in the
149     		   range specified, otherwise we must be unchanged. */
150     		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
151     			if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
152     			    || (ntohl(newtuple.src.ip)
153     				> ntohl(mr->range[i].max_ip)))
154     				continue;
155     		} else {
156     			if (newtuple.src.ip != tuple->src.ip)
157     				continue;
158     		}
159     
160     		if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
161     		    && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
162     				       &mr->range[i].min, &mr->range[i].max))
163     			return 1;
164     	}
165     	return 0;
166     }
167     
168     static inline int
169     src_cmp(const struct ip_nat_hash *i,
170     	const struct ip_conntrack_tuple *tuple,
171     	const struct ip_nat_multi_range *mr)
172     {
173     	return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
174     		== tuple->dst.protonum
175     		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
176     		== tuple->src.ip
177     		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
178     		== tuple->src.u.all
179     		&& in_range(tuple,
180     			    &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
181     			    .tuple.src,
182     			    mr));
183     }
184     
185     /* Only called for SRC manip */
186     static struct ip_conntrack_manip *
187     find_appropriate_src(const struct ip_conntrack_tuple *tuple,
188     		     const struct ip_nat_multi_range *mr)
189     {
190     	unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
191     	struct ip_nat_hash *i;
192     
193     	MUST_BE_READ_LOCKED(&ip_nat_lock);
194     	i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
195     	if (i)
196     		return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
197     	else
198     		return NULL;
199     }
200     
201     /* If it's really a local destination manip, it may need to do a
202        source manip too. */
203     static int
204     do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
205     {
206     	struct rtable *rt;
207     
208     	/* FIXME: IPTOS_TOS(iph->tos) --RR */
209     	if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
210     		DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
211     		       NIPQUAD(var_ip));
212     		return 0;
213     	}
214     
215     	*other_ipp = rt->rt_src;
216     	ip_rt_put(rt);
217     	return 1;
218     }
219     
220     /* Simple way to iterate through all. */
221     static inline int fake_cmp(const struct ip_nat_hash *i,
222     			   u_int32_t src, u_int32_t dst, u_int16_t protonum,
223     			   unsigned int *score,
224     			   const struct ip_conntrack *conntrack)
225     {
226     	/* Compare backwards: we're dealing with OUTGOING tuples, and
227                inside the conntrack is the REPLY tuple.  Don't count this
228                conntrack. */
229     	if (i->conntrack != conntrack
230     	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
231     	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
232     	    && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
233     		== protonum))
234     		(*score)++;
235     	return 0;
236     }
237     
238     static inline unsigned int
239     count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
240     	   const struct ip_conntrack *conntrack)
241     {
242     	unsigned int score = 0;
243     
244     	MUST_BE_READ_LOCKED(&ip_nat_lock);
245     	LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
246     		  fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
247     		  conntrack);
248     
249     	return score;
250     }
251     
252     /* For [FUTURE] fragmentation handling, we want the least-used
253        src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
254        if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
255        1-65535, we don't do pro-rata allocation based on ports; we choose
256        the ip with the lowest src-ip/dst-ip/proto usage.
257     
258        If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
259        range), we eliminate that and try again.  This is not the most
260        efficient approach, but if you're worried about that, don't hand us
261        ranges you don't really have.  */
262     static struct ip_nat_range *
263     find_best_ips_proto(struct ip_conntrack_tuple *tuple,
264     		    const struct ip_nat_multi_range *mr,
265     		    const struct ip_conntrack *conntrack,
266     		    unsigned int hooknum)
267     {
268     	unsigned int i;
269     	struct {
270     		const struct ip_nat_range *range;
271     		unsigned int score;
272     		struct ip_conntrack_tuple tuple;
273     	} best = { NULL,  0xFFFFFFFF };
274     	u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
275     	static unsigned int randomness = 0;
276     
277     	if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
278     		var_ipp = &tuple->src.ip;
279     		saved_ip = tuple->dst.ip;
280     		other_ipp = &tuple->dst.ip;
281     	} else {
282     		var_ipp = &tuple->dst.ip;
283     		saved_ip = tuple->src.ip;
284     		other_ipp = &tuple->src.ip;
285     	}
286     	/* Don't do do_extra_mangle unless neccessary (overrides
287                explicit socket bindings, for example) */
288     	orig_dstip = tuple->dst.ip;
289     
290     	IP_NF_ASSERT(mr->rangesize >= 1);
291     	for (i = 0; i < mr->rangesize; i++) {
292     		/* Host order */
293     		u_int32_t minip, maxip, j;
294     
295     		/* Don't do ranges which are already eliminated. */
296     		if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
297     			continue;
298     		}
299     
300     		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
301     			minip = ntohl(mr->range[i].min_ip);
302     			maxip = ntohl(mr->range[i].max_ip);
303     		} else
304     			minip = maxip = ntohl(*var_ipp);
305     
306     		randomness++;
307     		for (j = 0; j < maxip - minip + 1; j++) {
308     			unsigned int score;
309     
310     			*var_ipp = htonl(minip + (randomness + j) 
311     					 % (maxip - minip + 1));
312     
313     			/* Reset the other ip in case it was mangled by
314     			 * do_extra_mangle last time. */
315     			*other_ipp = saved_ip;
316     
317     			if (hooknum == NF_IP_LOCAL_OUT
318     			    && *var_ipp != orig_dstip
319     			    && !do_extra_mangle(*var_ipp, other_ipp)) {
320     				DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
321     				       i, NIPQUAD(*var_ipp));
322     				/* Can't route?  This whole range part is
323     				 * probably screwed, but keep trying
324     				 * anyway. */
325     				continue;
326     			}
327     
328     			/* Count how many others map onto this. */
329     			score = count_maps(tuple->src.ip, tuple->dst.ip,
330     					   tuple->dst.protonum, conntrack);
331     			if (score < best.score) {
332     				/* Optimization: doesn't get any better than
333     				   this. */
334     				if (score == 0)
335     					return (struct ip_nat_range *)
336     						&mr->range[i];
337     
338     				best.score = score;
339     				best.tuple = *tuple;
340     				best.range = &mr->range[i];
341     			}
342     		}
343     	}
344     	*tuple = best.tuple;
345     
346     	/* Discard const. */
347     	return (struct ip_nat_range *)best.range;
348     }
349     
350     /* Fast version doesn't iterate through hash chains, but only handles
351        common case of single IP address (null NAT, masquerade) */
352     static struct ip_nat_range *
353     find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
354     			 const struct ip_nat_multi_range *mr,
355     			 const struct ip_conntrack *conntrack,
356     			 unsigned int hooknum)
357     {
358     	if (mr->rangesize != 1
359     	    || (mr->range[0].flags & IP_NAT_RANGE_FULL)
360     	    || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
361     		&& mr->range[0].min_ip != mr->range[0].max_ip))
362     		return find_best_ips_proto(tuple, mr, conntrack, hooknum);
363     
364     	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
365     		if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
366     			tuple->src.ip = mr->range[0].min_ip;
367     		else {
368     			/* Only do extra mangle when required (breaks
369                                socket binding) */
370     			if (tuple->dst.ip != mr->range[0].min_ip
371     			    && hooknum == NF_IP_LOCAL_OUT
372     			    && !do_extra_mangle(mr->range[0].min_ip,
373     						&tuple->src.ip))
374     				return NULL;
375     			tuple->dst.ip = mr->range[0].min_ip;
376     		}
377     	}
378     
379     	/* Discard const. */
380     	return (struct ip_nat_range *)&mr->range[0];
381     }
382     
383     static int
384     get_unique_tuple(struct ip_conntrack_tuple *tuple,
385     		 const struct ip_conntrack_tuple *orig_tuple,
386     		 const struct ip_nat_multi_range *mrr,
387     		 struct ip_conntrack *conntrack,
388     		 unsigned int hooknum)
389     {
390     	struct ip_nat_protocol *proto
391     		= find_nat_proto(orig_tuple->dst.protonum);
392     	struct ip_nat_range *rptr;
393     	unsigned int i;
394     	int ret;
395     
396     	/* We temporarily use flags for marking full parts, but we
397     	   always clean up afterwards */
398     	struct ip_nat_multi_range *mr = (void *)mrr;
399     
400     	/* 1) If this srcip/proto/src-proto-part is currently mapped,
401     	   and that same mapping gives a unique tuple within the given
402     	   range, use that.
403     
404     	   This is only required for source (ie. NAT/masq) mappings.
405     	   So far, we don't do local source mappings, so multiple
406     	   manips not an issue.  */
407     	if (hooknum == NF_IP_POST_ROUTING) {
408     		struct ip_conntrack_manip *manip;
409     
410     		manip = find_appropriate_src(orig_tuple, mr);
411     		if (manip) {
412     			/* Apply same source manipulation. */
413     			*tuple = ((struct ip_conntrack_tuple)
414     				  { *manip, orig_tuple->dst });
415     			DEBUGP("get_unique_tuple: Found current src map\n");
416     			return 1;
417     		}
418     	}
419     
420     	/* 2) Select the least-used IP/proto combination in the given
421     	   range.
422     	*/
423     	*tuple = *orig_tuple;
424     	while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
425     	       != NULL) {
426     		DEBUGP("Found best for "); DUMP_TUPLE(tuple);
427     		/* 3) The per-protocol part of the manip is made to
428     		   map into the range to make a unique tuple. */
429     
430     		/* Only bother mapping if it's not already in range
431     		   and unique */
432     		if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
433     		     || proto->in_range(tuple, HOOK2MANIP(hooknum),
434     					&rptr->min, &rptr->max))
435     		    && !ip_nat_used_tuple(tuple, conntrack)) {
436     			ret = 1;
437     			goto clear_fulls;
438     		} else {
439     			if (proto->unique_tuple(tuple, rptr,
440     						HOOK2MANIP(hooknum),
441     						conntrack)) {
442     				/* Must be unique. */
443     				IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
444     								conntrack));
445     				ret = 1;
446     				goto clear_fulls;
447     			} else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
448     				/* Try implicit source NAT; protocol
449                                        may be able to play with ports to
450                                        make it unique. */
451     				struct ip_nat_range r
452     					= { IP_NAT_RANGE_MAP_IPS, 
453     					    tuple->src.ip, tuple->src.ip,
454     					    { 0 }, { 0 } };
455     				DEBUGP("Trying implicit mapping\n");
456     				if (proto->unique_tuple(tuple, &r,
457     							IP_NAT_MANIP_SRC,
458     							conntrack)) {
459     					/* Must be unique. */
460     					IP_NF_ASSERT(!ip_nat_used_tuple
461     						     (tuple, conntrack));
462     					ret = 1;
463     					goto clear_fulls;
464     				}
465     			}
466     			DEBUGP("Protocol can't get unique tuple %u.\n",
467     			       hooknum);
468     		}
469     
470     		/* Eliminate that from range, and try again. */
471     		rptr->flags |= IP_NAT_RANGE_FULL;
472     		*tuple = *orig_tuple;
473     	}
474     
475     	ret = 0;
476     
477      clear_fulls:
478     	/* Clear full flags. */
479     	IP_NF_ASSERT(mr->rangesize >= 1);
480     	for (i = 0; i < mr->rangesize; i++)
481     		mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
482     
483     	return ret;
484     }
485     
486     static inline int
487     helper_cmp(const struct ip_nat_helper *helper,
488     	   const struct ip_conntrack_tuple *tuple)
489     {
490     	return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
491     }
492     
493     /* Where to manip the reply packets (will be reverse manip). */
494     static unsigned int opposite_hook[NF_IP_NUMHOOKS]
495     = { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
496         [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
497         [NF_IP_LOCAL_OUT] = NF_IP_POST_ROUTING
498     };
499     
500     unsigned int
501     ip_nat_setup_info(struct ip_conntrack *conntrack,
502     		  const struct ip_nat_multi_range *mr,
503     		  unsigned int hooknum)
504     {
505     	struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
506     	struct ip_conntrack_tuple orig_tp;
507     	struct ip_nat_info *info = &conntrack->nat.info;
508     
509     	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
510     	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
511     		     || hooknum == NF_IP_POST_ROUTING
512     		     || hooknum == NF_IP_LOCAL_OUT);
513     	IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
514     
515     	/* What we've got will look like inverse of reply. Normally
516     	   this is what is in the conntrack, except for prior
517     	   manipulations (future optimization: if num_manips == 0,
518     	   orig_tp =
519     	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
520     	invert_tuplepr(&orig_tp,
521     		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
522     
523     #if 0
524     	{
525     	unsigned int i;
526     
527     	DEBUGP("Hook %u (%s), ", hooknum,
528     	       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
529     	DUMP_TUPLE(&orig_tp);
530     	DEBUGP("Range %p: ", mr);
531     	for (i = 0; i < mr->rangesize; i++) {
532     		DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
533     		       i,
534     		       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
535     		       ? " MAP_IPS" : "",
536     		       (mr->range[i].flags
537     			& IP_NAT_RANGE_PROTO_SPECIFIED)
538     		       ? " PROTO_SPECIFIED" : "",
539     		       (mr->range[i].flags & IP_NAT_RANGE_FULL)
540     		       ? " FULL" : "",
541     		       NIPQUAD(mr->range[i].min_ip),
542     		       NIPQUAD(mr->range[i].max_ip),
543     		       mr->range[i].min.all,
544     		       mr->range[i].max.all);
545     	}
546     	}
547     #endif
548     
549     	do {
550     		if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
551     				      hooknum)) {
552     			DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
553     			       conntrack);
554     			return NF_DROP;
555     		}
556     
557     #if 0
558     		DEBUGP("Hook %u (%s) %p\n", hooknum,
559     		       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
560     		       conntrack);
561     		DEBUGP("Original: ");
562     		DUMP_TUPLE(&orig_tp);
563     		DEBUGP("New: ");
564     		DUMP_TUPLE(&new_tuple);
565     #endif
566     
567     		/* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
568     		   the original (A/B/C/D') and the mangled one (E/F/G/H').
569     
570     		   We're only allowed to work with the SRC per-proto
571     		   part, so we create inverses of both to start, then
572     		   derive the other fields we need.  */
573     
574     		/* Reply connection: simply invert the new tuple
575                        (G/H/E/F') */
576     		invert_tuplepr(&reply, &new_tuple);
577     
578     		/* Alter conntrack table so it recognizes replies.
579                        If fail this race (reply tuple now used), repeat. */
580     	} while (!ip_conntrack_alter_reply(conntrack, &reply));
581     
582     	/* FIXME: We can simply used existing conntrack reply tuple
583                here --RR */
584     	/* Create inverse of original: C/D/A/B' */
585     	invert_tuplepr(&inv_tuple, &orig_tp);
586     
587     	/* Has source changed?. */
588     	if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
589     		/* In this direction, a source manip. */
590     		info->manips[info->num_manips++] =
591     			((struct ip_nat_info_manip)
592     			 { IP_CT_DIR_ORIGINAL, hooknum,
593     			   IP_NAT_MANIP_SRC, new_tuple.src });
594     
595     		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
596     
597     		/* In the reverse direction, a destination manip. */
598     		info->manips[info->num_manips++] =
599     			((struct ip_nat_info_manip)
600     			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
601     			   IP_NAT_MANIP_DST, orig_tp.src });
602     		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
603     	}
604     
605     	/* Has destination changed? */
606     	if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
607     		/* In this direction, a destination manip */
608     		info->manips[info->num_manips++] =
609     			((struct ip_nat_info_manip)
610     			 { IP_CT_DIR_ORIGINAL, hooknum,
611     			   IP_NAT_MANIP_DST, reply.src });
612     
613     		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
614     
615     		/* In the reverse direction, a source manip. */
616     		info->manips[info->num_manips++] =
617     			((struct ip_nat_info_manip)
618     			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
619     			   IP_NAT_MANIP_SRC, inv_tuple.src });
620     		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
621     	}
622     
623     	/* If there's a helper, assign it; based on new tuple. */
624     	info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
625     				 &reply);
626     
627     	/* It's done. */
628     	info->initialized |= (1 << HOOK2MANIP(hooknum));
629     	return NF_ACCEPT;
630     }
631     
632     void replace_in_hashes(struct ip_conntrack *conntrack,
633     		       struct ip_nat_info *info)
634     {
635     	/* Source has changed, so replace in hashes. */
636     	unsigned int srchash
637     		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
638     			      .tuple.src,
639     			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
640     			      .tuple.dst.protonum);
641     	/* We place packet as seen OUTGOUNG in byips_proto hash
642                (ie. reverse dst and src of reply packet. */
643     	unsigned int ipsprotohash
644     		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
645     				   .tuple.dst.ip,
646     				   conntrack->tuplehash[IP_CT_DIR_REPLY]
647     				   .tuple.src.ip,
648     				   conntrack->tuplehash[IP_CT_DIR_REPLY]
649     				   .tuple.dst.protonum);
650     
651     	IP_NF_ASSERT(info->bysource.conntrack == conntrack);
652     	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
653     
654     	list_del(&info->bysource.list);
655     	list_del(&info->byipsproto.list);
656     
657     	list_prepend(&bysource[srchash], &info->bysource);
658     	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
659     }
660     
661     void place_in_hashes(struct ip_conntrack *conntrack,
662     		     struct ip_nat_info *info)
663     {
664     	unsigned int srchash
665     		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
666     			      .tuple.src,
667     			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
668     			      .tuple.dst.protonum);
669     	/* We place packet as seen OUTGOUNG in byips_proto hash
670                (ie. reverse dst and src of reply packet. */
671     	unsigned int ipsprotohash
672     		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
673     				   .tuple.dst.ip,
674     				   conntrack->tuplehash[IP_CT_DIR_REPLY]
675     				   .tuple.src.ip,
676     				   conntrack->tuplehash[IP_CT_DIR_REPLY]
677     				   .tuple.dst.protonum);
678     
679     	IP_NF_ASSERT(!info->bysource.conntrack);
680     
681     	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
682     	info->byipsproto.conntrack = conntrack;
683     	info->bysource.conntrack = conntrack;
684     
685     	list_prepend(&bysource[srchash], &info->bysource);
686     	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
687     }
688     
689     static void
690     manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
691     	  const struct ip_conntrack_manip *manip,
692     	  enum ip_nat_manip_type maniptype,
693     	  __u32 *nfcache)
694     {
695     	*nfcache |= NFC_ALTERED;
696     	find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);
697     
698     	if (maniptype == IP_NAT_MANIP_SRC) {
699     		iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
700     						iph->check);
701     		iph->saddr = manip->ip;
702     	} else {
703     		iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
704     						iph->check);
705     		iph->daddr = manip->ip;
706     	}
707     #if 0
708     	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
709     		DEBUGP("IP: checksum on packet bad.\n");
710     
711     	if (proto == IPPROTO_TCP) {
712     		void *th = (u_int32_t *)iph + iph->ihl;
713     		if (tcp_v4_check(th, len - 4*iph->ihl, iph->saddr, iph->daddr,
714     				 csum_partial((char *)th, len-4*iph->ihl, 0)))
715     			DEBUGP("TCP: checksum on packet bad\n");
716     	}
717     #endif
718     }
719     
720     /* Do packet manipulations according to binding. */
721     unsigned int
722     do_bindings(struct ip_conntrack *ct,
723     	    enum ip_conntrack_info ctinfo,
724     	    struct ip_nat_info *info,
725     	    unsigned int hooknum,
726     	    struct sk_buff **pskb)
727     {
728     	unsigned int i;
729     	struct ip_nat_helper *helper;
730     	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
731     
732     	/* Need nat lock to protect against modification, but neither
733     	   conntrack (referenced) and helper (deleted with
734     	   synchronize_bh()) can vanish. */
735     	READ_LOCK(&ip_nat_lock);
736     	for (i = 0; i < info->num_manips; i++) {
737     		if (info->manips[i].direction == dir
738     		    && info->manips[i].hooknum == hooknum) {
739     			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
740     			       *pskb,
741     			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
742     			       ? "SRC" : "DST",
743     			       NIPQUAD(info->manips[i].manip.ip),
744     			       htons(info->manips[i].manip.u.all));
745     			manip_pkt((*pskb)->nh.iph->protocol,
746     				  (*pskb)->nh.iph,
747     				  (*pskb)->len,
748     				  &info->manips[i].manip,
749     				  info->manips[i].maniptype,
750     				  &(*pskb)->nfcache);
751     		}
752     	}
753     	helper = info->helper;
754     	READ_UNLOCK(&ip_nat_lock);
755     
756     	if (helper) {
757     		/* Always defragged for helpers */
758     		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
759     			       & __constant_htons(IP_MF|IP_OFFSET)));
760     		return helper->help(ct, info, ctinfo, hooknum, pskb);
761     	} else return NF_ACCEPT;
762     }
763     
764     unsigned int
765     icmp_reply_translation(struct sk_buff *skb,
766     		       struct ip_conntrack *conntrack,
767     		       unsigned int hooknum,
768     		       int dir)
769     {
770     	struct iphdr *iph = skb->nh.iph;
771     	struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
772     	struct iphdr *inner = (struct iphdr *)(hdr + 1);
773     	size_t datalen = skb->len - ((void *)inner - (void *)iph);
774     	unsigned int i;
775     	struct ip_nat_info *info = &conntrack->nat.info;
776     
777     	IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
778     	/* Must be RELATED */
779     	IP_NF_ASSERT(skb->nfct - (struct ip_conntrack *)skb->nfct->master
780     		     == IP_CT_RELATED
781     		     || skb->nfct - (struct ip_conntrack *)skb->nfct->master
782     		     == IP_CT_RELATED+IP_CT_IS_REPLY);
783     
784     	/* Redirects on non-null nats must be dropped, else they'll
785                start talking to each other without our translation, and be
786                confused... --RR */
787     	if (hdr->type == ICMP_REDIRECT) {
788     		/* Don't care about races here. */
789     		if (info->initialized
790     		    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
791     		    || info->num_manips != 0)
792     			return NF_DROP;
793     	}
794     
795     	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
796     	       skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
797     	/* Note: May not be from a NAT'd host, but probably safest to
798     	   do translation always as if it came from the host itself
799     	   (even though a "host unreachable" coming from the host
800     	   itself is a bit wierd).
801     
802     	   More explanation: some people use NAT for anonymizing.
803     	   Also, CERT recommends dropping all packets from private IP
804     	   addresses (although ICMP errors from internal links with
805     	   such addresses are not too uncommon, as Alan Cox points
806     	   out) */
807     
808     	READ_LOCK(&ip_nat_lock);
809     	for (i = 0; i < info->num_manips; i++) {
810     		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
811     		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
812     		       "ORIG" : "REPLY", info->manips[i].hooknum);
813     
814     		if (info->manips[i].direction != dir)
815     			continue;
816     
817     		/* Mapping the inner packet is just like a normal
818     		   packet, except it was never src/dst reversed, so
819     		   where we would normally apply a dst manip, we apply
820     		   a src, and vice versa. */
821     		if (info->manips[i].hooknum == opposite_hook[hooknum]) {
822     			DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
823     			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
824     			       ? "DST" : "SRC",
825     			       NIPQUAD(info->manips[i].manip.ip),
826     			       ntohs(info->manips[i].manip.u.udp.port));
827     			manip_pkt(inner->protocol, inner,
828     				  skb->len - ((void *)inner - (void *)iph),
829     				  &info->manips[i].manip,
830     				  !info->manips[i].maniptype,
831     				  &skb->nfcache);
832     		/* Outer packet needs to have IP header NATed like
833                        it's a reply. */
834     		} else if (info->manips[i].hooknum == hooknum) {
835     			/* Use mapping to map outer packet: 0 give no
836                                per-proto mapping */
837     			DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
838     			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
839     			       ? "SRC" : "DST",
840     			       NIPQUAD(info->manips[i].manip.ip));
841     			manip_pkt(0, iph, skb->len,
842     				  &info->manips[i].manip,
843     				  info->manips[i].maniptype,
844     				  &skb->nfcache);
845     		}
846     	}
847     	READ_UNLOCK(&ip_nat_lock);
848     
849     	/* Since we mangled inside ICMP packet, recalculate its
850     	   checksum from scratch.  (Hence the handling of incorrect
851     	   checksums in conntrack, so we don't accidentally fix one.)  */
852     	hdr->checksum = 0;
853     	hdr->checksum = ip_compute_csum((unsigned char *)hdr,
854     					sizeof(*hdr) + datalen);
855     
856     	return NF_ACCEPT;
857     }
858     
859     int __init ip_nat_init(void)
860     {
861     	size_t i;
862     
863     	/* Leave them the same for the moment. */
864     	ip_nat_htable_size = ip_conntrack_htable_size;
865     
866     	/* One vmalloc for both hash tables */
867     	bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
868     	if (!bysource) {
869     		return -ENOMEM;
870     	}
871     	byipsproto = bysource + ip_nat_htable_size;
872     
873     	/* Sew in builtin protocols. */
874     	WRITE_LOCK(&ip_nat_lock);
875     	list_append(&protos, &ip_nat_protocol_tcp);
876     	list_append(&protos, &ip_nat_protocol_udp);
877     	list_append(&protos, &ip_nat_protocol_icmp);
878     	WRITE_UNLOCK(&ip_nat_lock);
879     
880     	for (i = 0; i < ip_nat_htable_size; i++) {
881     		INIT_LIST_HEAD(&bysource[i]);
882     		INIT_LIST_HEAD(&byipsproto[i]);
883     	}
884     
885     	/* FIXME: Man, this is a hack.  <SIGH> */
886     	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
887     	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
888     
889     	return 0;
890     }
891     
892     /* Clear NAT section of all conntracks, in case we're loaded again. */
893     static int clean_nat(const struct ip_conntrack *i, void *data)
894     {
895     	memset((void *)&i->nat, 0, sizeof(i->nat));
896     	return 0;
897     }
898     
899     /* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
900     void ip_nat_cleanup(void)
901     {
902     	ip_ct_selective_cleanup(&clean_nat, NULL);
903     	ip_conntrack_destroyed = NULL;
904     }
905