File: /usr/src/linux/net/ipv4/netfilter/ip_conntrack_core.c

1     /* Connection state tracking for netfilter.  This is separated from,
2        but required by, the NAT layer; it can also be used by an iptables
3        extension. */
4     
5     /* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
6        Public Licence. */
7     
8     #ifdef MODULE
9     #define __NO_VERSION__
10     #endif
11     #include <linux/version.h>
12     #include <linux/config.h>
13     #include <linux/types.h>
14     #include <linux/ip.h>
15     #include <linux/netfilter.h>
16     #include <linux/netfilter_ipv4.h>
17     #include <linux/module.h>
18     #include <linux/skbuff.h>
19     #include <linux/proc_fs.h>
20     #include <linux/vmalloc.h>
21     #include <linux/brlock.h>
22     #include <net/checksum.h>
23     #include <linux/stddef.h>
24     #include <linux/sysctl.h>
25     #include <linux/slab.h>
26     /* For ERR_PTR().  Yeah, I know... --RR */
27     #include <linux/fs.h>
28     
29     /* This rwlock protects the main hash table, protocol/helper/expected
30        registrations, conntrack timers*/
31     #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
32     #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
33     
34     #include <linux/netfilter_ipv4/ip_conntrack.h>
35     #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
36     #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
37     #include <linux/netfilter_ipv4/ip_conntrack_core.h>
38     #include <linux/netfilter_ipv4/listhelp.h>
39     
40     #if 0
41     #define DEBUGP printk
42     #else
43     #define DEBUGP(format, args...)
44     #endif
45     
46     DECLARE_RWLOCK(ip_conntrack_lock);
47     
48     void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
49     LIST_HEAD(expect_list);
50     LIST_HEAD(protocol_list);
51     static LIST_HEAD(helpers);
52     unsigned int ip_conntrack_htable_size = 0;
53     static int ip_conntrack_max = 0;
54     static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
55     struct list_head *ip_conntrack_hash;
56     static kmem_cache_t *ip_conntrack_cachep;
57     
58     extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
59     
60     static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
61     			      u_int8_t protocol)
62     {
63     	return protocol == curr->proto;
64     }
65     
66     struct ip_conntrack_protocol *__find_proto(u_int8_t protocol)
67     {
68     	struct ip_conntrack_protocol *p;
69     
70     	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
71     	p = LIST_FIND(&protocol_list, proto_cmpfn,
72     		      struct ip_conntrack_protocol *, protocol);
73     	if (!p)
74     		p = &ip_conntrack_generic_protocol;
75     
76     	return p;
77     }
78     
79     struct ip_conntrack_protocol *find_proto(u_int8_t protocol)
80     {
81     	struct ip_conntrack_protocol *p;
82     
83     	READ_LOCK(&ip_conntrack_lock);
84     	p = __find_proto(protocol);
85     	READ_UNLOCK(&ip_conntrack_lock);
86     	return p;
87     }
88     
89     static inline void ip_conntrack_put(struct ip_conntrack *ct)
90     {
91     	IP_NF_ASSERT(ct);
92     	IP_NF_ASSERT(ct->infos[0].master);
93     	/* nf_conntrack_put wants to go via an info struct, so feed it
94                one at random. */
95     	nf_conntrack_put(&ct->infos[0]);
96     }
97     
98     static inline u_int32_t
99     hash_conntrack(const struct ip_conntrack_tuple *tuple)
100     {
101     #if 0
102     	dump_tuple(tuple);
103     #endif
104     	/* ntohl because more differences in low bits. */
105     	/* To ensure that halves of the same connection don't hash
106     	   clash, we add the source per-proto again. */
107     	return (ntohl(tuple->src.ip + tuple->dst.ip
108     		     + tuple->src.u.all + tuple->dst.u.all
109     		     + tuple->dst.protonum)
110     		+ ntohs(tuple->src.u.all))
111     		% ip_conntrack_htable_size;
112     }
113     
114     inline int
115     get_tuple(const struct iphdr *iph, size_t len,
116     	  struct ip_conntrack_tuple *tuple,
117     	  struct ip_conntrack_protocol *protocol)
118     {
119     	int ret;
120     
121     	/* Never happen */
122     	if (iph->frag_off & htons(IP_OFFSET)) {
123     		printk("ip_conntrack_core: Frag of proto %u.\n",
124     		       iph->protocol);
125     		return 0;
126     	}
127     	/* Guarantee 8 protocol bytes: if more wanted, use len param */
128     	else if (iph->ihl * 4 + 8 > len)
129     		return 0;
130     
131     	tuple->src.ip = iph->saddr;
132     	tuple->dst.ip = iph->daddr;
133     	tuple->dst.protonum = iph->protocol;
134     
135     	ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
136     				     len - 4*iph->ihl,
137     				     tuple);
138     	return ret;
139     }
140     
141     static int
142     invert_tuple(struct ip_conntrack_tuple *inverse,
143     	     const struct ip_conntrack_tuple *orig,
144     	     const struct ip_conntrack_protocol *protocol)
145     {
146     	inverse->src.ip = orig->dst.ip;
147     	inverse->dst.ip = orig->src.ip;
148     	inverse->dst.protonum = orig->dst.protonum;
149     
150     	return protocol->invert_tuple(inverse, orig);
151     }
152     
153     static void
154     clean_from_lists(struct ip_conntrack *ct)
155     {
156     	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
157     	/* Remove from both hash lists: must not NULL out next ptrs,
158                otherwise we'll look unconfirmed.  Fortunately, LIST_DELETE
159                doesn't do this. --RR */
160     	LIST_DELETE(&ip_conntrack_hash
161     		    [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)],
162     		    &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
163     	LIST_DELETE(&ip_conntrack_hash
164     		    [hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)],
165     		    &ct->tuplehash[IP_CT_DIR_REPLY]);
166     	/* If our expected is in the list, take it out. */
167     	if (ct->expected.expectant) {
168     		IP_NF_ASSERT(list_inlist(&expect_list, &ct->expected));
169     		IP_NF_ASSERT(ct->expected.expectant == ct);
170     		LIST_DELETE(&expect_list, &ct->expected);
171     	}
172     }
173     
174     static void
175     destroy_conntrack(struct nf_conntrack *nfct)
176     {
177     	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
178     
179     	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
180     	IP_NF_ASSERT(!timer_pending(&ct->timeout));
181     
182     	if (ct->master.master)
183     		nf_conntrack_put(&ct->master);
184     
185     	if (ip_conntrack_destroyed)
186     		ip_conntrack_destroyed(ct);
187     	kmem_cache_free(ip_conntrack_cachep, ct);
188     	atomic_dec(&ip_conntrack_count);
189     }
190     
191     static void death_by_timeout(unsigned long ul_conntrack)
192     {
193     	struct ip_conntrack *ct = (void *)ul_conntrack;
194     
195     	WRITE_LOCK(&ip_conntrack_lock);
196     	clean_from_lists(ct);
197     	WRITE_UNLOCK(&ip_conntrack_lock);
198     	ip_conntrack_put(ct);
199     }
200     
201     static inline int
202     conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
203     		    const struct ip_conntrack_tuple *tuple,
204     		    const struct ip_conntrack *ignored_conntrack)
205     {
206     	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
207     	return i->ctrack != ignored_conntrack
208     		&& ip_ct_tuple_equal(tuple, &i->tuple);
209     }
210     
211     static struct ip_conntrack_tuple_hash *
212     __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
213     		    const struct ip_conntrack *ignored_conntrack)
214     {
215     	struct ip_conntrack_tuple_hash *h;
216     
217     	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
218     	h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)],
219     		      conntrack_tuple_cmp,
220     		      struct ip_conntrack_tuple_hash *,
221     		      tuple, ignored_conntrack);
222     	return h;
223     }
224     
225     /* Find a connection corresponding to a tuple. */
226     struct ip_conntrack_tuple_hash *
227     ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
228     		      const struct ip_conntrack *ignored_conntrack)
229     {
230     	struct ip_conntrack_tuple_hash *h;
231     
232     	READ_LOCK(&ip_conntrack_lock);
233     	h = __ip_conntrack_find(tuple, ignored_conntrack);
234     	if (h)
235     		atomic_inc(&h->ctrack->ct_general.use);
236     	READ_UNLOCK(&ip_conntrack_lock);
237     
238     	return h;
239     }
240     
241     static inline struct ip_conntrack *
242     __ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
243     {
244     	struct ip_conntrack *ct
245     		= (struct ip_conntrack *)nfct->master;
246     
247     	/* ctinfo is the index of the nfct inside the conntrack */
248     	*ctinfo = nfct - ct->infos;
249     	IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
250     	return ct;
251     }
252     
253     /* Return conntrack and conntrack_info given skb->nfct->master */
254     struct ip_conntrack *
255     ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
256     {
257     	if (skb->nfct) 
258     		return __ip_conntrack_get(skb->nfct, ctinfo);
259     	return NULL;
260     }
261     
262     /* Confirm a connection given skb->nfct; places it in hash table */
263     int
264     __ip_conntrack_confirm(struct nf_ct_info *nfct)
265     {
266     	unsigned int hash, repl_hash;
267     	struct ip_conntrack *ct;
268     	enum ip_conntrack_info ctinfo;
269     
270     	ct = __ip_conntrack_get(nfct, &ctinfo);
271     
272     	/* ipt_REJECT uses ip_conntrack_attach to attach related
273     	   ICMP/TCP RST packets in other direction.  Actual packet
274     	   which created connection will be IP_CT_NEW or for an
275     	   expected connection, IP_CT_RELATED. */
276     	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
277     		return NF_ACCEPT;
278     
279     	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
280     	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
281     
282     	/* We're not in hash table, and we refuse to set up related
283     	   connections for unconfirmed conns.  But packet copies and
284     	   REJECT will give spurious warnings here. */
285     	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
286     
287     	/* No external references means noone else could have
288                confirmed us. */
289     	IP_NF_ASSERT(!is_confirmed(ct));
290     	DEBUGP("Confirming conntrack %p\n", ct);
291     
292     	WRITE_LOCK(&ip_conntrack_lock);
293     	/* See if there's one in the list already, including reverse:
294                NAT could have grabbed it without realizing, since we're
295                not in the hash.  If there is, we lost race. */
296     	if (!LIST_FIND(&ip_conntrack_hash[hash],
297     		       conntrack_tuple_cmp,
298     		       struct ip_conntrack_tuple_hash *,
299     		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
300     	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
301     			  conntrack_tuple_cmp,
302     			  struct ip_conntrack_tuple_hash *,
303     			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
304     		list_prepend(&ip_conntrack_hash[hash],
305     			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
306     		list_prepend(&ip_conntrack_hash[repl_hash],
307     			     &ct->tuplehash[IP_CT_DIR_REPLY]);
308     		/* Timer relative to confirmation time, not original
309     		   setting time, otherwise we'd get timer wrap in
310     		   wierd delay cases. */
311     		ct->timeout.expires += jiffies;
312     		add_timer(&ct->timeout);
313     		atomic_inc(&ct->ct_general.use);
314     		WRITE_UNLOCK(&ip_conntrack_lock);
315     		return NF_ACCEPT;
316     	}
317     
318     	WRITE_UNLOCK(&ip_conntrack_lock);
319     	return NF_DROP;
320     }
321     
322     /* Returns true if a connection correspondings to the tuple (required
323        for NAT). */
324     int
325     ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
326     			 const struct ip_conntrack *ignored_conntrack)
327     {
328     	struct ip_conntrack_tuple_hash *h;
329     
330     	READ_LOCK(&ip_conntrack_lock);
331     	h = __ip_conntrack_find(tuple, ignored_conntrack);
332     	READ_UNLOCK(&ip_conntrack_lock);
333     
334     	return h != NULL;
335     }
336     
337     /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
338     struct ip_conntrack *
339     icmp_error_track(struct sk_buff *skb,
340     		 enum ip_conntrack_info *ctinfo,
341     		 unsigned int hooknum)
342     {
343     	const struct iphdr *iph;
344     	struct icmphdr *hdr;
345     	struct ip_conntrack_tuple innertuple, origtuple;
346     	struct iphdr *inner;
347     	size_t datalen;
348     	struct ip_conntrack_protocol *innerproto;
349     	struct ip_conntrack_tuple_hash *h;
350     
351     	IP_NF_ASSERT(iph->protocol == IPPROTO_ICMP);
352     	IP_NF_ASSERT(skb->nfct == NULL);
353     
354     	iph = skb->nh.iph;
355     	hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
356     	inner = (struct iphdr *)(hdr + 1);
357     	datalen = skb->len - iph->ihl*4 - sizeof(*hdr);
358     
359     	if (skb->len < iph->ihl * 4 + sizeof(*hdr) + sizeof(*iph)) {
360     		DEBUGP("icmp_error_track: too short\n");
361     		return NULL;
362     	}
363     
364     	if (hdr->type != ICMP_DEST_UNREACH
365     	    && hdr->type != ICMP_SOURCE_QUENCH
366     	    && hdr->type != ICMP_TIME_EXCEEDED
367     	    && hdr->type != ICMP_PARAMETERPROB
368     	    && hdr->type != ICMP_REDIRECT)
369     		return NULL;
370     
371     	/* Ignore ICMP's containing fragments (shouldn't happen) */
372     	if (inner->frag_off & htons(IP_OFFSET)) {
373     		DEBUGP("icmp_error_track: fragment of proto %u\n",
374     		       inner->protocol);
375     		return NULL;
376     	}
377     
378     	/* Ignore it if the checksum's bogus. */
379     	if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
380     		DEBUGP("icmp_error_track: bad csum\n");
381     		return NULL;
382     	}
383     
384     	innerproto = find_proto(inner->protocol);
385     	/* Are they talking about one of our connections? */
386     	if (inner->ihl * 4 + 8 > datalen
387     	    || !get_tuple(inner, datalen, &origtuple, innerproto)) {
388     		DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
389     		       inner->protocol, inner->ihl, 8,
390     		       datalen);
391     		return NULL;
392     	}
393     
394     	/* Ordinarily, we'd expect the inverted tupleproto, but it's
395     	   been preserved inside the ICMP. */
396     	if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
397     		DEBUGP("icmp_error_track: Can't invert tuple\n");
398     		return NULL;
399     	}
400     
401     	*ctinfo = IP_CT_RELATED;
402     
403     	h = ip_conntrack_find_get(&innertuple, NULL);
404     	if (!h) {
405     		/* Locally generated ICMPs will match inverted if they
406     		   haven't been SNAT'ed yet */
407     		/* FIXME: NAT code has to handle half-done double NAT --RR */
408     		if (hooknum == NF_IP_LOCAL_OUT)
409     			h = ip_conntrack_find_get(&origtuple, NULL);
410     
411     		if (!h) {
412     			DEBUGP("icmp_error_track: no match\n");
413     			return NULL;
414     		}
415     		/* Reverse direction from that found */
416     		if (DIRECTION(h) != IP_CT_DIR_REPLY)
417     			*ctinfo += IP_CT_IS_REPLY;
418     	} else {
419     		if (DIRECTION(h) == IP_CT_DIR_REPLY)
420     			*ctinfo += IP_CT_IS_REPLY;
421     	}
422     
423     	/* Update skb to refer to this connection */
424     	skb->nfct = &h->ctrack->infos[*ctinfo];
425     	return h->ctrack;
426     }
427     
428     /* There's a small race here where we may free a just-assured
429        connection.  Too bad: we're in trouble anyway. */
430     static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
431     {
432     	return !(i->ctrack->status & IPS_ASSURED);
433     }
434     
435     static int early_drop(struct list_head *chain)
436     {
437     	/* Traverse backwards: gives us oldest, which is roughly LRU */
438     	struct ip_conntrack_tuple_hash *h;
439     	int dropped = 0;
440     
441     	READ_LOCK(&ip_conntrack_lock);
442     	h = LIST_FIND(chain, unreplied, struct ip_conntrack_tuple_hash *);
443     	if (h)
444     		atomic_inc(&h->ctrack->ct_general.use);
445     	READ_UNLOCK(&ip_conntrack_lock);
446     
447     	if (!h)
448     		return dropped;
449     
450     	if (del_timer(&h->ctrack->timeout)) {
451     		death_by_timeout((unsigned long)h->ctrack);
452     		dropped = 1;
453     	}
454     	ip_conntrack_put(h->ctrack);
455     	return dropped;
456     }
457     
458     static inline int helper_cmp(const struct ip_conntrack_helper *i,
459     			     const struct ip_conntrack_tuple *rtuple)
460     {
461     	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
462     }
463     
464     /* Compare parts depending on mask. */
465     static inline int expect_cmp(const struct ip_conntrack_expect *i,
466     			     const struct ip_conntrack_tuple *tuple)
467     {
468     	return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
469     }
470     
471     /* Allocate a new conntrack: we return -ENOMEM if classification
472        failed due to stress.  Otherwise it really is unclassifiable. */
473     static struct ip_conntrack_tuple_hash *
474     init_conntrack(const struct ip_conntrack_tuple *tuple,
475     	       struct ip_conntrack_protocol *protocol,
476     	       struct sk_buff *skb)
477     {
478     	struct ip_conntrack *conntrack;
479     	struct ip_conntrack_tuple repl_tuple;
480     	size_t hash, repl_hash;
481     	struct ip_conntrack_expect *expected;
482     	int i;
483     	static unsigned int drop_next = 0;
484     
485     	hash = hash_conntrack(tuple);
486     
487     	if (ip_conntrack_max &&
488     	    atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
489     		/* Try dropping from random chain, or else from the
490                        chain about to put into (in case they're trying to
491                        bomb one hash chain). */
492     		if (drop_next >= ip_conntrack_htable_size)
493     			drop_next = 0;
494     		if (!early_drop(&ip_conntrack_hash[drop_next++])
495     		    && !early_drop(&ip_conntrack_hash[hash])) {
496     			if (net_ratelimit())
497     				printk(KERN_WARNING
498     				       "ip_conntrack: table full, dropping"
499     				       " packet.\n");
500     			return ERR_PTR(-ENOMEM);
501     		}
502     	}
503     
504     	if (!invert_tuple(&repl_tuple, tuple, protocol)) {
505     		DEBUGP("Can't invert tuple.\n");
506     		return NULL;
507     	}
508     	repl_hash = hash_conntrack(&repl_tuple);
509     
510     	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
511     	if (!conntrack) {
512     		DEBUGP("Can't allocate conntrack.\n");
513     		return ERR_PTR(-ENOMEM);
514     	}
515     
516     	memset(conntrack, 0, sizeof(struct ip_conntrack));
517     	atomic_set(&conntrack->ct_general.use, 1);
518     	conntrack->ct_general.destroy = destroy_conntrack;
519     	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
520     	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
521     	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
522     	conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
523     	for (i=0; i < IP_CT_NUMBER; i++)
524     		conntrack->infos[i].master = &conntrack->ct_general;
525     
526     	if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
527     		kmem_cache_free(ip_conntrack_cachep, conntrack);
528     		return NULL;
529     	}
530     	/* Don't set timer yet: wait for confirmation */
531     	init_timer(&conntrack->timeout);
532     	conntrack->timeout.data = (unsigned long)conntrack;
533     	conntrack->timeout.function = death_by_timeout;
534     
535     	/* Mark clearly that it's not in the hash table. */
536     	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL;
537     
538     	/* Write lock required for deletion of expected.  Without
539                this, a read-lock would do. */
540     	WRITE_LOCK(&ip_conntrack_lock);
541     	conntrack->helper = LIST_FIND(&helpers, helper_cmp,
542     				      struct ip_conntrack_helper *,
543     				      &repl_tuple);
544     	/* Need finding and deleting of expected ONLY if we win race */
545     	expected = LIST_FIND(&expect_list, expect_cmp,
546     			     struct ip_conntrack_expect *, tuple);
547     	/* If master is not in hash table yet (ie. packet hasn't left
548     	   this machine yet), how can other end know about expected?
549     	   Hence these are not the droids you are looking for (if
550     	   master ct never got confirmed, we'd hold a reference to it
551     	   and weird things would happen to future packets). */
552     	if (expected && is_confirmed(expected->expectant)) {
553     		/* Welcome, Mr. Bond.  We've been expecting you... */
554     		conntrack->status = IPS_EXPECTED;
555     		conntrack->master.master = &expected->expectant->ct_general;
556     		IP_NF_ASSERT(conntrack->master.master);
557     		LIST_DELETE(&expect_list, expected);
558     		expected->expectant = NULL;
559     		nf_conntrack_get(&conntrack->master);
560     	}
561     	atomic_inc(&ip_conntrack_count);
562     	WRITE_UNLOCK(&ip_conntrack_lock);
563     
564     	if (expected && expected->expectfn)
565     		expected->expectfn(conntrack);
566     	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
567     }
568     
569     /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
570     static inline struct ip_conntrack *
571     resolve_normal_ct(struct sk_buff *skb,
572     		  struct ip_conntrack_protocol *proto,
573     		  int *set_reply,
574     		  unsigned int hooknum,
575     		  enum ip_conntrack_info *ctinfo)
576     {
577     	struct ip_conntrack_tuple tuple;
578     	struct ip_conntrack_tuple_hash *h;
579     
580     	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
581     
582     	if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto))
583     		return NULL;
584     
585     	/* look for tuple match */
586     	h = ip_conntrack_find_get(&tuple, NULL);
587     	if (!h) {
588     		h = init_conntrack(&tuple, proto, skb);
589     		if (!h)
590     			return NULL;
591     		if (IS_ERR(h))
592     			return (void *)h;
593     	}
594     
595     	/* It exists; we have (non-exclusive) reference. */
596     	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
597     		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
598     		/* Please set reply bit if this packet OK */
599     		*set_reply = 1;
600     	} else {
601     		/* Once we've had two way comms, always ESTABLISHED. */
602     		if (h->ctrack->status & IPS_SEEN_REPLY) {
603     			DEBUGP("ip_conntrack_in: normal packet for %p\n",
604     			       h->ctrack);
605     		        *ctinfo = IP_CT_ESTABLISHED;
606     		} else if (h->ctrack->status & IPS_EXPECTED) {
607     			DEBUGP("ip_conntrack_in: related packet for %p\n",
608     			       h->ctrack);
609     			*ctinfo = IP_CT_RELATED;
610     		} else {
611     			DEBUGP("ip_conntrack_in: new packet for %p\n",
612     			       h->ctrack);
613     			*ctinfo = IP_CT_NEW;
614     		}
615     		*set_reply = 0;
616     	}
617     	skb->nfct = &h->ctrack->infos[*ctinfo];
618     	return h->ctrack;
619     }
620     
621     /* Netfilter hook itself. */
622     unsigned int ip_conntrack_in(unsigned int hooknum,
623     			     struct sk_buff **pskb,
624     			     const struct net_device *in,
625     			     const struct net_device *out,
626     			     int (*okfn)(struct sk_buff *))
627     {
628     	struct ip_conntrack *ct;
629     	enum ip_conntrack_info ctinfo;
630     	struct ip_conntrack_protocol *proto;
631     	int set_reply;
632     	int ret;
633     
634     	/* FIXME: Do this right please. --RR */
635     	(*pskb)->nfcache |= NFC_UNKNOWN;
636     
637     /* Doesn't cover locally-generated broadcast, so not worth it. */
638     #if 0
639     	/* Ignore broadcast: no `connection'. */
640     	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
641     		printk("Broadcast packet!\n");
642     		return NF_ACCEPT;
643     	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
644     		   == htonl(0x000000FF)) {
645     		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
646     		       NIPQUAD((*pskb)->nh.iph->saddr),
647     		       NIPQUAD((*pskb)->nh.iph->daddr),
648     		       (*pskb)->sk, (*pskb)->pkt_type);
649     	}
650     #endif
651     
652     	/* Previously seen (loopback)?  Ignore.  Do this before
653                fragment check. */
654     	if ((*pskb)->nfct)
655     		return NF_ACCEPT;
656     
657     	/* Gather fragments. */
658     	if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
659     		*pskb = ip_ct_gather_frags(*pskb);
660     		if (!*pskb)
661     			return NF_STOLEN;
662     	}
663     
664     	proto = find_proto((*pskb)->nh.iph->protocol);
665     
666     	/* It may be an icmp error... */
667     	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 
668     	    && icmp_error_track(*pskb, &ctinfo, hooknum))
669     		return NF_ACCEPT;
670     
671     	if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
672     		/* Not valid part of a connection */
673     		return NF_ACCEPT;
674     
675     	if (IS_ERR(ct))
676     		/* Too stressed to deal. */
677     		return NF_DROP;
678     
679     	IP_NF_ASSERT((*pskb)->nfct);
680     
681     	ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
682     	if (ret == -1) {
683     		/* Invalid */
684     		nf_conntrack_put((*pskb)->nfct);
685     		(*pskb)->nfct = NULL;
686     		return NF_ACCEPT;
687     	}
688     
689     	if (ret != NF_DROP && ct->helper) {
690     		ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
691     				       ct, ctinfo);
692     		if (ret == -1) {
693     			/* Invalid */
694     			nf_conntrack_put((*pskb)->nfct);
695     			(*pskb)->nfct = NULL;
696     			return NF_ACCEPT;
697     		}
698     	}
699     	if (set_reply)
700     		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
701     
702     	return ret;
703     }
704     
705     int invert_tuplepr(struct ip_conntrack_tuple *inverse,
706     		   const struct ip_conntrack_tuple *orig)
707     {
708     	return invert_tuple(inverse, orig, find_proto(orig->dst.protonum));
709     }
710     
711     static void unexpect_related(struct ip_conntrack *related_to)
712     {
713     	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
714     	list_del(&related_to->expected.list);
715     	related_to->expected.expectant = NULL;
716     }
717     
718     /* Would two expected things clash? */
719     static inline int expect_clash(const struct ip_conntrack_expect *i,
720     			       const struct ip_conntrack_expect *new)
721     {
722     	/* Part covered by intersection of masks must be unequal,
723                otherwise they clash */
724     	struct ip_conntrack_tuple intersect_mask
725     		= { { i->mask.src.ip & new->mask.src.ip,
726     		      { i->mask.src.u.all & new->mask.src.u.all } },
727     		    { i->mask.dst.ip & new->mask.dst.ip,
728     		      { i->mask.dst.u.all & new->mask.dst.u.all },
729     		      i->mask.dst.protonum & new->mask.dst.protonum } };
730     
731     	return ip_ct_tuple_mask_cmp(&i->tuple, &new->tuple, &intersect_mask);
732     }
733     
734     /* Add a related connection. */
735     int ip_conntrack_expect_related(struct ip_conntrack *related_to,
736     				const struct ip_conntrack_tuple *tuple,
737     				const struct ip_conntrack_tuple *mask,
738     				int (*expectfn)(struct ip_conntrack *))
739     {
740     	WRITE_LOCK(&ip_conntrack_lock);
741     	if (related_to->expected.expectant)
742     		unexpect_related(related_to);
743     
744     	related_to->expected.tuple = *tuple;
745     	related_to->expected.mask = *mask;
746     	related_to->expected.expectfn = expectfn;
747     
748     	if (LIST_FIND(&expect_list, expect_clash,
749     		      struct ip_conntrack_expect *, &related_to->expected)) {
750     		WRITE_UNLOCK(&ip_conntrack_lock);
751     		return -EBUSY;
752     	}
753     
754     	list_prepend(&expect_list, &related_to->expected);
755     	related_to->expected.expectant = related_to;
756     	WRITE_UNLOCK(&ip_conntrack_lock);
757     
758     	return 0;
759     }
760     
761     void ip_conntrack_unexpect_related(struct ip_conntrack *related_to)
762     {
763     	WRITE_LOCK(&ip_conntrack_lock);
764     	unexpect_related(related_to);
765     	WRITE_UNLOCK(&ip_conntrack_lock);
766     }
767     	
768     /* Alter reply tuple (maybe alter helper).  If it's already taken,
769        return 0 and don't do alteration. */
770     int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
771     			     const struct ip_conntrack_tuple *newreply)
772     {
773     	WRITE_LOCK(&ip_conntrack_lock);
774     	if (__ip_conntrack_find(newreply, conntrack)) {
775     		WRITE_UNLOCK(&ip_conntrack_lock);
776     		return 0;
777     	}
778     	/* Should be unconfirmed, so not in hash table yet */
779     	IP_NF_ASSERT(!is_confirmed(conntrack));
780     
781     	DEBUGP("Altering reply tuple of %p to ", conntrack);
782     	DUMP_TUPLE(newreply);
783     
784     	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
785     	conntrack->helper = LIST_FIND(&helpers, helper_cmp,
786     				      struct ip_conntrack_helper *,
787     				      newreply);
788     	WRITE_UNLOCK(&ip_conntrack_lock);
789     	return 1;
790     }
791     
792     int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
793     {
794     	MOD_INC_USE_COUNT;
795     
796     	WRITE_LOCK(&ip_conntrack_lock);
797     	list_prepend(&helpers, me);
798     	WRITE_UNLOCK(&ip_conntrack_lock);
799     
800     	return 0;
801     }
802     
803     static inline int unhelp(struct ip_conntrack_tuple_hash *i,
804     			 const struct ip_conntrack_helper *me)
805     {
806     	if (i->ctrack->helper == me) {
807     		i->ctrack->helper = NULL;
808     		/* Get rid of any expected. */
809     		if (i->ctrack->expected.expectant) {
810     			IP_NF_ASSERT(i->ctrack->expected.expectant
811     				     == i->ctrack);
812     			LIST_DELETE(&expect_list, &i->ctrack->expected);
813     			i->ctrack->expected.expectant = NULL;
814     		}
815     	}
816     	return 0;
817     }
818     
819     void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
820     {
821     	unsigned int i;
822     
823     	/* Need write lock here, to delete helper. */
824     	WRITE_LOCK(&ip_conntrack_lock);
825     	LIST_DELETE(&helpers, me);
826     
827     	/* Get rid of expecteds, set helpers to NULL. */
828     	for (i = 0; i < ip_conntrack_htable_size; i++)
829     		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
830     			    struct ip_conntrack_tuple_hash *, me);
831     	WRITE_UNLOCK(&ip_conntrack_lock);
832     
833     	/* Someone could be still looking at the helper in a bh. */
834     	br_write_lock_bh(BR_NETPROTO_LOCK);
835     	br_write_unlock_bh(BR_NETPROTO_LOCK);
836     
837     	MOD_DEC_USE_COUNT;
838     }
839     
840     /* Refresh conntrack for this many jiffies. */
841     void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
842     {
843     	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
844     
845     	WRITE_LOCK(&ip_conntrack_lock);
846     	/* If not in hash table, timer will not be active yet */
847     	if (!is_confirmed(ct))
848     		ct->timeout.expires = extra_jiffies;
849     	else {
850     		/* Need del_timer for race avoidance (may already be dying). */
851     		if (del_timer(&ct->timeout)) {
852     			ct->timeout.expires = jiffies + extra_jiffies;
853     			add_timer(&ct->timeout);
854     		}
855     	}
856     	WRITE_UNLOCK(&ip_conntrack_lock);
857     }
858     
859     /* Returns new sk_buff, or NULL */
860     struct sk_buff *
861     ip_ct_gather_frags(struct sk_buff *skb)
862     {
863     	struct sock *sk = skb->sk;
864     #ifdef CONFIG_NETFILTER_DEBUG
865     	unsigned int olddebug = skb->nf_debug;
866     #endif
867     	if (sk) {
868     		sock_hold(sk);
869     		skb_orphan(skb);
870     	}
871     
872     	local_bh_disable(); 
873     	skb = ip_defrag(skb);
874     	local_bh_enable();
875     
876     	if (!skb) {
877     		if (sk) sock_put(sk);
878     		return skb;
879     	} else if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
880     		kfree_skb(skb);
881     		if (sk) sock_put(sk);
882     		return NULL;
883     	}
884     
885     	if (sk) {
886     		skb_set_owner_w(skb, sk);
887     		sock_put(sk);
888     	}
889     
890     	ip_send_check(skb->nh.iph);
891     	skb->nfcache |= NFC_ALTERED;
892     #ifdef CONFIG_NETFILTER_DEBUG
893     	/* Packet path as if nothing had happened. */
894     	skb->nf_debug = olddebug;
895     #endif
896     	return skb;
897     }
898     
899     /* Used by ipt_REJECT. */
900     static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
901     {
902     	struct ip_conntrack *ct;
903     	enum ip_conntrack_info ctinfo;
904     
905     	ct = __ip_conntrack_get(nfct, &ctinfo);
906     
907     	/* This ICMP is in reverse direction to the packet which
908                caused it */
909     	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
910     		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
911     	else
912     		ctinfo = IP_CT_RELATED;
913     
914     	/* Attach new skbuff, and increment count */
915     	nskb->nfct = &ct->infos[ctinfo];
916     	atomic_inc(&ct->ct_general.use);
917     }
918     
919     static inline int
920     do_kill(const struct ip_conntrack_tuple_hash *i,
921     	int (*kill)(const struct ip_conntrack *i, void *data),
922     	void *data)
923     {
924     	return kill(i->ctrack, data);
925     }
926     
927     /* Bring out ya dead! */
928     static struct ip_conntrack_tuple_hash *
929     get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
930     		void *data)
931     {
932     	struct ip_conntrack_tuple_hash *h = NULL;
933     	unsigned int i;
934     
935     	READ_LOCK(&ip_conntrack_lock);
936     	for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
937     		h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
938     			      struct ip_conntrack_tuple_hash *, kill, data);
939     	}
940     	if (h)
941     		atomic_inc(&h->ctrack->ct_general.use);
942     	READ_UNLOCK(&ip_conntrack_lock);
943     
944     	return h;
945     }
946     
947     void
948     ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
949     			void *data)
950     {
951     	struct ip_conntrack_tuple_hash *h;
952     
953     	/* This is order n^2, by the way. */
954     	while ((h = get_next_corpse(kill, data)) != NULL) {
955     		/* Time to push up daises... */
956     		if (del_timer(&h->ctrack->timeout))
957     			death_by_timeout((unsigned long)h->ctrack);
958     		/* ... else the timer will get him soon. */
959     
960     		ip_conntrack_put(h->ctrack);
961     	}
962     }
963     
964     /* Fast function for those who don't want to parse /proc (and I don't
965        blame them). */
966     /* Reversing the socket's dst/src point of view gives us the reply
967        mapping. */
968     static int
969     getorigdst(struct sock *sk, int optval, void *user, int *len)
970     {
971     	struct ip_conntrack_tuple_hash *h;
972     	struct ip_conntrack_tuple tuple = { { sk->rcv_saddr, { sk->sport } },
973     					    { sk->daddr, { sk->dport },
974     					      IPPROTO_TCP } };
975     
976     	/* We only do TCP at the moment: is there a better way? */
977     	if (strcmp(sk->prot->name, "TCP") != 0) {
978     		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
979     		return -ENOPROTOOPT;
980     	}
981     
982     	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
983     		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
984     		       *len, sizeof(struct sockaddr_in));
985     		return -EINVAL;
986     	}
987     
988     	h = ip_conntrack_find_get(&tuple, NULL);
989     	if (h) {
990     		struct sockaddr_in sin;
991     
992     		sin.sin_family = AF_INET;
993     		sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
994     			.tuple.dst.u.tcp.port;
995     		sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
996     			.tuple.dst.ip;
997     
998     		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
999     		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1000     		ip_conntrack_put(h->ctrack);
1001     		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1002     			return -EFAULT;
1003     		else
1004     			return 0;
1005     	}
1006     	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1007     	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1008     	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1009     	return -ENOENT;
1010     }
1011     
1012     static struct nf_sockopt_ops so_getorigdst
1013     = { { NULL, NULL }, PF_INET,
1014         0, 0, NULL, /* Setsockopts */
1015         SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
1016         0, NULL };
1017     
/* Numeric id and name of the sysctl entry that exposes ip_conntrack_max. */
#define NET_IP_CONNTRACK_MAX 2089
#define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max"

#ifdef CONFIG_SYSCTL
/* Handle returned by register_sysctl_table(); needed to unregister. */
static struct ctl_table_header *ip_conntrack_sysctl_header;

/* Leaf level: the ip_conntrack_max integer, mode 0644, read and written
   through proc_dointvec. */
static ctl_table ip_conntrack_table[] = {
	{
		NET_IP_CONNTRACK_MAX,
		NET_IP_CONNTRACK_MAX_NAME,
		&ip_conntrack_max,
		sizeof(ip_conntrack_max),
		0644,
		NULL,
		proc_dointvec
	},
	{ 0 }
};

/* "ipv4" directory holding the leaf table above. */
static ctl_table ip_conntrack_dir_table[] = {
	{ NET_IPV4, "ipv4", NULL, 0, 0555, ip_conntrack_table, 0, 0, 0, 0, 0 },
	{ 0 }
};

/* "net" directory at the root; this is the table that gets registered. */
static ctl_table ip_conntrack_root_table[] = {
	{ CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0 },
	{ 0 }
};
#endif /*CONFIG_SYSCTL*/
1040     
/* Selection predicate for ip_ct_selective_cleanup() that picks every
   connection: both arguments are ignored and the answer is always 1. */
static int kill_all(const struct ip_conntrack *i, void *data)
{
	(void)i;
	(void)data;

	return 1;
}
1045     
1046     /* Mishearing the voices in his head, our hero wonders how he's
1047        supposed to kill the mall. */
1048     void ip_conntrack_cleanup(void)
1049     {
1050     #ifdef CONFIG_SYSCTL
1051     	unregister_sysctl_table(ip_conntrack_sysctl_header);
1052     #endif
1053     	ip_ct_attach = NULL;
1054     	/* This makes sure all current packets have passed through
1055                netfilter framework.  Roll on, two-stage module
1056                delete... */
1057     	br_write_lock_bh(BR_NETPROTO_LOCK);
1058     	br_write_unlock_bh(BR_NETPROTO_LOCK);
1059      
1060      i_see_dead_people:
1061     	ip_ct_selective_cleanup(kill_all, NULL);
1062     	if (atomic_read(&ip_conntrack_count) != 0) {
1063     		schedule();
1064     		goto i_see_dead_people;
1065     	}
1066     
1067     	kmem_cache_destroy(ip_conntrack_cachep);
1068     	vfree(ip_conntrack_hash);
1069     	nf_unregister_sockopt(&so_getorigdst);
1070     }
1071     
1072     static int hashsize = 0;
1073     MODULE_PARM(hashsize, "i");
1074     
1075     int __init ip_conntrack_init(void)
1076     {
1077     	unsigned int i;
1078     	int ret;
1079     
1080     	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1081     	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1082      	if (hashsize) {
1083      		ip_conntrack_htable_size = hashsize;
1084      	} else {
1085     		ip_conntrack_htable_size
1086     			= (((num_physpages << PAGE_SHIFT) / 16384)
1087     			   / sizeof(struct list_head));
1088     		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1089     			ip_conntrack_htable_size = 8192;
1090     		if (ip_conntrack_htable_size < 16)
1091     			ip_conntrack_htable_size = 16;
1092     	}
1093     	ip_conntrack_max = 8 * ip_conntrack_htable_size;
1094     
1095     	printk("ip_conntrack (%u buckets, %d max)\n",
1096     	       ip_conntrack_htable_size, ip_conntrack_max);
1097     
1098     	ret = nf_register_sockopt(&so_getorigdst);
1099     	if (ret != 0)
1100     		return ret;
1101     
1102     	ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1103     				    * ip_conntrack_htable_size);
1104     	if (!ip_conntrack_hash) {
1105     		nf_unregister_sockopt(&so_getorigdst);
1106     		return -ENOMEM;
1107     	}
1108     
1109     	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1110     	                                        sizeof(struct ip_conntrack), 0,
1111     	                                        SLAB_HWCACHE_ALIGN, NULL, NULL);
1112     	if (!ip_conntrack_cachep) {
1113     		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1114     		vfree(ip_conntrack_hash);
1115     		nf_unregister_sockopt(&so_getorigdst);
1116     		return -ENOMEM;
1117     	}
1118     	
1119     	/* Don't NEED lock here, but good form anyway. */
1120     	WRITE_LOCK(&ip_conntrack_lock);
1121     	/* Sew in builtin protocols. */
1122     	list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1123     	list_append(&protocol_list, &ip_conntrack_protocol_udp);
1124     	list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1125     	WRITE_UNLOCK(&ip_conntrack_lock);
1126     
1127     	for (i = 0; i < ip_conntrack_htable_size; i++)
1128     		INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1129     
1130     /* This is fucking braindead.  There is NO WAY of doing this without
1131        the CONFIG_SYSCTL unless you don't want to detect errors.
1132        Grrr... --RR */
1133     #ifdef CONFIG_SYSCTL
1134     	ip_conntrack_sysctl_header
1135     		= register_sysctl_table(ip_conntrack_root_table, 0);
1136     	if (ip_conntrack_sysctl_header == NULL) {
1137     		kmem_cache_destroy(ip_conntrack_cachep);
1138     		vfree(ip_conntrack_hash);
1139     		nf_unregister_sockopt(&so_getorigdst);
1140     		return -ENOMEM;
1141     	}
1142     #endif /*CONFIG_SYSCTL*/
1143     
1144     	/* For use by ipt_REJECT */
1145     	ip_ct_attach = ip_conntrack_attach;
1146     	return ret;
1147     }
1148