File: /usr/src/linux/net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
6 Public Licence. */
7
8 #ifdef MODULE
9 #define __NO_VERSION__
10 #endif
11 #include <linux/version.h>
12 #include <linux/config.h>
13 #include <linux/types.h>
14 #include <linux/ip.h>
15 #include <linux/netfilter.h>
16 #include <linux/netfilter_ipv4.h>
17 #include <linux/module.h>
18 #include <linux/skbuff.h>
19 #include <linux/proc_fs.h>
20 #include <linux/vmalloc.h>
21 #include <linux/brlock.h>
22 #include <net/checksum.h>
23 #include <linux/stddef.h>
24 #include <linux/sysctl.h>
25 #include <linux/slab.h>
26 /* For ERR_PTR(). Yeah, I know... --RR */
27 #include <linux/fs.h>
28
/* ip_conntrack_lock guards the main hash table, the protocol, helper
   and expectation registrations, and the conntrack timers.  Both
   assertion macros ignore their argument and always check that lock.
   NOTE(review): presumably consumed by listhelp.h, which is included
   below -- confirm. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
33
34 #include <linux/netfilter_ipv4/ip_conntrack.h>
35 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
36 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
37 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
38 #include <linux/netfilter_ipv4/listhelp.h>
39
40 #if 0
41 #define DEBUGP printk
42 #else
43 #define DEBUGP(format, args...)
44 #endif
45
46 DECLARE_RWLOCK(ip_conntrack_lock);
47
48 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
49 LIST_HEAD(expect_list);
50 LIST_HEAD(protocol_list);
51 static LIST_HEAD(helpers);
52 unsigned int ip_conntrack_htable_size = 0;
53 static int ip_conntrack_max = 0;
54 static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
55 struct list_head *ip_conntrack_hash;
56 static kmem_cache_t *ip_conntrack_cachep;
57
58 extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
59
60 static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
61 u_int8_t protocol)
62 {
63 return protocol == curr->proto;
64 }
65
66 struct ip_conntrack_protocol *__find_proto(u_int8_t protocol)
67 {
68 struct ip_conntrack_protocol *p;
69
70 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
71 p = LIST_FIND(&protocol_list, proto_cmpfn,
72 struct ip_conntrack_protocol *, protocol);
73 if (!p)
74 p = &ip_conntrack_generic_protocol;
75
76 return p;
77 }
78
79 struct ip_conntrack_protocol *find_proto(u_int8_t protocol)
80 {
81 struct ip_conntrack_protocol *p;
82
83 READ_LOCK(&ip_conntrack_lock);
84 p = __find_proto(protocol);
85 READ_UNLOCK(&ip_conntrack_lock);
86 return p;
87 }
88
89 static inline void ip_conntrack_put(struct ip_conntrack *ct)
90 {
91 IP_NF_ASSERT(ct);
92 IP_NF_ASSERT(ct->infos[0].master);
93 /* nf_conntrack_put wants to go via an info struct, so feed it
94 one at random. */
95 nf_conntrack_put(&ct->infos[0]);
96 }
97
98 static inline u_int32_t
99 hash_conntrack(const struct ip_conntrack_tuple *tuple)
100 {
101 #if 0
102 dump_tuple(tuple);
103 #endif
104 /* ntohl because more differences in low bits. */
105 /* To ensure that halves of the same connection don't hash
106 clash, we add the source per-proto again. */
107 return (ntohl(tuple->src.ip + tuple->dst.ip
108 + tuple->src.u.all + tuple->dst.u.all
109 + tuple->dst.protonum)
110 + ntohs(tuple->src.u.all))
111 % ip_conntrack_htable_size;
112 }
113
114 inline int
115 get_tuple(const struct iphdr *iph, size_t len,
116 struct ip_conntrack_tuple *tuple,
117 struct ip_conntrack_protocol *protocol)
118 {
119 int ret;
120
121 /* Never happen */
122 if (iph->frag_off & htons(IP_OFFSET)) {
123 printk("ip_conntrack_core: Frag of proto %u.\n",
124 iph->protocol);
125 return 0;
126 }
127 /* Guarantee 8 protocol bytes: if more wanted, use len param */
128 else if (iph->ihl * 4 + 8 > len)
129 return 0;
130
131 tuple->src.ip = iph->saddr;
132 tuple->dst.ip = iph->daddr;
133 tuple->dst.protonum = iph->protocol;
134
135 ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
136 len - 4*iph->ihl,
137 tuple);
138 return ret;
139 }
140
141 static int
142 invert_tuple(struct ip_conntrack_tuple *inverse,
143 const struct ip_conntrack_tuple *orig,
144 const struct ip_conntrack_protocol *protocol)
145 {
146 inverse->src.ip = orig->dst.ip;
147 inverse->dst.ip = orig->src.ip;
148 inverse->dst.protonum = orig->dst.protonum;
149
150 return protocol->invert_tuple(inverse, orig);
151 }
152
153 static void
154 clean_from_lists(struct ip_conntrack *ct)
155 {
156 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
157 /* Remove from both hash lists: must not NULL out next ptrs,
158 otherwise we'll look unconfirmed. Fortunately, LIST_DELETE
159 doesn't do this. --RR */
160 LIST_DELETE(&ip_conntrack_hash
161 [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)],
162 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
163 LIST_DELETE(&ip_conntrack_hash
164 [hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)],
165 &ct->tuplehash[IP_CT_DIR_REPLY]);
166 /* If our expected is in the list, take it out. */
167 if (ct->expected.expectant) {
168 IP_NF_ASSERT(list_inlist(&expect_list, &ct->expected));
169 IP_NF_ASSERT(ct->expected.expectant == ct);
170 LIST_DELETE(&expect_list, &ct->expected);
171 }
172 }
173
174 static void
175 destroy_conntrack(struct nf_conntrack *nfct)
176 {
177 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
178
179 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
180 IP_NF_ASSERT(!timer_pending(&ct->timeout));
181
182 if (ct->master.master)
183 nf_conntrack_put(&ct->master);
184
185 if (ip_conntrack_destroyed)
186 ip_conntrack_destroyed(ct);
187 kmem_cache_free(ip_conntrack_cachep, ct);
188 atomic_dec(&ip_conntrack_count);
189 }
190
191 static void death_by_timeout(unsigned long ul_conntrack)
192 {
193 struct ip_conntrack *ct = (void *)ul_conntrack;
194
195 WRITE_LOCK(&ip_conntrack_lock);
196 clean_from_lists(ct);
197 WRITE_UNLOCK(&ip_conntrack_lock);
198 ip_conntrack_put(ct);
199 }
200
201 static inline int
202 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
203 const struct ip_conntrack_tuple *tuple,
204 const struct ip_conntrack *ignored_conntrack)
205 {
206 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
207 return i->ctrack != ignored_conntrack
208 && ip_ct_tuple_equal(tuple, &i->tuple);
209 }
210
211 static struct ip_conntrack_tuple_hash *
212 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
213 const struct ip_conntrack *ignored_conntrack)
214 {
215 struct ip_conntrack_tuple_hash *h;
216
217 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
218 h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)],
219 conntrack_tuple_cmp,
220 struct ip_conntrack_tuple_hash *,
221 tuple, ignored_conntrack);
222 return h;
223 }
224
225 /* Find a connection corresponding to a tuple. */
226 struct ip_conntrack_tuple_hash *
227 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
228 const struct ip_conntrack *ignored_conntrack)
229 {
230 struct ip_conntrack_tuple_hash *h;
231
232 READ_LOCK(&ip_conntrack_lock);
233 h = __ip_conntrack_find(tuple, ignored_conntrack);
234 if (h)
235 atomic_inc(&h->ctrack->ct_general.use);
236 READ_UNLOCK(&ip_conntrack_lock);
237
238 return h;
239 }
240
241 static inline struct ip_conntrack *
242 __ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
243 {
244 struct ip_conntrack *ct
245 = (struct ip_conntrack *)nfct->master;
246
247 /* ctinfo is the index of the nfct inside the conntrack */
248 *ctinfo = nfct - ct->infos;
249 IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
250 return ct;
251 }
252
253 /* Return conntrack and conntrack_info given skb->nfct->master */
254 struct ip_conntrack *
255 ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
256 {
257 if (skb->nfct)
258 return __ip_conntrack_get(skb->nfct, ctinfo);
259 return NULL;
260 }
261
262 /* Confirm a connection given skb->nfct; places it in hash table */
263 int
264 __ip_conntrack_confirm(struct nf_ct_info *nfct)
265 {
266 unsigned int hash, repl_hash;
267 struct ip_conntrack *ct;
268 enum ip_conntrack_info ctinfo;
269
270 ct = __ip_conntrack_get(nfct, &ctinfo);
271
272 /* ipt_REJECT uses ip_conntrack_attach to attach related
273 ICMP/TCP RST packets in other direction. Actual packet
274 which created connection will be IP_CT_NEW or for an
275 expected connection, IP_CT_RELATED. */
276 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
277 return NF_ACCEPT;
278
279 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
280 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
281
282 /* We're not in hash table, and we refuse to set up related
283 connections for unconfirmed conns. But packet copies and
284 REJECT will give spurious warnings here. */
285 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
286
287 /* No external references means noone else could have
288 confirmed us. */
289 IP_NF_ASSERT(!is_confirmed(ct));
290 DEBUGP("Confirming conntrack %p\n", ct);
291
292 WRITE_LOCK(&ip_conntrack_lock);
293 /* See if there's one in the list already, including reverse:
294 NAT could have grabbed it without realizing, since we're
295 not in the hash. If there is, we lost race. */
296 if (!LIST_FIND(&ip_conntrack_hash[hash],
297 conntrack_tuple_cmp,
298 struct ip_conntrack_tuple_hash *,
299 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
300 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
301 conntrack_tuple_cmp,
302 struct ip_conntrack_tuple_hash *,
303 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
304 list_prepend(&ip_conntrack_hash[hash],
305 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
306 list_prepend(&ip_conntrack_hash[repl_hash],
307 &ct->tuplehash[IP_CT_DIR_REPLY]);
308 /* Timer relative to confirmation time, not original
309 setting time, otherwise we'd get timer wrap in
310 wierd delay cases. */
311 ct->timeout.expires += jiffies;
312 add_timer(&ct->timeout);
313 atomic_inc(&ct->ct_general.use);
314 WRITE_UNLOCK(&ip_conntrack_lock);
315 return NF_ACCEPT;
316 }
317
318 WRITE_UNLOCK(&ip_conntrack_lock);
319 return NF_DROP;
320 }
321
322 /* Returns true if a connection correspondings to the tuple (required
323 for NAT). */
324 int
325 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
326 const struct ip_conntrack *ignored_conntrack)
327 {
328 struct ip_conntrack_tuple_hash *h;
329
330 READ_LOCK(&ip_conntrack_lock);
331 h = __ip_conntrack_find(tuple, ignored_conntrack);
332 READ_UNLOCK(&ip_conntrack_lock);
333
334 return h != NULL;
335 }
336
337 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
338 struct ip_conntrack *
339 icmp_error_track(struct sk_buff *skb,
340 enum ip_conntrack_info *ctinfo,
341 unsigned int hooknum)
342 {
343 const struct iphdr *iph;
344 struct icmphdr *hdr;
345 struct ip_conntrack_tuple innertuple, origtuple;
346 struct iphdr *inner;
347 size_t datalen;
348 struct ip_conntrack_protocol *innerproto;
349 struct ip_conntrack_tuple_hash *h;
350
351 IP_NF_ASSERT(iph->protocol == IPPROTO_ICMP);
352 IP_NF_ASSERT(skb->nfct == NULL);
353
354 iph = skb->nh.iph;
355 hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
356 inner = (struct iphdr *)(hdr + 1);
357 datalen = skb->len - iph->ihl*4 - sizeof(*hdr);
358
359 if (skb->len < iph->ihl * 4 + sizeof(*hdr) + sizeof(*iph)) {
360 DEBUGP("icmp_error_track: too short\n");
361 return NULL;
362 }
363
364 if (hdr->type != ICMP_DEST_UNREACH
365 && hdr->type != ICMP_SOURCE_QUENCH
366 && hdr->type != ICMP_TIME_EXCEEDED
367 && hdr->type != ICMP_PARAMETERPROB
368 && hdr->type != ICMP_REDIRECT)
369 return NULL;
370
371 /* Ignore ICMP's containing fragments (shouldn't happen) */
372 if (inner->frag_off & htons(IP_OFFSET)) {
373 DEBUGP("icmp_error_track: fragment of proto %u\n",
374 inner->protocol);
375 return NULL;
376 }
377
378 /* Ignore it if the checksum's bogus. */
379 if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
380 DEBUGP("icmp_error_track: bad csum\n");
381 return NULL;
382 }
383
384 innerproto = find_proto(inner->protocol);
385 /* Are they talking about one of our connections? */
386 if (inner->ihl * 4 + 8 > datalen
387 || !get_tuple(inner, datalen, &origtuple, innerproto)) {
388 DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
389 inner->protocol, inner->ihl, 8,
390 datalen);
391 return NULL;
392 }
393
394 /* Ordinarily, we'd expect the inverted tupleproto, but it's
395 been preserved inside the ICMP. */
396 if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
397 DEBUGP("icmp_error_track: Can't invert tuple\n");
398 return NULL;
399 }
400
401 *ctinfo = IP_CT_RELATED;
402
403 h = ip_conntrack_find_get(&innertuple, NULL);
404 if (!h) {
405 /* Locally generated ICMPs will match inverted if they
406 haven't been SNAT'ed yet */
407 /* FIXME: NAT code has to handle half-done double NAT --RR */
408 if (hooknum == NF_IP_LOCAL_OUT)
409 h = ip_conntrack_find_get(&origtuple, NULL);
410
411 if (!h) {
412 DEBUGP("icmp_error_track: no match\n");
413 return NULL;
414 }
415 /* Reverse direction from that found */
416 if (DIRECTION(h) != IP_CT_DIR_REPLY)
417 *ctinfo += IP_CT_IS_REPLY;
418 } else {
419 if (DIRECTION(h) == IP_CT_DIR_REPLY)
420 *ctinfo += IP_CT_IS_REPLY;
421 }
422
423 /* Update skb to refer to this connection */
424 skb->nfct = &h->ctrack->infos[*ctinfo];
425 return h->ctrack;
426 }
427
428 /* There's a small race here where we may free a just-assured
429 connection. Too bad: we're in trouble anyway. */
430 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
431 {
432 return !(i->ctrack->status & IPS_ASSURED);
433 }
434
435 static int early_drop(struct list_head *chain)
436 {
437 /* Traverse backwards: gives us oldest, which is roughly LRU */
438 struct ip_conntrack_tuple_hash *h;
439 int dropped = 0;
440
441 READ_LOCK(&ip_conntrack_lock);
442 h = LIST_FIND(chain, unreplied, struct ip_conntrack_tuple_hash *);
443 if (h)
444 atomic_inc(&h->ctrack->ct_general.use);
445 READ_UNLOCK(&ip_conntrack_lock);
446
447 if (!h)
448 return dropped;
449
450 if (del_timer(&h->ctrack->timeout)) {
451 death_by_timeout((unsigned long)h->ctrack);
452 dropped = 1;
453 }
454 ip_conntrack_put(h->ctrack);
455 return dropped;
456 }
457
458 static inline int helper_cmp(const struct ip_conntrack_helper *i,
459 const struct ip_conntrack_tuple *rtuple)
460 {
461 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
462 }
463
464 /* Compare parts depending on mask. */
465 static inline int expect_cmp(const struct ip_conntrack_expect *i,
466 const struct ip_conntrack_tuple *tuple)
467 {
468 return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
469 }
470
471 /* Allocate a new conntrack: we return -ENOMEM if classification
472 failed due to stress. Otherwise it really is unclassifiable. */
473 static struct ip_conntrack_tuple_hash *
474 init_conntrack(const struct ip_conntrack_tuple *tuple,
475 struct ip_conntrack_protocol *protocol,
476 struct sk_buff *skb)
477 {
478 struct ip_conntrack *conntrack;
479 struct ip_conntrack_tuple repl_tuple;
480 size_t hash, repl_hash;
481 struct ip_conntrack_expect *expected;
482 int i;
483 static unsigned int drop_next = 0;
484
485 hash = hash_conntrack(tuple);
486
487 if (ip_conntrack_max &&
488 atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
489 /* Try dropping from random chain, or else from the
490 chain about to put into (in case they're trying to
491 bomb one hash chain). */
492 if (drop_next >= ip_conntrack_htable_size)
493 drop_next = 0;
494 if (!early_drop(&ip_conntrack_hash[drop_next++])
495 && !early_drop(&ip_conntrack_hash[hash])) {
496 if (net_ratelimit())
497 printk(KERN_WARNING
498 "ip_conntrack: table full, dropping"
499 " packet.\n");
500 return ERR_PTR(-ENOMEM);
501 }
502 }
503
504 if (!invert_tuple(&repl_tuple, tuple, protocol)) {
505 DEBUGP("Can't invert tuple.\n");
506 return NULL;
507 }
508 repl_hash = hash_conntrack(&repl_tuple);
509
510 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
511 if (!conntrack) {
512 DEBUGP("Can't allocate conntrack.\n");
513 return ERR_PTR(-ENOMEM);
514 }
515
516 memset(conntrack, 0, sizeof(struct ip_conntrack));
517 atomic_set(&conntrack->ct_general.use, 1);
518 conntrack->ct_general.destroy = destroy_conntrack;
519 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
520 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
521 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
522 conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
523 for (i=0; i < IP_CT_NUMBER; i++)
524 conntrack->infos[i].master = &conntrack->ct_general;
525
526 if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
527 kmem_cache_free(ip_conntrack_cachep, conntrack);
528 return NULL;
529 }
530 /* Don't set timer yet: wait for confirmation */
531 init_timer(&conntrack->timeout);
532 conntrack->timeout.data = (unsigned long)conntrack;
533 conntrack->timeout.function = death_by_timeout;
534
535 /* Mark clearly that it's not in the hash table. */
536 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL;
537
538 /* Write lock required for deletion of expected. Without
539 this, a read-lock would do. */
540 WRITE_LOCK(&ip_conntrack_lock);
541 conntrack->helper = LIST_FIND(&helpers, helper_cmp,
542 struct ip_conntrack_helper *,
543 &repl_tuple);
544 /* Need finding and deleting of expected ONLY if we win race */
545 expected = LIST_FIND(&expect_list, expect_cmp,
546 struct ip_conntrack_expect *, tuple);
547 /* If master is not in hash table yet (ie. packet hasn't left
548 this machine yet), how can other end know about expected?
549 Hence these are not the droids you are looking for (if
550 master ct never got confirmed, we'd hold a reference to it
551 and weird things would happen to future packets). */
552 if (expected && is_confirmed(expected->expectant)) {
553 /* Welcome, Mr. Bond. We've been expecting you... */
554 conntrack->status = IPS_EXPECTED;
555 conntrack->master.master = &expected->expectant->ct_general;
556 IP_NF_ASSERT(conntrack->master.master);
557 LIST_DELETE(&expect_list, expected);
558 expected->expectant = NULL;
559 nf_conntrack_get(&conntrack->master);
560 }
561 atomic_inc(&ip_conntrack_count);
562 WRITE_UNLOCK(&ip_conntrack_lock);
563
564 if (expected && expected->expectfn)
565 expected->expectfn(conntrack);
566 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
567 }
568
569 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
570 static inline struct ip_conntrack *
571 resolve_normal_ct(struct sk_buff *skb,
572 struct ip_conntrack_protocol *proto,
573 int *set_reply,
574 unsigned int hooknum,
575 enum ip_conntrack_info *ctinfo)
576 {
577 struct ip_conntrack_tuple tuple;
578 struct ip_conntrack_tuple_hash *h;
579
580 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
581
582 if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto))
583 return NULL;
584
585 /* look for tuple match */
586 h = ip_conntrack_find_get(&tuple, NULL);
587 if (!h) {
588 h = init_conntrack(&tuple, proto, skb);
589 if (!h)
590 return NULL;
591 if (IS_ERR(h))
592 return (void *)h;
593 }
594
595 /* It exists; we have (non-exclusive) reference. */
596 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
597 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
598 /* Please set reply bit if this packet OK */
599 *set_reply = 1;
600 } else {
601 /* Once we've had two way comms, always ESTABLISHED. */
602 if (h->ctrack->status & IPS_SEEN_REPLY) {
603 DEBUGP("ip_conntrack_in: normal packet for %p\n",
604 h->ctrack);
605 *ctinfo = IP_CT_ESTABLISHED;
606 } else if (h->ctrack->status & IPS_EXPECTED) {
607 DEBUGP("ip_conntrack_in: related packet for %p\n",
608 h->ctrack);
609 *ctinfo = IP_CT_RELATED;
610 } else {
611 DEBUGP("ip_conntrack_in: new packet for %p\n",
612 h->ctrack);
613 *ctinfo = IP_CT_NEW;
614 }
615 *set_reply = 0;
616 }
617 skb->nfct = &h->ctrack->infos[*ctinfo];
618 return h->ctrack;
619 }
620
621 /* Netfilter hook itself. */
622 unsigned int ip_conntrack_in(unsigned int hooknum,
623 struct sk_buff **pskb,
624 const struct net_device *in,
625 const struct net_device *out,
626 int (*okfn)(struct sk_buff *))
627 {
628 struct ip_conntrack *ct;
629 enum ip_conntrack_info ctinfo;
630 struct ip_conntrack_protocol *proto;
631 int set_reply;
632 int ret;
633
634 /* FIXME: Do this right please. --RR */
635 (*pskb)->nfcache |= NFC_UNKNOWN;
636
637 /* Doesn't cover locally-generated broadcast, so not worth it. */
638 #if 0
639 /* Ignore broadcast: no `connection'. */
640 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
641 printk("Broadcast packet!\n");
642 return NF_ACCEPT;
643 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
644 == htonl(0x000000FF)) {
645 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
646 NIPQUAD((*pskb)->nh.iph->saddr),
647 NIPQUAD((*pskb)->nh.iph->daddr),
648 (*pskb)->sk, (*pskb)->pkt_type);
649 }
650 #endif
651
652 /* Previously seen (loopback)? Ignore. Do this before
653 fragment check. */
654 if ((*pskb)->nfct)
655 return NF_ACCEPT;
656
657 /* Gather fragments. */
658 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
659 *pskb = ip_ct_gather_frags(*pskb);
660 if (!*pskb)
661 return NF_STOLEN;
662 }
663
664 proto = find_proto((*pskb)->nh.iph->protocol);
665
666 /* It may be an icmp error... */
667 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
668 && icmp_error_track(*pskb, &ctinfo, hooknum))
669 return NF_ACCEPT;
670
671 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
672 /* Not valid part of a connection */
673 return NF_ACCEPT;
674
675 if (IS_ERR(ct))
676 /* Too stressed to deal. */
677 return NF_DROP;
678
679 IP_NF_ASSERT((*pskb)->nfct);
680
681 ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
682 if (ret == -1) {
683 /* Invalid */
684 nf_conntrack_put((*pskb)->nfct);
685 (*pskb)->nfct = NULL;
686 return NF_ACCEPT;
687 }
688
689 if (ret != NF_DROP && ct->helper) {
690 ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
691 ct, ctinfo);
692 if (ret == -1) {
693 /* Invalid */
694 nf_conntrack_put((*pskb)->nfct);
695 (*pskb)->nfct = NULL;
696 return NF_ACCEPT;
697 }
698 }
699 if (set_reply)
700 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
701
702 return ret;
703 }
704
705 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
706 const struct ip_conntrack_tuple *orig)
707 {
708 return invert_tuple(inverse, orig, find_proto(orig->dst.protonum));
709 }
710
711 static void unexpect_related(struct ip_conntrack *related_to)
712 {
713 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
714 list_del(&related_to->expected.list);
715 related_to->expected.expectant = NULL;
716 }
717
718 /* Would two expected things clash? */
719 static inline int expect_clash(const struct ip_conntrack_expect *i,
720 const struct ip_conntrack_expect *new)
721 {
722 /* Part covered by intersection of masks must be unequal,
723 otherwise they clash */
724 struct ip_conntrack_tuple intersect_mask
725 = { { i->mask.src.ip & new->mask.src.ip,
726 { i->mask.src.u.all & new->mask.src.u.all } },
727 { i->mask.dst.ip & new->mask.dst.ip,
728 { i->mask.dst.u.all & new->mask.dst.u.all },
729 i->mask.dst.protonum & new->mask.dst.protonum } };
730
731 return ip_ct_tuple_mask_cmp(&i->tuple, &new->tuple, &intersect_mask);
732 }
733
734 /* Add a related connection. */
735 int ip_conntrack_expect_related(struct ip_conntrack *related_to,
736 const struct ip_conntrack_tuple *tuple,
737 const struct ip_conntrack_tuple *mask,
738 int (*expectfn)(struct ip_conntrack *))
739 {
740 WRITE_LOCK(&ip_conntrack_lock);
741 if (related_to->expected.expectant)
742 unexpect_related(related_to);
743
744 related_to->expected.tuple = *tuple;
745 related_to->expected.mask = *mask;
746 related_to->expected.expectfn = expectfn;
747
748 if (LIST_FIND(&expect_list, expect_clash,
749 struct ip_conntrack_expect *, &related_to->expected)) {
750 WRITE_UNLOCK(&ip_conntrack_lock);
751 return -EBUSY;
752 }
753
754 list_prepend(&expect_list, &related_to->expected);
755 related_to->expected.expectant = related_to;
756 WRITE_UNLOCK(&ip_conntrack_lock);
757
758 return 0;
759 }
760
761 void ip_conntrack_unexpect_related(struct ip_conntrack *related_to)
762 {
763 WRITE_LOCK(&ip_conntrack_lock);
764 unexpect_related(related_to);
765 WRITE_UNLOCK(&ip_conntrack_lock);
766 }
767
768 /* Alter reply tuple (maybe alter helper). If it's already taken,
769 return 0 and don't do alteration. */
770 int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
771 const struct ip_conntrack_tuple *newreply)
772 {
773 WRITE_LOCK(&ip_conntrack_lock);
774 if (__ip_conntrack_find(newreply, conntrack)) {
775 WRITE_UNLOCK(&ip_conntrack_lock);
776 return 0;
777 }
778 /* Should be unconfirmed, so not in hash table yet */
779 IP_NF_ASSERT(!is_confirmed(conntrack));
780
781 DEBUGP("Altering reply tuple of %p to ", conntrack);
782 DUMP_TUPLE(newreply);
783
784 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
785 conntrack->helper = LIST_FIND(&helpers, helper_cmp,
786 struct ip_conntrack_helper *,
787 newreply);
788 WRITE_UNLOCK(&ip_conntrack_lock);
789 return 1;
790 }
791
792 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
793 {
794 MOD_INC_USE_COUNT;
795
796 WRITE_LOCK(&ip_conntrack_lock);
797 list_prepend(&helpers, me);
798 WRITE_UNLOCK(&ip_conntrack_lock);
799
800 return 0;
801 }
802
803 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
804 const struct ip_conntrack_helper *me)
805 {
806 if (i->ctrack->helper == me) {
807 i->ctrack->helper = NULL;
808 /* Get rid of any expected. */
809 if (i->ctrack->expected.expectant) {
810 IP_NF_ASSERT(i->ctrack->expected.expectant
811 == i->ctrack);
812 LIST_DELETE(&expect_list, &i->ctrack->expected);
813 i->ctrack->expected.expectant = NULL;
814 }
815 }
816 return 0;
817 }
818
819 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
820 {
821 unsigned int i;
822
823 /* Need write lock here, to delete helper. */
824 WRITE_LOCK(&ip_conntrack_lock);
825 LIST_DELETE(&helpers, me);
826
827 /* Get rid of expecteds, set helpers to NULL. */
828 for (i = 0; i < ip_conntrack_htable_size; i++)
829 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
830 struct ip_conntrack_tuple_hash *, me);
831 WRITE_UNLOCK(&ip_conntrack_lock);
832
833 /* Someone could be still looking at the helper in a bh. */
834 br_write_lock_bh(BR_NETPROTO_LOCK);
835 br_write_unlock_bh(BR_NETPROTO_LOCK);
836
837 MOD_DEC_USE_COUNT;
838 }
839
840 /* Refresh conntrack for this many jiffies. */
841 void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
842 {
843 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
844
845 WRITE_LOCK(&ip_conntrack_lock);
846 /* If not in hash table, timer will not be active yet */
847 if (!is_confirmed(ct))
848 ct->timeout.expires = extra_jiffies;
849 else {
850 /* Need del_timer for race avoidance (may already be dying). */
851 if (del_timer(&ct->timeout)) {
852 ct->timeout.expires = jiffies + extra_jiffies;
853 add_timer(&ct->timeout);
854 }
855 }
856 WRITE_UNLOCK(&ip_conntrack_lock);
857 }
858
859 /* Returns new sk_buff, or NULL */
860 struct sk_buff *
861 ip_ct_gather_frags(struct sk_buff *skb)
862 {
863 struct sock *sk = skb->sk;
864 #ifdef CONFIG_NETFILTER_DEBUG
865 unsigned int olddebug = skb->nf_debug;
866 #endif
867 if (sk) {
868 sock_hold(sk);
869 skb_orphan(skb);
870 }
871
872 local_bh_disable();
873 skb = ip_defrag(skb);
874 local_bh_enable();
875
876 if (!skb) {
877 if (sk) sock_put(sk);
878 return skb;
879 } else if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
880 kfree_skb(skb);
881 if (sk) sock_put(sk);
882 return NULL;
883 }
884
885 if (sk) {
886 skb_set_owner_w(skb, sk);
887 sock_put(sk);
888 }
889
890 ip_send_check(skb->nh.iph);
891 skb->nfcache |= NFC_ALTERED;
892 #ifdef CONFIG_NETFILTER_DEBUG
893 /* Packet path as if nothing had happened. */
894 skb->nf_debug = olddebug;
895 #endif
896 return skb;
897 }
898
899 /* Used by ipt_REJECT. */
900 static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
901 {
902 struct ip_conntrack *ct;
903 enum ip_conntrack_info ctinfo;
904
905 ct = __ip_conntrack_get(nfct, &ctinfo);
906
907 /* This ICMP is in reverse direction to the packet which
908 caused it */
909 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
910 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
911 else
912 ctinfo = IP_CT_RELATED;
913
914 /* Attach new skbuff, and increment count */
915 nskb->nfct = &ct->infos[ctinfo];
916 atomic_inc(&ct->ct_general.use);
917 }
918
919 static inline int
920 do_kill(const struct ip_conntrack_tuple_hash *i,
921 int (*kill)(const struct ip_conntrack *i, void *data),
922 void *data)
923 {
924 return kill(i->ctrack, data);
925 }
926
927 /* Bring out ya dead! */
928 static struct ip_conntrack_tuple_hash *
929 get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
930 void *data)
931 {
932 struct ip_conntrack_tuple_hash *h = NULL;
933 unsigned int i;
934
935 READ_LOCK(&ip_conntrack_lock);
936 for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
937 h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
938 struct ip_conntrack_tuple_hash *, kill, data);
939 }
940 if (h)
941 atomic_inc(&h->ctrack->ct_general.use);
942 READ_UNLOCK(&ip_conntrack_lock);
943
944 return h;
945 }
946
947 void
948 ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
949 void *data)
950 {
951 struct ip_conntrack_tuple_hash *h;
952
953 /* This is order n^2, by the way. */
954 while ((h = get_next_corpse(kill, data)) != NULL) {
955 /* Time to push up daises... */
956 if (del_timer(&h->ctrack->timeout))
957 death_by_timeout((unsigned long)h->ctrack);
958 /* ... else the timer will get him soon. */
959
960 ip_conntrack_put(h->ctrack);
961 }
962 }
963
964 /* Fast function for those who don't want to parse /proc (and I don't
965 blame them). */
966 /* Reversing the socket's dst/src point of view gives us the reply
967 mapping. */
968 static int
969 getorigdst(struct sock *sk, int optval, void *user, int *len)
970 {
971 struct ip_conntrack_tuple_hash *h;
972 struct ip_conntrack_tuple tuple = { { sk->rcv_saddr, { sk->sport } },
973 { sk->daddr, { sk->dport },
974 IPPROTO_TCP } };
975
976 /* We only do TCP at the moment: is there a better way? */
977 if (strcmp(sk->prot->name, "TCP") != 0) {
978 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
979 return -ENOPROTOOPT;
980 }
981
982 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
983 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
984 *len, sizeof(struct sockaddr_in));
985 return -EINVAL;
986 }
987
988 h = ip_conntrack_find_get(&tuple, NULL);
989 if (h) {
990 struct sockaddr_in sin;
991
992 sin.sin_family = AF_INET;
993 sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
994 .tuple.dst.u.tcp.port;
995 sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
996 .tuple.dst.ip;
997
998 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
999 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1000 ip_conntrack_put(h->ctrack);
1001 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1002 return -EFAULT;
1003 else
1004 return 0;
1005 }
1006 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1007 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1008 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1009 return -ENOENT;
1010 }
1011
1012 static struct nf_sockopt_ops so_getorigdst
1013 = { { NULL, NULL }, PF_INET,
1014 0, 0, NULL, /* Setsockopts */
1015 SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
1016 0, NULL };
1017
/* Sysctl id and name under which ip_conntrack_max is exported. */
#define NET_IP_CONNTRACK_MAX 2089
#define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max"

#ifdef CONFIG_SYSCTL
/* Handle returned by register_sysctl_table(); needed to unregister. */
static struct ctl_table_header *ip_conntrack_sysctl_header;

/* Leaf: ip_conntrack_max as an integer, mode 0644, handled by
   proc_dointvec. */
static ctl_table ip_conntrack_table[] = {
	{ NET_IP_CONNTRACK_MAX, NET_IP_CONNTRACK_MAX_NAME, &ip_conntrack_max,
	  sizeof(ip_conntrack_max), 0644, NULL, proc_dointvec },
	{ 0 }
};

/* "ipv4" directory holding the leaf table. */
static ctl_table ip_conntrack_dir_table[] = {
	{NET_IPV4, "ipv4", NULL, 0, 0555, ip_conntrack_table, 0, 0, 0, 0, 0},
	{ 0 }
};

/* "net" root directory holding the "ipv4" directory. */
static ctl_table ip_conntrack_root_table[] = {
	{CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0},
	{ 0 }
};
#endif /*CONFIG_SYSCTL*/
1040
/* Selector for ip_ct_selective_cleanup() that matches every
   connection; both arguments are ignored. */
static int kill_all(const struct ip_conntrack *i, void *data)
{
	(void)i;
	(void)data;

	return 1;
}
1045
1046 /* Mishearing the voices in his head, our hero wonders how he's
1047 supposed to kill the mall. */
1048 void ip_conntrack_cleanup(void)
1049 {
1050 #ifdef CONFIG_SYSCTL
1051 unregister_sysctl_table(ip_conntrack_sysctl_header);
1052 #endif
1053 ip_ct_attach = NULL;
1054 /* This makes sure all current packets have passed through
1055 netfilter framework. Roll on, two-stage module
1056 delete... */
1057 br_write_lock_bh(BR_NETPROTO_LOCK);
1058 br_write_unlock_bh(BR_NETPROTO_LOCK);
1059
1060 i_see_dead_people:
1061 ip_ct_selective_cleanup(kill_all, NULL);
1062 if (atomic_read(&ip_conntrack_count) != 0) {
1063 schedule();
1064 goto i_see_dead_people;
1065 }
1066
1067 kmem_cache_destroy(ip_conntrack_cachep);
1068 vfree(ip_conntrack_hash);
1069 nf_unregister_sockopt(&so_getorigdst);
1070 }
1071
1072 static int hashsize = 0;
1073 MODULE_PARM(hashsize, "i");
1074
1075 int __init ip_conntrack_init(void)
1076 {
1077 unsigned int i;
1078 int ret;
1079
1080 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1081 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1082 if (hashsize) {
1083 ip_conntrack_htable_size = hashsize;
1084 } else {
1085 ip_conntrack_htable_size
1086 = (((num_physpages << PAGE_SHIFT) / 16384)
1087 / sizeof(struct list_head));
1088 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1089 ip_conntrack_htable_size = 8192;
1090 if (ip_conntrack_htable_size < 16)
1091 ip_conntrack_htable_size = 16;
1092 }
1093 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1094
1095 printk("ip_conntrack (%u buckets, %d max)\n",
1096 ip_conntrack_htable_size, ip_conntrack_max);
1097
1098 ret = nf_register_sockopt(&so_getorigdst);
1099 if (ret != 0)
1100 return ret;
1101
1102 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1103 * ip_conntrack_htable_size);
1104 if (!ip_conntrack_hash) {
1105 nf_unregister_sockopt(&so_getorigdst);
1106 return -ENOMEM;
1107 }
1108
1109 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1110 sizeof(struct ip_conntrack), 0,
1111 SLAB_HWCACHE_ALIGN, NULL, NULL);
1112 if (!ip_conntrack_cachep) {
1113 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1114 vfree(ip_conntrack_hash);
1115 nf_unregister_sockopt(&so_getorigdst);
1116 return -ENOMEM;
1117 }
1118
1119 /* Don't NEED lock here, but good form anyway. */
1120 WRITE_LOCK(&ip_conntrack_lock);
1121 /* Sew in builtin protocols. */
1122 list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1123 list_append(&protocol_list, &ip_conntrack_protocol_udp);
1124 list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1125 WRITE_UNLOCK(&ip_conntrack_lock);
1126
1127 for (i = 0; i < ip_conntrack_htable_size; i++)
1128 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1129
1130 /* This is fucking braindead. There is NO WAY of doing this without
1131 the CONFIG_SYSCTL unless you don't want to detect errors.
1132 Grrr... --RR */
1133 #ifdef CONFIG_SYSCTL
1134 ip_conntrack_sysctl_header
1135 = register_sysctl_table(ip_conntrack_root_table, 0);
1136 if (ip_conntrack_sysctl_header == NULL) {
1137 kmem_cache_destroy(ip_conntrack_cachep);
1138 vfree(ip_conntrack_hash);
1139 nf_unregister_sockopt(&so_getorigdst);
1140 return -ENOMEM;
1141 }
1142 #endif /*CONFIG_SYSCTL*/
1143
1144 /* For use by ipt_REJECT */
1145 ip_ct_attach = ip_conntrack_attach;
1146 return ret;
1147 }
1148