File: /usr/src/linux/net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.230 2001/09/01 00:31:50 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26 /*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 */
49
50 #include <linux/config.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/random.h>
54 #include <linux/cache.h>
55 #include <linux/init.h>
56
57 #include <net/icmp.h>
58 #include <net/tcp.h>
59 #include <net/ipv6.h>
60 #include <net/inet_common.h>
61
62 #include <linux/inet.h>
63 #include <linux/stddef.h>
64 #include <linux/ipsec.h>
65
/* Only declared here; no definition appears in this part of the file.
 * Presumably the ip_dynaddr sysctl (cf. the "ip_dynaddr bits" changelog
 * entry) -- TODO confirm.
 */
66 extern int sysctl_ip_dynaddr;
67
68 /* Check TCP sequence numbers in ICMP packets. */
/* 8 bytes of quoted TCP header cover the two ports and the sequence
 * number, which is what tcp_v4_err() reads from an ICMP error.
 */
69 #define ICMP_MIN_LENGTH 8
70
71 /* Socket used for sending RSTs */
/* tcp_socket points at the socket embedded in tcp_inode.  Its ->sk is
 * handed to ip_send_reply() by tcp_v4_send_reset() and tcp_v4_send_ack(),
 * i.e. for replies that are built without any connection's own socket.
 */
72 static struct inode tcp_inode;
73 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
74
/* Forward declaration; the definition appears further down in this file. */
75 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
76 struct sk_buff *skb);
77
/* The single, cacheline-aligned instance holding the TCP hash state used
 * throughout this file: established and bind hash tables (both start out
 * NULL with size 0 here; presumably allocated during TCP init -- TODO
 * confirm), the listening hash with its rwlock, user count and wait queue
 * (see tcp_listen_wlock()), and the lock taken around local port
 * selection in tcp_v4_get_port().
 */
78 /*
79 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
80 */
81 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
82 __tcp_ehash: NULL,
83 __tcp_bhash: NULL,
84 __tcp_bhash_size: 0,
85 __tcp_ehash_size: 0,
86 __tcp_listening_hash: { NULL, },
87 __tcp_lhash_lock: RW_LOCK_UNLOCKED,
88 __tcp_lhash_users: ATOMIC_INIT(0),
89 __tcp_lhash_wait:
90 __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
91 __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
92 };
93
94 /*
95 * This array holds the first and last local port number.
96 * For high-usage systems, use sysctl to change this to
97 * 32768-61000
 *
 * Both bounds are inclusive: tcp_v4_get_port() tries (high - low) + 1
 * candidates and wraps back to the low bound once it passes the high one.
98 */
99 int sysctl_local_port_range[2] = { 1024, 4999 };
/* Last port examined by the automatic port search in tcp_v4_get_port().
 * The search increments before testing, so starting at 1024 - 1 makes
 * 1024 the first candidate.
 */
100 int tcp_port_rover = (1024 - 1);
101
102 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
103 __u32 faddr, __u16 fport)
104 {
105 int h = ((laddr ^ lport) ^ (faddr ^ fport));
106 h ^= h>>16;
107 h ^= h>>8;
108 return h & (tcp_ehash_size - 1);
109 }
110
111 static __inline__ int tcp_sk_hashfn(struct sock *sk)
112 {
113 __u32 laddr = sk->rcv_saddr;
114 __u16 lport = sk->num;
115 __u32 faddr = sk->daddr;
116 __u16 fport = sk->dport;
117
118 return tcp_hashfn(laddr, lport, faddr, fport);
119 }
120
121 /* Allocate and initialize a new TCP local port bind bucket.
122 * The bindhash mutex for snum's hash chain must be held here.
123 */
124 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
125 unsigned short snum)
126 {
127 struct tcp_bind_bucket *tb;
128
129 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
130 if(tb != NULL) {
131 tb->port = snum;
132 tb->fastreuse = 0;
133 tb->owners = NULL;
134 if((tb->next = head->chain) != NULL)
135 tb->next->pprev = &tb->next;
136 head->chain = tb;
137 tb->pprev = &head->chain;
138 }
139 return tb;
140 }
141
142 /* Caller must disable local BH processing. */
143 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
144 {
145 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
146 struct tcp_bind_bucket *tb;
147
148 spin_lock(&head->lock);
149 tb = (struct tcp_bind_bucket *)sk->prev;
150 if ((child->bind_next = tb->owners) != NULL)
151 tb->owners->bind_pprev = &child->bind_next;
152 tb->owners = child;
153 child->bind_pprev = &tb->owners;
154 child->prev = (struct sock *) tb;
155 spin_unlock(&head->lock);
156 }
157
/* BH-safe wrapper: runs __tcp_inherit_port() with bottom halves
 * disabled, as that helper requires.
 */
__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();

	__tcp_inherit_port(sk, child);

	local_bh_enable();
}
164
165 /* Obtain a reference to a local port for the given sock,
166 * if snum is zero it means select any available local port.
 *
 * Returns 0 on success: sk->num is set and, unless sk->prev already
 * points at a bind bucket, sk is linked onto the bucket's owner list.
 * Returns 1 when the automatic search exhausts the port range, when a
 * conflicting owner is found, or when a new bucket cannot be allocated.
 * Bottom halves are disabled for the whole call.
167 */
168 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
169 {
170 struct tcp_bind_hashbucket *head;
171 struct tcp_bind_bucket *tb;
172 int ret;
173
174 local_bh_disable();
175 if (snum == 0) {
176 int low = sysctl_local_port_range[0];
177 int high = sysctl_local_port_range[1];
/* Number of candidates still to try; the range is inclusive. */
178 int remaining = (high - low) + 1;
179 int rover;
180
/* tcp_portalloc_lock protects tcp_port_rover across the search. */
181 spin_lock(&tcp_portalloc_lock);
182 rover = tcp_port_rover;
/* Try the port after the last one examined, wrapping to low when
 * outside [low, high].  A candidate is accepted only if no bind
 * bucket exists for it at all.
 */
183 do { rover++;
184 if ((rover < low) || (rover > high))
185 rover = low;
186 head = &tcp_bhash[tcp_bhashfn(rover)];
187 spin_lock(&head->lock);
188 for (tb = head->chain; tb; tb = tb->next)
189 if (tb->port == rover)
190 goto next;
/* No bucket for rover: leave the loop with head->lock still held. */
191 break;
192 next:
193 spin_unlock(&head->lock);
194 } while (--remaining > 0);
195 tcp_port_rover = rover;
196 spin_unlock(&tcp_portalloc_lock);
197
198 /* Exhausted local port range during search? */
/* In that case no chain lock is held, hence "fail" rather than
 * "fail_unlock".
 */
199 ret = 1;
200 if (remaining <= 0)
201 goto fail;
202
203 /* OK, here is the one we will use. HEAD is
204 * non-NULL and we hold its mutex.
205 */
206 snum = rover;
/* tb is NULL here anyway (the chain walk ran off the end); a bucket
 * is created below.
 */
207 tb = NULL;
208 } else {
/* Explicit port: look for an existing bucket; tb ends up NULL if
 * there is none.  head->lock stays held either way.
 */
209 head = &tcp_bhash[tcp_bhashfn(snum)];
210 spin_lock(&head->lock);
211 for (tb = head->chain; tb != NULL; tb = tb->next)
212 if (tb->port == snum)
213 break;
214 }
/* The port already has owners: decide whether sk may share it. */
215 if (tb != NULL && tb->owners != NULL) {
/* Fast path: the bucket is flagged fastreuse and sk itself has
 * reuse set and is not listening, so skip the owner scan.
 */
216 if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
217 goto success;
218 } else {
219 struct sock *sk2 = tb->owners;
220 int sk_reuse = sk->reuse;
221
/* An owner sk2 conflicts when it is a different socket bound to
 * the same device, reuse does not cover the pair (either side
 * lacks reuse, or sk2 is listening), and the local addresses
 * overlap (either is a wildcard, or they are equal).
 */
222 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
223 if (sk != sk2 &&
224 sk->bound_dev_if == sk2->bound_dev_if) {
225 if (!sk_reuse ||
226 !sk2->reuse ||
227 sk2->state == TCP_LISTEN) {
228 if (!sk2->rcv_saddr ||
229 !sk->rcv_saddr ||
230 (sk2->rcv_saddr == sk->rcv_saddr))
231 break;
232 }
233 }
234 }
235 /* If we found a conflict, fail. */
236 ret = 1;
237 if (sk2 != NULL)
238 goto fail_unlock;
239 }
240 }
/* No bucket yet for this port: create one; allocation failure is
 * reported like any other failure.
 */
241 ret = 1;
242 if (tb == NULL &&
243 (tb = tcp_bucket_create(head, snum)) == NULL)
244 goto fail_unlock;
/* Maintain fastreuse: the first owner sets it when it has reuse and is
 * not listening; any later owner lacking reuse, or listening, clears it.
 */
245 if (tb->owners == NULL) {
246 if (sk->reuse && sk->state != TCP_LISTEN)
247 tb->fastreuse = 1;
248 else
249 tb->fastreuse = 0;
250 } else if (tb->fastreuse &&
251 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
252 tb->fastreuse = 0;
253 success:
254 sk->num = snum;
/* sk->prev doubles as the bind-bucket back pointer.  If it is already
 * set, sk is expected to be attached to this very bucket.
 */
255 if (sk->prev == NULL) {
256 if ((sk->bind_next = tb->owners) != NULL)
257 tb->owners->bind_pprev = &sk->bind_next;
258 tb->owners = sk;
259 sk->bind_pprev = &tb->owners;
260 sk->prev = (struct sock *) tb;
261 } else {
262 BUG_TRAP(sk->prev == (struct sock *) tb);
263 }
264 ret = 0;
265
266 fail_unlock:
267 spin_unlock(&head->lock);
268 fail:
269 local_bh_enable();
270 return ret;
271 }
272
273 /* Get rid of any references to a local port held by the
274 * given sock.
275 */
276 __inline__ void __tcp_put_port(struct sock *sk)
277 {
278 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
279 struct tcp_bind_bucket *tb;
280
281 spin_lock(&head->lock);
282 tb = (struct tcp_bind_bucket *) sk->prev;
283 if (sk->bind_next)
284 sk->bind_next->bind_pprev = sk->bind_pprev;
285 *(sk->bind_pprev) = sk->bind_next;
286 sk->prev = NULL;
287 sk->num = 0;
288 if (tb->owners == NULL) {
289 if (tb->next)
290 tb->next->pprev = tb->pprev;
291 *(tb->pprev) = tb->next;
292 kmem_cache_free(tcp_bucket_cachep, tb);
293 }
294 spin_unlock(&head->lock);
295 }
296
/* Release the sock's local port via __tcp_put_port(), with bottom
 * halves disabled around the call.
 */
void tcp_put_port(struct sock *sk)
{
	local_bh_disable();

	__tcp_put_port(sk);

	local_bh_enable();
}
303
304 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
305 * Look, when several writers sleep and reader wakes them up, all but one
306 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
307 * this, _but_ remember, it adds useless work on UP machines (wake up each
308 * exclusive lock release). It should be ifdefed really.
309 */
310
311 void tcp_listen_wlock(void)
312 {
313 write_lock(&tcp_lhash_lock);
314
315 if (atomic_read(&tcp_lhash_users)) {
316 DECLARE_WAITQUEUE(wait, current);
317
318 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
319 for (;;) {
320 set_current_state(TASK_UNINTERRUPTIBLE);
321 if (atomic_read(&tcp_lhash_users) == 0)
322 break;
323 write_unlock_bh(&tcp_lhash_lock);
324 schedule();
325 write_lock_bh(&tcp_lhash_lock);
326 }
327
328 __set_current_state(TASK_RUNNING);
329 remove_wait_queue(&tcp_lhash_wait, &wait);
330 }
331 }
332
333 static __inline__ void __tcp_v4_hash(struct sock *sk)
334 {
335 struct sock **skp;
336 rwlock_t *lock;
337
338 BUG_TRAP(sk->pprev==NULL);
339 if(sk->state == TCP_LISTEN) {
340 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
341 lock = &tcp_lhash_lock;
342 tcp_listen_wlock();
343 } else {
344 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
345 lock = &tcp_ehash[sk->hashent].lock;
346 write_lock(lock);
347 }
348 if((sk->next = *skp) != NULL)
349 (*skp)->pprev = &sk->next;
350 *skp = sk;
351 sk->pprev = skp;
352 sock_prot_inc_use(sk->prot);
353 write_unlock(lock);
354 if (sk->state == TCP_LISTEN)
355 wake_up(&tcp_lhash_wait);
356 }
357
358 static void tcp_v4_hash(struct sock *sk)
359 {
360 if (sk->state != TCP_CLOSE) {
361 local_bh_disable();
362 __tcp_v4_hash(sk);
363 local_bh_enable();
364 }
365 }
366
367 void tcp_unhash(struct sock *sk)
368 {
369 rwlock_t *lock;
370
371 if (sk->state == TCP_LISTEN) {
372 local_bh_disable();
373 tcp_listen_wlock();
374 lock = &tcp_lhash_lock;
375 } else {
376 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
377 lock = &head->lock;
378 write_lock_bh(&head->lock);
379 }
380
381 if(sk->pprev) {
382 if(sk->next)
383 sk->next->pprev = sk->pprev;
384 *sk->pprev = sk->next;
385 sk->pprev = NULL;
386 sock_prot_dec_use(sk->prot);
387 }
388 write_unlock_bh(lock);
389 if (sk->state == TCP_LISTEN)
390 wake_up(&tcp_lhash_wait);
391 }
392
393 /* Don't inline this cruft. Here are some nice properties to
394 * exploit here. The BSD API does not allow a listening TCP
395 * to specify the remote port nor the remote address for the
396 * connection. So always assume those are both wildcarded
397 * during the search since they can never be otherwise.
398 */
399 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
400 {
401 struct sock *result = NULL;
402 int score, hiscore;
403
404 hiscore=0;
405 for(; sk; sk = sk->next) {
406 if(sk->num == hnum) {
407 __u32 rcv_saddr = sk->rcv_saddr;
408
409 score = 1;
410 if(rcv_saddr) {
411 if (rcv_saddr != daddr)
412 continue;
413 score++;
414 }
415 if (sk->bound_dev_if) {
416 if (sk->bound_dev_if != dif)
417 continue;
418 score++;
419 }
420 if (score == 3)
421 return sk;
422 if (score > hiscore) {
423 hiscore = score;
424 result = sk;
425 }
426 }
427 }
428 return result;
429 }
430
431 /* Optimize the common listener case. */
432 __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
433 {
434 struct sock *sk;
435
436 read_lock(&tcp_lhash_lock);
437 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
438 if (sk) {
439 if (sk->num == hnum &&
440 sk->next == NULL &&
441 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
442 !sk->bound_dev_if)
443 goto sherry_cache;
444 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
445 }
446 if (sk) {
447 sherry_cache:
448 sock_hold(sk);
449 }
450 read_unlock(&tcp_lhash_lock);
451 return sk;
452 }
453
454 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
455 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
456 *
457 * Local BH must be disabled here.
458 */
459
460 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
461 u32 daddr, u16 hnum, int dif)
462 {
463 struct tcp_ehash_bucket *head;
464 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
465 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
466 struct sock *sk;
467 int hash;
468
469 /* Optimize here for direct hit, only listening connections can
470 * have wildcards anyways.
471 */
472 hash = tcp_hashfn(daddr, hnum, saddr, sport);
473 head = &tcp_ehash[hash];
474 read_lock(&head->lock);
475 for(sk = head->chain; sk; sk = sk->next) {
476 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
477 goto hit; /* You sunk my battleship! */
478 }
479
480 /* Must check for a TIME_WAIT'er before going to listener hash. */
481 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
482 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
483 goto hit;
484 read_unlock(&head->lock);
485
486 return NULL;
487
488 hit:
489 sock_hold(sk);
490 read_unlock(&head->lock);
491 return sk;
492 }
493
494 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
495 u32 daddr, u16 hnum, int dif)
496 {
497 struct sock *sk;
498
499 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
500
501 if (sk)
502 return sk;
503
504 return tcp_v4_lookup_listener(daddr, hnum, dif);
505 }
506
507 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
508 {
509 struct sock *sk;
510
511 local_bh_disable();
512 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
513 local_bh_enable();
514
515 return sk;
516 }
517
518 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
519 {
520 return secure_tcp_sequence_number(skb->nh.iph->daddr,
521 skb->nh.iph->saddr,
522 skb->h.th->dest,
523 skb->h.th->source);
524 }
525
/* Check that sk's 4-tuple is not already in use and, if it is free,
 * insert sk into the established hash -- both under one hold of the
 * bucket's write lock.
 *
 * Returns 0 when sk was hashed, -EADDRNOTAVAIL when an identical
 * connection exists.  A matching TIME-WAIT bucket that carries a
 * timestamp (ts_recent_stamp != 0) does not count as a conflict: its
 * sequence/timestamp state is copied into sk and the bucket is killed
 * once sk has been hashed.
 */
526 static int tcp_v4_check_established(struct sock *sk)
527 {
/* Named from the point of view of an incoming packet: "daddr" is our
 * local address, "saddr" is the peer.
 */
528 u32 daddr = sk->rcv_saddr;
529 u32 saddr = sk->daddr;
530 int dif = sk->bound_dev_if;
531 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
532 __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
533 int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
534 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
535 struct sock *sk2, **skp;
536 struct tcp_tw_bucket *tw;
537
538 write_lock_bh(&head->lock);
539
540 /* Check TIME-WAIT sockets first. */
/* They live on the chain tcp_ehash_size buckets past head. */
541 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
542 skp = &sk2->next) {
543 tw = (struct tcp_tw_bucket*)sk2;
544
545 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
546 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
547
548 /* With PAWS, it is safe from the viewpoint
549 of data integrity. Even without PAWS it
550 is safe provided sequence spaces do not
551 overlap i.e. at data rates <= 80Mbit/sec.
552
553 Actually, the idea is close to VJ's one,
554 only timestamp cache is held not per host,
555 but per port pair and TW bucket is used
556 as state holder.
557
558 If TW bucket has been already destroyed we
559 fall back to VJ's scheme and use initial
560 timestamp retrieved from peer table.
561 */
562 if (tw->ts_recent_stamp) {
/* Start beyond the old connection's send sequence space; 0 is
 * avoided because tcp_v4_connect() treats write_seq == 0 as
 * "not chosen yet".
 */
563 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
564 tp->write_seq = 1;
565 tp->ts_recent = tw->ts_recent;
566 tp->ts_recent_stamp = tw->ts_recent_stamp;
/* Hold the bucket until the tcp_tw_put() below; point skp at the
 * established chain so "unique" inserts sk there.
 */
567 sock_hold(sk2);
568 skp = &head->chain;
569 goto unique;
570 } else
571 goto not_unique;
572 }
573 }
/* No TIME-WAIT match: make sure the cleanup after "unique" is skipped. */
574 tw = NULL;
575
576 /* And established part... */
/* On a clean exit skp points at the chain's terminating NULL link, so
 * sk is appended at the tail.
 */
577 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
578 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
579 goto not_unique;
580 }
581
582 unique:
583 BUG_TRAP(sk->pprev==NULL);
584 if ((sk->next = *skp) != NULL)
585 (*skp)->pprev = &sk->next;
586
587 *skp = sk;
588 sk->pprev = skp;
589 sk->hashent = hash;
590 sock_prot_inc_use(sk->prot);
591 write_unlock_bh(&head->lock);
592
/* Retire the recycled TIME-WAIT bucket, outside the bucket lock. */
593 if (tw) {
594 /* Silly. Should hash-dance instead... */
595 local_bh_disable();
596 tcp_tw_deschedule(tw);
597 tcp_timewait_kill(tw);
598 NET_INC_STATS_BH(TimeWaitRecycled);
599 local_bh_enable();
600
/* Drops the reference taken with sock_hold() above. */
601 tcp_tw_put(tw);
602 }
603
604 return 0;
605
606 not_unique:
607 write_unlock_bh(&head->lock);
608 return -EADDRNOTAVAIL;
609 }
610
611 /* Hash SYN-SENT socket to established hash table after
612 * checking that it is unique. Note, that without kernel lock
613 * we MUST make these two operations atomically.
614 *
615 * Optimization: if it is bound and tcp_bind_bucket has the only
616 * owner (us), we need not to scan established bucket.
617 */
618
619 int tcp_v4_hash_connecting(struct sock *sk)
620 {
621 unsigned short snum = sk->num;
622 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
623 struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
624
625 spin_lock_bh(&head->lock);
626 if (tb->owners == sk && sk->bind_next == NULL) {
627 __tcp_v4_hash(sk);
628 spin_unlock_bh(&head->lock);
629 return 0;
630 } else {
631 spin_unlock_bh(&head->lock);
632
633 /* No definite answer... Walk to established hash table */
634 return tcp_v4_check_established(sk);
635 }
636 }
637
638 /* This will initiate an outgoing connection. */
/* Validates the sockaddr_in in uaddr, routes to the destination, fills
 * in the socket's addresses, ports and initial sequence state, and hands
 * a freshly allocated buffer to tcp_connect().
 *
 * Returns 0 on success or a negative errno: -EINVAL (short address, or
 * zero destination with a source route), -EAFNOSUPPORT, the
 * ip_route_connect() error, -ENETUNREACH (multicast/broadcast route),
 * -ENOBUFS, or whatever tcp_connect() returns.
 */
639 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
640 {
641 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
642 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
643 struct sk_buff *buff;
644 struct rtable *rt;
645 u32 daddr, nexthop;
646 int tmp;
647 int err;
648
649 if (addr_len < sizeof(struct sockaddr_in))
650 return(-EINVAL);
651
652 if (usin->sin_family != AF_INET)
653 return(-EAFNOSUPPORT);
654
/* With a source-route option the route lookup targets the option's
 * first hop (faddr) instead of the final destination.
 */
655 nexthop = daddr = usin->sin_addr.s_addr;
656 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
657 if (daddr == 0)
658 return -EINVAL;
659 nexthop = sk->protinfo.af_inet.opt->faddr;
660 }
661
662 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
663 RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
664 if (tmp < 0)
665 return tmp;
666
/* Routes flagged multicast or broadcast are refused. */
667 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
668 ip_rt_put(rt);
669 return -ENETUNREACH;
670 }
671
/* From here on the route belongs to the socket; the failure path
 * undoes this with __sk_dst_reset().
 */
672 __sk_dst_set(sk, &rt->u.dst);
673 sk->route_caps = rt->u.dst.dev->features;
674
/* Without source routing, take the destination from the route. */
675 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
676 daddr = rt->rt_dst;
677
678 err = -ENOBUFS;
679 buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);
680
681 if (buff == NULL)
682 goto failure;
683
/* An unset source address defaults to the one chosen by routing. */
684 if (!sk->saddr)
685 sk->saddr = rt->rt_src;
686 sk->rcv_saddr = sk->saddr;
687
/* Timestamp/sequence state left over from a previous use of this
 * socket is only kept when the destination is unchanged.
 */
688 if (tp->ts_recent_stamp && sk->daddr != daddr) {
689 /* Reset inherited state */
690 tp->ts_recent = 0;
691 tp->ts_recent_stamp = 0;
692 tp->write_seq = 0;
693 }
694
695 if (sysctl_tcp_tw_recycle &&
696 !tp->ts_recent_stamp &&
697 rt->rt_dst == daddr) {
698 struct inet_peer *peer = rt_get_peer(rt);
699
700 /* VJ's idea. We save last timestamp seen from
701 * the destination in peer table, when entering state TIME-WAIT
702 * and initialize ts_recent from it, when trying new connection.
703 */
704
/* The cached stamp is used only while it is at most TCP_PAWS_MSL
 * seconds old.
 */
705 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
706 tp->ts_recent_stamp = peer->tcp_ts_stamp;
707 tp->ts_recent = peer->tcp_ts;
708 }
709 }
710
711 sk->dport = usin->sin_port;
712 sk->daddr = daddr;
713
/* write_seq may already have been set by tcp_v4_check_established()
 * when a TIME-WAIT bucket was recycled; otherwise pick a secure ISN.
 */
714 if (!tp->write_seq)
715 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
716 sk->sport, usin->sin_port);
717
/* Account for the length of any IP options set on the socket. */
718 tp->ext_header_len = 0;
719 if (sk->protinfo.af_inet.opt)
720 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
721 sk->protinfo.af_inet.id = tp->write_seq^jiffies;
722
/* 536 is presumably the RFC 1122 default send MSS -- TODO confirm. */
723 tp->mss_clamp = 536;
724
725 err = tcp_connect(sk, buff);
726 if (err == 0)
727 return 0;
728
/* NOTE(review): when tcp_connect() fails, buff is not freed here;
 * presumably tcp_connect() owns it by then -- confirm.  saddr,
 * rcv_saddr and daddr are also left as set above.
 */
729 failure:
730 __sk_dst_reset(sk);
731 sk->route_caps = 0;
732 sk->dport = 0;
733 return err;
734 }
735
736 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
737 {
738 return ((struct rtable*)skb->dst)->rt_iif;
739 }
740
741 static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
742 {
743 unsigned h = raddr ^ rport;
744 h ^= h>>16;
745 h ^= h>>8;
746 return h&(TCP_SYNQ_HSIZE-1);
747 }
748
749 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
750 struct iphdr *iph,
751 struct tcphdr *th,
752 struct open_request ***prevp)
753 {
754 struct tcp_listen_opt *lopt = tp->listen_opt;
755 struct open_request *req, **prev;
756 __u16 rport = th->source;
757 __u32 raddr = iph->saddr;
758
759 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
760 (req = *prev) != NULL;
761 prev = &req->dl_next) {
762 if (req->rmt_port == rport &&
763 req->af.v4_req.rmt_addr == raddr &&
764 req->af.v4_req.loc_addr == iph->daddr &&
765 TCP_INET_FAMILY(req->class->family)) {
766 BUG_TRAP(req->sk == NULL);
767 *prevp = prev;
768 return req;
769 }
770 }
771
772 return NULL;
773 }
774
775 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
776 {
777 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
778 struct tcp_listen_opt *lopt = tp->listen_opt;
779 unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
780
781 req->expires = jiffies + TCP_TIMEOUT_INIT;
782 req->retrans = 0;
783 req->sk = NULL;
784 req->index = h;
785 req->dl_next = lopt->syn_table[h];
786
787 write_lock(&tp->syn_wait_lock);
788 lopt->syn_table[h] = req;
789 write_unlock(&tp->syn_wait_lock);
790
791 tcp_synq_added(sk);
792 }
793
794
795 /*
796 * This routine does path mtu discovery as defined in RFC1191.
797 */
798 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
799 {
800 struct dst_entry *dst;
801 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
802
803 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
804 * send out by Linux are always <576bytes so they should go through
805 * unfragmented).
806 */
807 if (sk->state == TCP_LISTEN)
808 return;
809
810 /* We don't check in the destentry if pmtu discovery is forbidden
811 * on this route. We just assume that no packet_to_big packets
812 * are send back when pmtu discovery is not active.
813 * There is a small race when the user changes this flag in the
814 * route, but I think that's acceptable.
815 */
816 if ((dst = __sk_dst_check(sk, 0)) == NULL)
817 return;
818
819 ip_rt_update_pmtu(dst, mtu);
820
821 /* Something is about to be wrong... Remember soft error
822 * for the case, if this connection will not able to recover.
823 */
824 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
825 sk->err_soft = EMSGSIZE;
826
827 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
828 tp->pmtu_cookie > dst->pmtu) {
829 tcp_sync_mss(sk, dst->pmtu);
830
831 /* Resend the TCP packet because it's
832 * clear that the old packet has been
833 * dropped. This is the new "fast" path mtu
834 * discovery.
835 */
836 tcp_simple_retransmit(sk);
837 } /* else let the usual retransmit timer handle it */
838 }
839
840 /*
841 * This routine is called by the ICMP module when it gets some
842 * sort of error condition. If err < 0 then the socket should
843 * be closed and the error returned to the user. If err > 0
844 * it's just the icmp type << 8 | icmp code. After adjustment
845 * header points to the first 8 bytes of the tcp header. We need
846 * to find the appropriate port.
847 *
848 * The locking strategy used here is very "optimistic". When
849 * someone else accesses the socket the ICMP is just dropped
850 * and for some paths there is no check at all.
851 * A more general error queue to queue errors for later handling
852 * is probably better.
853 *
854 */
855
856 void tcp_v4_err(struct sk_buff *skb, u32 info)
857 {
858 struct iphdr *iph = (struct iphdr*)skb->data;
859 struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
860 struct tcp_opt *tp;
861 int type = skb->h.icmph->type;
862 int code = skb->h.icmph->code;
863 struct sock *sk;
864 __u32 seq;
865 int err;
866
867 if (skb->len < (iph->ihl << 2) + 8) {
868 ICMP_INC_STATS_BH(IcmpInErrors);
869 return;
870 }
871
872 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
873 if (sk == NULL) {
874 ICMP_INC_STATS_BH(IcmpInErrors);
875 return;
876 }
877 if (sk->state == TCP_TIME_WAIT) {
878 tcp_tw_put((struct tcp_tw_bucket*)sk);
879 return;
880 }
881
882 bh_lock_sock(sk);
883 /* If too many ICMPs get dropped on busy
884 * servers this needs to be solved differently.
885 */
886 if (sk->lock.users != 0)
887 NET_INC_STATS_BH(LockDroppedIcmps);
888
889 if (sk->state == TCP_CLOSE)
890 goto out;
891
892 tp = &sk->tp_pinfo.af_tcp;
893 seq = ntohl(th->seq);
894 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
895 NET_INC_STATS(OutOfWindowIcmps);
896 goto out;
897 }
898
899 switch (type) {
900 case ICMP_SOURCE_QUENCH:
901 /* This is deprecated, but if someone generated it,
902 * we have no reasons to ignore it.
903 */
904 if (sk->lock.users == 0)
905 tcp_enter_cwr(tp);
906 goto out;
907 case ICMP_PARAMETERPROB:
908 err = EPROTO;
909 break;
910 case ICMP_DEST_UNREACH:
911 if (code > NR_ICMP_UNREACH)
912 goto out;
913
914 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
915 if (sk->lock.users == 0)
916 do_pmtu_discovery(sk, iph, info);
917 goto out;
918 }
919
920 err = icmp_err_convert[code].errno;
921 break;
922 case ICMP_TIME_EXCEEDED:
923 err = EHOSTUNREACH;
924 break;
925 default:
926 goto out;
927 }
928
929 switch (sk->state) {
930 struct open_request *req, **prev;
931 case TCP_LISTEN:
932 if (sk->lock.users != 0)
933 goto out;
934
935 req = tcp_v4_search_req(tp, iph, th, &prev);
936 if (!req)
937 goto out;
938
939 /* ICMPs are not backlogged, hence we cannot get
940 an established socket here.
941 */
942 BUG_TRAP(req->sk == NULL);
943
944 if (seq != req->snt_isn) {
945 NET_INC_STATS_BH(OutOfWindowIcmps);
946 goto out;
947 }
948
949 /*
950 * Still in SYN_RECV, just remove it silently.
951 * There is no good way to pass the error to the newly
952 * created socket, and POSIX does not want network
953 * errors returned from accept().
954 */
955 tcp_synq_drop(sk, req, prev);
956 goto out;
957
958 case TCP_SYN_SENT:
959 case TCP_SYN_RECV: /* Cannot happen.
960 It can f.e. if SYNs crossed.
961 */
962 if (sk->lock.users == 0) {
963 TCP_INC_STATS_BH(TcpAttemptFails);
964 sk->err = err;
965
966 sk->error_report(sk);
967
968 tcp_done(sk);
969 } else {
970 sk->err_soft = err;
971 }
972 goto out;
973 }
974
975 /* If we've already connected we will keep trying
976 * until we time out, or the user gives up.
977 *
978 * rfc1122 4.2.3.9 allows to consider as hard errors
979 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
980 * but it is obsoleted by pmtu discovery).
981 *
982 * Note, that in modern internet, where routing is unreliable
983 * and in each dark corner broken firewalls sit, sending random
984 * errors ordered by their masters even this two messages finally lose
985 * their original sense (even Linux sends invalid PORT_UNREACHs)
986 *
987 * Now we are in compliance with RFCs.
988 * --ANK (980905)
989 */
990
991 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
992 sk->err = err;
993 sk->error_report(sk);
994 } else { /* Only an error on timeout */
995 sk->err_soft = err;
996 }
997
998 out:
999 bh_unlock_sock(sk);
1000 sock_put(sk);
1001 }
1002
1003 /* This routine computes an IPv4 TCP checksum. */
1004 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1005 struct sk_buff *skb)
1006 {
1007 if (skb->ip_summed == CHECKSUM_HW) {
1008 th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
1009 skb->csum = offsetof(struct tcphdr, check);
1010 } else {
1011 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1012 csum_partial((char *)th, th->doff<<2, skb->csum));
1013 }
1014 }
1015
1016 /*
1017 * This routine will send an RST to the other tcp.
1018 *
1019 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1020 * for reset.
1021 * Answer: if a packet caused RST, it is not for a socket
1022 * existing in our system, if it is matched to a socket,
1023 * it is just duplicate segment or bug in other side's TCP.
1024 * So that we build reply only basing on parameters
1025 * arrived with segment.
1026 * Exception: precedence violation. We do not implement it in any case.
1027 */
1028
1029 static void tcp_v4_send_reset(struct sk_buff *skb)
1030 {
1031 struct tcphdr *th = skb->h.th;
1032 struct tcphdr rth;
1033 struct ip_reply_arg arg;
1034
1035 /* Never send a reset in response to a reset. */
1036 if (th->rst)
1037 return;
1038
1039 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1040 return;
1041
1042 /* Swap the send and the receive. */
1043 memset(&rth, 0, sizeof(struct tcphdr));
1044 rth.dest = th->source;
1045 rth.source = th->dest;
1046 rth.doff = sizeof(struct tcphdr)/4;
1047 rth.rst = 1;
1048
1049 if (th->ack) {
1050 rth.seq = th->ack_seq;
1051 } else {
1052 rth.ack = 1;
1053 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1054 + skb->len - (th->doff<<2));
1055 }
1056
1057 memset(&arg, 0, sizeof arg);
1058 arg.iov[0].iov_base = (unsigned char *)&rth;
1059 arg.iov[0].iov_len = sizeof rth;
1060 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1061 skb->nh.iph->saddr, /*XXX*/
1062 sizeof(struct tcphdr),
1063 IPPROTO_TCP,
1064 0);
1065 arg.n_iov = 1;
1066 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1067
1068 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1069
1070 TCP_INC_STATS_BH(TcpOutSegs);
1071 TCP_INC_STATS_BH(TcpOutRsts);
1072 }
1073
1074 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1075 outside socket context is ugly, certainly. What can I do?
1076 */
1077
1078 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1079 {
1080 struct tcphdr *th = skb->h.th;
1081 struct {
1082 struct tcphdr th;
1083 u32 tsopt[3];
1084 } rep;
1085 struct ip_reply_arg arg;
1086
1087 memset(&rep.th, 0, sizeof(struct tcphdr));
1088 memset(&arg, 0, sizeof arg);
1089
1090 arg.iov[0].iov_base = (unsigned char *)&rep;
1091 arg.iov[0].iov_len = sizeof(rep.th);
1092 arg.n_iov = 1;
1093 if (ts) {
1094 rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
1095 (TCPOPT_NOP << 16) |
1096 (TCPOPT_TIMESTAMP << 8) |
1097 TCPOLEN_TIMESTAMP);
1098 rep.tsopt[1] = htonl(tcp_time_stamp);
1099 rep.tsopt[2] = htonl(ts);
1100 arg.iov[0].iov_len = sizeof(rep);
1101 }
1102
1103 /* Swap the send and the receive. */
1104 rep.th.dest = th->source;
1105 rep.th.source = th->dest;
1106 rep.th.doff = arg.iov[0].iov_len/4;
1107 rep.th.seq = htonl(seq);
1108 rep.th.ack_seq = htonl(ack);
1109 rep.th.ack = 1;
1110 rep.th.window = htons(win);
1111
1112 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1113 skb->nh.iph->saddr, /*XXX*/
1114 arg.iov[0].iov_len,
1115 IPPROTO_TCP,
1116 0);
1117 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1118
1119 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1120
1121 TCP_INC_STATS_BH(TcpOutSegs);
1122 }
1123
1124 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1125 {
1126 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1127
1128 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1129 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1130
1131 tcp_tw_put(tw);
1132 }
1133
1134 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1135 {
1136 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1137 req->ts_recent);
1138 }
1139
1140 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1141 {
1142 struct rtable *rt;
1143 struct ip_options *opt;
1144
1145 opt = req->af.v4_req.opt;
1146 if(ip_route_output(&rt, ((opt && opt->srr) ?
1147 opt->faddr :
1148 req->af.v4_req.rmt_addr),
1149 req->af.v4_req.loc_addr,
1150 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1151 sk->bound_dev_if)) {
1152 IP_INC_STATS_BH(IpOutNoRoutes);
1153 return NULL;
1154 }
1155 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1156 ip_rt_put(rt);
1157 IP_INC_STATS_BH(IpOutNoRoutes);
1158 return NULL;
1159 }
1160 return &rt->u.dst;
1161 }
1162
1163 /*
1164 * Send a SYN-ACK after having received an ACK.
1165 * This still operates on a open_request only, not on a big
1166 * socket.
1167 */
1168 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1169 struct dst_entry *dst)
1170 {
1171 int err = -1;
1172 struct sk_buff * skb;
1173
1174 /* First, grab a route. */
1175 if (dst == NULL &&
1176 (dst = tcp_v4_route_req(sk, req)) == NULL)
1177 goto out;
1178
1179 skb = tcp_make_synack(sk, dst, req);
1180
1181 if (skb) {
1182 struct tcphdr *th = skb->h.th;
1183
1184 th->check = tcp_v4_check(th, skb->len,
1185 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1186 csum_partial((char *)th, skb->len, skb->csum));
1187
1188 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1189 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1190 if (err == NET_XMIT_CN)
1191 err = 0;
1192 }
1193
1194 out:
1195 dst_release(dst);
1196 return err;
1197 }
1198
1199 /*
1200 * IPv4 open_request destructor.
1201 */
1202 static void tcp_v4_or_free(struct open_request *req)
1203 {
1204 if (req->af.v4_req.opt)
1205 kfree(req->af.v4_req.opt);
1206 }
1207
1208 static inline void syn_flood_warning(struct sk_buff *skb)
1209 {
1210 static unsigned long warntime;
1211
1212 if (jiffies - warntime > HZ*60) {
1213 warntime = jiffies;
1214 printk(KERN_INFO
1215 "possible SYN flooding on port %d. Sending cookies.\n",
1216 ntohs(skb->h.th->dest));
1217 }
1218 }
1219
1220 /*
1221 * Save and compile IPv4 options into the open_request if needed.
1222 */
1223 static inline struct ip_options *
1224 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1225 {
1226 struct ip_options *opt = &(IPCB(skb)->opt);
1227 struct ip_options *dopt = NULL;
1228
1229 if (opt && opt->optlen) {
1230 int opt_size = optlength(opt);
1231 dopt = kmalloc(opt_size, GFP_ATOMIC);
1232 if (dopt) {
1233 if (ip_options_echo(dopt, skb)) {
1234 kfree(dopt);
1235 dopt = NULL;
1236 }
1237 }
1238 }
1239 return dopt;
1240 }
1241
1242 /*
1243 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1244 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1245 * It would be better to replace it with a global counter for all sockets,
1246 * but then some measure against one socket starving all other sockets
1247 * would be needed.
1248 *
1249 * It was 128 by default. Experiments with real servers show that
1250 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1251 * of the problems. This value is adjusted to 128 for very small machines
1252 * (<=32MB of memory) and to 1024 on normal or better ones (>=256MB).
1253 * Increasing it further requires changing the hash table size.
1254 */
1255 int sysctl_max_syn_backlog = 256;
1256
/*
 * Operations table for IPv4 open_requests; tcp_v4_conn_request() stores
 * its address in req->class.  The initializer is positional, so the
 * entries must stay in the order of the struct or_calltable declaration
 * (declared outside this file).
 */
1257 struct or_calltable or_ipv4 = {
1258 PF_INET, /* address family; read as req->class->family by tcp_get_info() */
1259 tcp_v4_send_synack, /* builds and sends the SYN-ACK for a request */
1260 tcp_v4_or_send_ack, /* ACKs a segment on behalf of a request */
1261 tcp_v4_or_free, /* frees the request's saved IP options */
1262 tcp_v4_send_reset /* defined outside this part of the file */
1263 };
1264
1265 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1266 {
1267 struct tcp_opt tp;
1268 struct open_request *req;
1269 __u32 saddr = skb->nh.iph->saddr;
1270 __u32 daddr = skb->nh.iph->daddr;
1271 __u32 isn = TCP_SKB_CB(skb)->when;
1272 struct dst_entry *dst = NULL;
1273 #ifdef CONFIG_SYN_COOKIES
1274 int want_cookie = 0;
1275 #else
1276 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1277 #endif
1278
1279 /* Never answer to SYNs send to broadcast or multicast */
1280 if (((struct rtable *)skb->dst)->rt_flags &
1281 (RTCF_BROADCAST|RTCF_MULTICAST))
1282 goto drop;
1283
1284 /* TW buckets are converted to open requests without
1285 * limitations, they conserve resources and peer is
1286 * evidently real one.
1287 */
1288 if (tcp_synq_is_full(sk) && !isn) {
1289 #ifdef CONFIG_SYN_COOKIES
1290 if (sysctl_tcp_syncookies) {
1291 want_cookie = 1;
1292 } else
1293 #endif
1294 goto drop;
1295 }
1296
1297 /* Accept backlog is full. If we have already queued enough
1298 * of warm entries in syn queue, drop request. It is better than
1299 * clogging syn queue with openreqs with exponentially increasing
1300 * timeout.
1301 */
1302 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1303 goto drop;
1304
1305 req = tcp_openreq_alloc();
1306 if (req == NULL)
1307 goto drop;
1308
1309 tcp_clear_options(&tp);
1310 tp.mss_clamp = 536;
1311 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1312
1313 tcp_parse_options(skb, &tp, 0);
1314
1315 if (want_cookie) {
1316 tcp_clear_options(&tp);
1317 tp.saw_tstamp = 0;
1318 }
1319
1320 if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1321 /* Some OSes (unknown ones, but I see them on web server, which
1322 * contains information interesting only for windows'
1323 * users) do not send their stamp in SYN. It is easy case.
1324 * We simply do not advertise TS support.
1325 */
1326 tp.saw_tstamp = 0;
1327 tp.tstamp_ok = 0;
1328 }
1329 tp.tstamp_ok = tp.saw_tstamp;
1330
1331 tcp_openreq_init(req, &tp, skb);
1332
1333 req->af.v4_req.loc_addr = daddr;
1334 req->af.v4_req.rmt_addr = saddr;
1335 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1336 req->class = &or_ipv4;
1337 if (!want_cookie)
1338 TCP_ECN_create_request(req, skb->h.th);
1339
1340 if (want_cookie) {
1341 #ifdef CONFIG_SYN_COOKIES
1342 syn_flood_warning(skb);
1343 #endif
1344 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1345 } else if (isn == 0) {
1346 struct inet_peer *peer = NULL;
1347
1348 /* VJ's idea. We save last timestamp seen
1349 * from the destination in peer table, when entering
1350 * state TIME-WAIT, and check against it before
1351 * accepting new connection request.
1352 *
1353 * If "isn" is not zero, this request hit alive
1354 * timewait bucket, so that all the necessary checks
1355 * are made in the function processing timewait state.
1356 */
1357 if (tp.saw_tstamp &&
1358 sysctl_tcp_tw_recycle &&
1359 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1360 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1361 peer->v4daddr == saddr) {
1362 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1363 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1364 NET_INC_STATS_BH(PAWSPassiveRejected);
1365 dst_release(dst);
1366 goto drop_and_free;
1367 }
1368 }
1369 /* Kill the following clause, if you dislike this way. */
1370 else if (!sysctl_tcp_syncookies &&
1371 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1372 < (sysctl_max_syn_backlog>>2)) &&
1373 (!peer || !peer->tcp_ts_stamp) &&
1374 (!dst || !dst->rtt)) {
1375 /* Without syncookies last quarter of
1376 * backlog is filled with destinations, proven to be alive.
1377 * It means that we continue to communicate
1378 * to destinations, already remembered
1379 * to the moment of synflood.
1380 */
1381 NETDEBUG(if (net_ratelimit())
1382 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n",
1383 NIPQUAD(saddr), ntohs(skb->h.th->source)));
1384 TCP_INC_STATS_BH(TcpAttemptFails);
1385 dst_release(dst);
1386 goto drop_and_free;
1387 }
1388
1389 isn = tcp_v4_init_sequence(sk, skb);
1390 }
1391 req->snt_isn = isn;
1392
1393 if (tcp_v4_send_synack(sk, req, dst))
1394 goto drop_and_free;
1395
1396 if (want_cookie) {
1397 tcp_openreq_free(req);
1398 } else {
1399 tcp_v4_synq_add(sk, req);
1400 }
1401 return 0;
1402
1403 drop_and_free:
1404 tcp_openreq_free(req);
1405 drop:
1406 TCP_INC_STATS_BH(TcpAttemptFails);
1407 return 0;
1408 }
1409
1410
1411 /*
1412 * The three way handshake has completed - we got a valid synack -
1413 * now create the new socket.
1414 */
1415 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1416 struct open_request *req,
1417 struct dst_entry *dst)
1418 {
1419 struct tcp_opt *newtp;
1420 struct sock *newsk;
1421
1422 if (tcp_acceptq_is_full(sk))
1423 goto exit_overflow;
1424
1425 if (dst == NULL &&
1426 (dst = tcp_v4_route_req(sk, req)) == NULL)
1427 goto exit;
1428
1429 newsk = tcp_create_openreq_child(sk, req, skb);
1430 if (!newsk)
1431 goto exit;
1432
1433 newsk->dst_cache = dst;
1434 newsk->route_caps = dst->dev->features;
1435
1436 newtp = &(newsk->tp_pinfo.af_tcp);
1437 newsk->daddr = req->af.v4_req.rmt_addr;
1438 newsk->saddr = req->af.v4_req.loc_addr;
1439 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1440 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1441 req->af.v4_req.opt = NULL;
1442 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1443 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1444 newtp->ext_header_len = 0;
1445 if (newsk->protinfo.af_inet.opt)
1446 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1447 newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1448
1449 tcp_sync_mss(newsk, dst->pmtu);
1450 newtp->advmss = dst->advmss;
1451 tcp_initialize_rcv_mss(newsk);
1452
1453 __tcp_v4_hash(newsk);
1454 __tcp_inherit_port(sk, newsk);
1455
1456 return newsk;
1457
1458 exit_overflow:
1459 NET_INC_STATS_BH(ListenOverflows);
1460 exit:
1461 NET_INC_STATS_BH(ListenDrops);
1462 dst_release(dst);
1463 return NULL;
1464 }
1465
1466 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1467 {
1468 struct open_request *req, **prev;
1469 struct tcphdr *th = skb->h.th;
1470 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1471 struct sock *nsk;
1472
1473 /* Find possible connection requests. */
1474 req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
1475 if (req)
1476 return tcp_check_req(sk, skb, req, prev);
1477
1478 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1479 th->source,
1480 skb->nh.iph->daddr,
1481 ntohs(th->dest),
1482 tcp_v4_iif(skb));
1483
1484 if (nsk) {
1485 if (nsk->state != TCP_TIME_WAIT) {
1486 bh_lock_sock(nsk);
1487 return nsk;
1488 }
1489 tcp_tw_put((struct tcp_tw_bucket*)sk);
1490 return NULL;
1491 }
1492
1493 #ifdef CONFIG_SYN_COOKIES
1494 if (!th->rst && !th->syn && th->ack)
1495 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1496 #endif
1497 return sk;
1498 }
1499
1500 static int tcp_v4_checksum_init(struct sk_buff *skb)
1501 {
1502 if (skb->ip_summed == CHECKSUM_HW) {
1503 skb->ip_summed = CHECKSUM_UNNECESSARY;
1504 if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1505 skb->nh.iph->daddr,skb->csum))
1506 return 0;
1507
1508 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1509 skb->ip_summed = CHECKSUM_NONE;
1510 }
1511 if (skb->len <= 76) {
1512 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1513 skb->nh.iph->daddr,
1514 skb_checksum(skb, 0, skb->len, 0)))
1515 return -1;
1516 skb->ip_summed = CHECKSUM_UNNECESSARY;
1517 } else {
1518 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1519 skb->nh.iph->daddr,0);
1520 }
1521 return 0;
1522 }
1523
1524
1525 /* The socket must have it's spinlock held when we get
1526 * here.
1527 *
1528 * We have a potential double-lock case here, so even when
1529 * doing backlog processing we use the BH locking scheme.
1530 * This is because we cannot sleep with the original spinlock
1531 * held.
1532 */
1533 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1534 {
1535 #ifdef CONFIG_FILTER
1536 struct sk_filter *filter = sk->filter;
1537 if (filter && sk_filter(skb, filter))
1538 goto discard;
1539 #endif /* CONFIG_FILTER */
1540
1541 IP_INC_STATS_BH(IpInDelivers);
1542
1543 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1544 TCP_CHECK_TIMER(sk);
1545 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1546 goto reset;
1547 TCP_CHECK_TIMER(sk);
1548 return 0;
1549 }
1550
1551 if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1552 goto csum_err;
1553
1554 if (sk->state == TCP_LISTEN) {
1555 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1556 if (!nsk)
1557 goto discard;
1558
1559 if (nsk != sk) {
1560 if (tcp_child_process(sk, nsk, skb))
1561 goto reset;
1562 return 0;
1563 }
1564 }
1565
1566 TCP_CHECK_TIMER(sk);
1567 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1568 goto reset;
1569 TCP_CHECK_TIMER(sk);
1570 return 0;
1571
1572 reset:
1573 tcp_v4_send_reset(skb);
1574 discard:
1575 kfree_skb(skb);
1576 /* Be careful here. If this function gets more complicated and
1577 * gcc suffers from register pressure on the x86, sk (in %ebx)
1578 * might be destroyed here. This current version compiles correctly,
1579 * but you have been warned.
1580 */
1581 return 0;
1582
1583 csum_err:
1584 TCP_INC_STATS_BH(TcpInErrs);
1585 goto discard;
1586 }
1587
1588 /*
1589 * From tcp_input.c
1590 */
1591
1592 int tcp_v4_rcv(struct sk_buff *skb)
1593 {
1594 struct tcphdr *th;
1595 struct sock *sk;
1596 int ret;
1597
1598 if (skb->pkt_type!=PACKET_HOST)
1599 goto discard_it;
1600
1601 /* Count it even if it's bad */
1602 TCP_INC_STATS_BH(TcpInSegs);
1603
1604 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1605 goto discard_it;
1606
1607 th = skb->h.th;
1608
1609 if (th->doff < sizeof(struct tcphdr)/4)
1610 goto bad_packet;
1611 if (!pskb_may_pull(skb, th->doff*4))
1612 goto discard_it;
1613
1614 /* An explanation is required here, I think.
1615 * Packet length and doff are validated by header prediction,
1616 * provided case of th->doff==0 is elimineted.
1617 * So, we defer the checks. */
1618 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1619 tcp_v4_checksum_init(skb) < 0))
1620 goto bad_packet;
1621
1622 th = skb->h.th;
1623 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1624 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1625 skb->len - th->doff*4);
1626 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1627 TCP_SKB_CB(skb)->when = 0;
1628 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1629 TCP_SKB_CB(skb)->sacked = 0;
1630
1631 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1632 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1633
1634 if (!sk)
1635 goto no_tcp_socket;
1636
1637 process:
1638 if(!ipsec_sk_policy(sk,skb))
1639 goto discard_and_relse;
1640
1641 if (sk->state == TCP_TIME_WAIT)
1642 goto do_time_wait;
1643
1644 skb->dev = NULL;
1645
1646 bh_lock_sock(sk);
1647 ret = 0;
1648 if (!sk->lock.users) {
1649 if (!tcp_prequeue(sk, skb))
1650 ret = tcp_v4_do_rcv(sk, skb);
1651 } else
1652 sk_add_backlog(sk, skb);
1653 bh_unlock_sock(sk);
1654
1655 sock_put(sk);
1656
1657 return ret;
1658
1659 no_tcp_socket:
1660 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1661 bad_packet:
1662 TCP_INC_STATS_BH(TcpInErrs);
1663 } else {
1664 tcp_v4_send_reset(skb);
1665 }
1666
1667 discard_it:
1668 /* Discard frame. */
1669 kfree_skb(skb);
1670 return 0;
1671
1672 discard_and_relse:
1673 sock_put(sk);
1674 goto discard_it;
1675
1676 do_time_wait:
1677 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1678 TCP_INC_STATS_BH(TcpInErrs);
1679 goto discard_and_relse;
1680 }
1681 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1682 skb, th, skb->len)) {
1683 case TCP_TW_SYN:
1684 {
1685 struct sock *sk2;
1686
1687 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1688 if (sk2 != NULL) {
1689 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1690 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1691 tcp_tw_put((struct tcp_tw_bucket *)sk);
1692 sk = sk2;
1693 goto process;
1694 }
1695 /* Fall through to ACK */
1696 }
1697 case TCP_TW_ACK:
1698 tcp_v4_timewait_ack(sk, skb);
1699 break;
1700 case TCP_TW_RST:
1701 goto no_tcp_socket;
1702 case TCP_TW_SUCCESS:;
1703 }
1704 goto discard_it;
1705 }
1706
1707 /* With per-bucket locks this operation is not-atomic, so that
1708 * this version is not worse.
1709 */
1710 static void __tcp_v4_rehash(struct sock *sk)
1711 {
1712 sk->prot->unhash(sk);
1713 sk->prot->hash(sk);
1714 }
1715
1716 static int tcp_v4_reselect_saddr(struct sock *sk)
1717 {
1718 int err;
1719 struct rtable *rt;
1720 __u32 old_saddr = sk->saddr;
1721 __u32 new_saddr;
1722 __u32 daddr = sk->daddr;
1723
1724 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1725 daddr = sk->protinfo.af_inet.opt->faddr;
1726
1727 /* Query new route. */
1728 err = ip_route_connect(&rt, daddr, 0,
1729 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1730 sk->bound_dev_if);
1731 if (err)
1732 return err;
1733
1734 __sk_dst_set(sk, &rt->u.dst);
1735 sk->route_caps = rt->u.dst.dev->features;
1736
1737 new_saddr = rt->rt_src;
1738
1739 if (new_saddr == old_saddr)
1740 return 0;
1741
1742 if (sysctl_ip_dynaddr > 1) {
1743 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1744 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1745 NIPQUAD(old_saddr),
1746 NIPQUAD(new_saddr));
1747 }
1748
1749 sk->saddr = new_saddr;
1750 sk->rcv_saddr = new_saddr;
1751
1752 /* XXX The only one ugly spot where we need to
1753 * XXX really change the sockets identity after
1754 * XXX it has entered the hashes. -DaveM
1755 *
1756 * Besides that, it does not check for connection
1757 * uniqueness. Wait for troubles.
1758 */
1759 __tcp_v4_rehash(sk);
1760 return 0;
1761 }
1762
1763 int tcp_v4_rebuild_header(struct sock *sk)
1764 {
1765 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1766 u32 daddr;
1767 int err;
1768
1769 /* Route is OK, nothing to do. */
1770 if (rt != NULL)
1771 return 0;
1772
1773 /* Reroute. */
1774 daddr = sk->daddr;
1775 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1776 daddr = sk->protinfo.af_inet.opt->faddr;
1777
1778 err = ip_route_output(&rt, daddr, sk->saddr,
1779 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1780 sk->bound_dev_if);
1781 if (!err) {
1782 __sk_dst_set(sk, &rt->u.dst);
1783 sk->route_caps = rt->u.dst.dev->features;
1784 return 0;
1785 }
1786
1787 /* Routing failed... */
1788 sk->route_caps = 0;
1789
1790 if (!sysctl_ip_dynaddr ||
1791 sk->state != TCP_SYN_SENT ||
1792 (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1793 (err = tcp_v4_reselect_saddr(sk)) != 0)
1794 sk->err_soft=-err;
1795
1796 return err;
1797 }
1798
1799 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1800 {
1801 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1802
1803 sin->sin_family = AF_INET;
1804 sin->sin_addr.s_addr = sk->daddr;
1805 sin->sin_port = sk->dport;
1806 }
1807
1808 /* VJ's idea. Save last timestamp seen from this destination
1809 * and hold it at least for normal timewait interval to use for duplicate
1810 * segment detection in subsequent connections, before they enter synchronized
1811 * state.
1812 */
1813
1814 int tcp_v4_remember_stamp(struct sock *sk)
1815 {
1816 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1817 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1818 struct inet_peer *peer = NULL;
1819 int release_it = 0;
1820
1821 if (rt == NULL || rt->rt_dst != sk->daddr) {
1822 peer = inet_getpeer(sk->daddr, 1);
1823 release_it = 1;
1824 } else {
1825 if (rt->peer == NULL)
1826 rt_bind_peer(rt, 1);
1827 peer = rt->peer;
1828 }
1829
1830 if (peer) {
1831 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1832 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1833 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1834 peer->tcp_ts_stamp = tp->ts_recent_stamp;
1835 peer->tcp_ts = tp->ts_recent;
1836 }
1837 if (release_it)
1838 inet_putpeer(peer);
1839 return 1;
1840 }
1841
1842 return 0;
1843 }
1844
1845 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1846 {
1847 struct inet_peer *peer = NULL;
1848
1849 peer = inet_getpeer(tw->daddr, 1);
1850
1851 if (peer) {
1852 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1853 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1854 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1855 peer->tcp_ts_stamp = tw->ts_recent_stamp;
1856 peer->tcp_ts = tw->ts_recent;
1857 }
1858 inet_putpeer(peer);
1859 return 1;
1860 }
1861
1862 return 0;
1863 }
1864
/*
 * IPv4 address-family operations for TCP sockets.  tcp_v4_init_sock()
 * stores the address of this table in each socket's af_specific
 * pointer.  The initializer is positional, so the entries must stay in
 * the order of the struct tcp_func declaration (declared outside this
 * file); the notes below describe the values, not the field names.
 */
1865 struct tcp_func ipv4_specific = {
1866 ip_queue_xmit,
1867 tcp_v4_send_check,
1868 tcp_v4_rebuild_header, /* re-routes the socket when its cached route is gone */
1869 tcp_v4_conn_request, /* handles a connection request on a listener */
1870 tcp_v4_syn_recv_sock, /* creates the child socket after the handshake */
1871 tcp_v4_hash_connecting,
1872 tcp_v4_remember_stamp, /* saves the last seen timestamp in the peer entry */
1873 sizeof(struct iphdr), /* size of the IPv4 header structure */
1874
1875 ip_setsockopt,
1876 ip_getsockopt,
1877 v4_addr2sockaddr, /* writes the peer address and port into a sockaddr_in */
1878 sizeof(struct sockaddr_in) /* size of the address written by the entry above */
1879 };
1880
1881 /* NOTE: A lot of things set to zero explicitly by call to
1882 * sk_alloc() so need not be done here.
1883 */
1884 static int tcp_v4_init_sock(struct sock *sk)
1885 {
1886 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1887
1888 skb_queue_head_init(&tp->out_of_order_queue);
1889 tcp_init_xmit_timers(sk);
1890 tcp_prequeue_init(tp);
1891
1892 tp->rto = TCP_TIMEOUT_INIT;
1893 tp->mdev = TCP_TIMEOUT_INIT;
1894
1895 /* So many TCP implementations out there (incorrectly) count the
1896 * initial SYN frame in their delayed-ACK and congestion control
1897 * algorithms that we must have the following bandaid to talk
1898 * efficiently to them. -DaveM
1899 */
1900 tp->snd_cwnd = 2;
1901
1902 /* See draft-stevens-tcpca-spec-01 for discussion of the
1903 * initialization of these values.
1904 */
1905 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1906 tp->snd_cwnd_clamp = ~0;
1907 tp->mss_cache = 536;
1908
1909 tp->reordering = sysctl_tcp_reordering;
1910
1911 sk->state = TCP_CLOSE;
1912
1913 sk->write_space = tcp_write_space;
1914 sk->use_write_queue = 1;
1915
1916 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1917
1918 sk->sndbuf = sysctl_tcp_wmem[1];
1919 sk->rcvbuf = sysctl_tcp_rmem[1];
1920
1921 atomic_inc(&tcp_sockets_allocated);
1922
1923 return 0;
1924 }
1925
1926 static int tcp_v4_destroy_sock(struct sock *sk)
1927 {
1928 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1929
1930 tcp_clear_xmit_timers(sk);
1931
1932 /* Cleanup up the write buffer. */
1933 tcp_writequeue_purge(sk);
1934
1935 /* Cleans up our, hopefully empty, out_of_order_queue. */
1936 __skb_queue_purge(&tp->out_of_order_queue);
1937
1938 /* Clean prequeue, it must be empty really */
1939 __skb_queue_purge(&tp->ucopy.prequeue);
1940
1941 /* Clean up a referenced TCP bind bucket. */
1942 if(sk->prev != NULL)
1943 tcp_put_port(sk);
1944
1945 /* If sendmsg cached page exists, toss it. */
1946 if (tp->sndmsg_page != NULL)
1947 __free_page(tp->sndmsg_page);
1948
1949 atomic_dec(&tcp_sockets_allocated);
1950
1951 return 0;
1952 }
1953
1954 /* Proc filesystem TCP sock list dumping. */
1955 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
1956 {
1957 int ttd = req->expires - jiffies;
1958
1959 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1960 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1961 i,
1962 req->af.v4_req.loc_addr,
1963 ntohs(sk->sport),
1964 req->af.v4_req.rmt_addr,
1965 ntohs(req->rmt_port),
1966 TCP_SYN_RECV,
1967 0,0, /* could print option size, but that is af dependent. */
1968 1, /* timers active (only the expire timer) */
1969 ttd,
1970 req->retrans,
1971 uid,
1972 0, /* non standard timer */
1973 0, /* open_requests have no inode */
1974 atomic_read(&sk->refcnt),
1975 req
1976 );
1977 }
1978
1979 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
1980 {
1981 unsigned int dest, src;
1982 __u16 destp, srcp;
1983 int timer_active;
1984 unsigned long timer_expires;
1985 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
1986
1987 dest = sp->daddr;
1988 src = sp->rcv_saddr;
1989 destp = ntohs(sp->dport);
1990 srcp = ntohs(sp->sport);
1991 if (tp->pending == TCP_TIME_RETRANS) {
1992 timer_active = 1;
1993 timer_expires = tp->timeout;
1994 } else if (tp->pending == TCP_TIME_PROBE0) {
1995 timer_active = 4;
1996 timer_expires = tp->timeout;
1997 } else if (timer_pending(&sp->timer)) {
1998 timer_active = 2;
1999 timer_expires = sp->timer.expires;
2000 } else {
2001 timer_active = 0;
2002 timer_expires = jiffies;
2003 }
2004
2005 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2006 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2007 i, src, srcp, dest, destp, sp->state,
2008 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2009 timer_active, timer_expires-jiffies,
2010 tp->retransmits,
2011 sock_i_uid(sp),
2012 tp->probes_out,
2013 sock_i_ino(sp),
2014 atomic_read(&sp->refcnt), sp,
2015 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2016 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2017 );
2018 }
2019
2020 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2021 {
2022 unsigned int dest, src;
2023 __u16 destp, srcp;
2024 int ttd = tw->ttd - jiffies;
2025
2026 if (ttd < 0)
2027 ttd = 0;
2028
2029 dest = tw->daddr;
2030 src = tw->rcv_saddr;
2031 destp = ntohs(tw->dport);
2032 srcp = ntohs(tw->sport);
2033
2034 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2035 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2036 i, src, srcp, dest, destp, tw->substate, 0, 0,
2037 3, ttd, 0, 0, 0, 0,
2038 atomic_read(&tw->refcnt), tw);
2039 }
2040
/* Size in bytes of one output record, trailing newline included. */
2041 #define TMPSZ 150
2042
/*
 * Produce the TCP socket table as fixed-width text records.
 *
 * Every record, the header line included, is padded to TMPSZ-1
 * characters and followed by a newline.  "pos" is the offset in the
 * virtual file just past the record currently being considered;
 * records lying entirely before "offset" are skipped, and the walk
 * stops once "pos" reaches offset + length.
 *
 * Walk order: the listening hash (each listener followed by the
 * open_requests in its SYN table), then each established hash bucket
 * followed by the time-wait chain at index i + tcp_ehash_size.
 *
 * On return *start points into "buffer" at the data for "offset"; the
 * return value is the number of bytes available there, clamped to the
 * range [0, length].
 */
2043 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2044 {
2045 int len = 0, num = 0, i;
2046 off_t begin, pos = 0;
2047 char tmpbuf[TMPSZ+1];
2048
/* The header line is emitted only when the read begins inside it. */
2049 if (offset < TMPSZ)
2050 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2051 " sl local_address rem_address st tx_queue "
2052 "rx_queue tr tm->when retrnsmt uid timeout inode");
2053
2054 pos = TMPSZ;
2055
2056 /* First, walk listening socket table. */
2057 tcp_listen_lock();
2058 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2059 struct sock *sk = tcp_listening_hash[i];
2060 struct tcp_listen_opt *lopt;
2061 int k;
2062
2063 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2064 struct open_request *req;
2065 int uid;
2066 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2067
/* A listener of another family is not printed itself, but its SYN table is still walked below. */
2068 if (!TCP_INET_FAMILY(sk->family))
2069 goto skip_listen;
2070
2071 pos += TMPSZ;
/* NOTE(review): this test is ">=" while the other walks skip on "pos <= offset"; kept as is. */
2072 if (pos >= offset) {
2073 get_tcp_sock(sk, tmpbuf, num);
2074 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2075 if (pos >= offset + length) {
2076 tcp_listen_unlock();
2077 goto out_no_bh;
2078 }
2079 }
2080
2081 skip_listen:
2082 uid = sock_i_uid(sk);
2083 read_lock_bh(&tp->syn_wait_lock);
2084 lopt = tp->listen_opt;
2085 if (lopt && lopt->qlen != 0) {
2086 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2087 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2088 if (!TCP_INET_FAMILY(req->class->family))
2089 continue;
2090
2091 pos += TMPSZ;
2092 if (pos <= offset)
2093 continue;
2094 get_openreq(sk, req, tmpbuf, num, uid);
2095 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2096 if (pos >= offset + length) {
/* Early exit: drop both locks taken above before leaving. */
2097 read_unlock_bh(&tp->syn_wait_lock);
2098 tcp_listen_unlock();
2099 goto out_no_bh;
2100 }
2101 }
2102 }
2103 }
2104 read_unlock_bh(&tp->syn_wait_lock);
2105
2106 /* Completed requests are in normal socket hash table */
2107 }
2108 }
2109 tcp_listen_unlock();
2110
/* Bottom halves stay disabled for the whole established/time-wait walk; re-enabled at "out". */
2111 local_bh_disable();
2112
2113 /* Next, walk established hash chain. */
2114 for (i = 0; i < tcp_ehash_size; i++) {
2115 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2116 struct sock *sk;
2117 struct tcp_tw_bucket *tw;
2118
2119 read_lock(&head->lock);
2120 for(sk = head->chain; sk; sk = sk->next, num++) {
2121 if (!TCP_INET_FAMILY(sk->family))
2122 continue;
2123 pos += TMPSZ;
2124 if (pos <= offset)
2125 continue;
2126 get_tcp_sock(sk, tmpbuf, num);
2127 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2128 if (pos >= offset + length) {
2129 read_unlock(&head->lock);
2130 goto out;
2131 }
2132 }
/* The time-wait chain of bucket i is walked while still holding head->lock. */
2133 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2134 tw != NULL;
2135 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2136 if (!TCP_INET_FAMILY(tw->family))
2137 continue;
2138 pos += TMPSZ;
2139 if (pos <= offset)
2140 continue;
2141 get_timewait_sock(tw, tmpbuf, num);
2142 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2143 if (pos >= offset + length) {
2144 read_unlock(&head->lock);
2145 goto out;
2146 }
2147 }
2148 read_unlock(&head->lock);
2149 }
2150
2151 out:
2152 local_bh_enable();
2153 out_no_bh:
2154
/* "begin" is the index in buffer of the byte at file offset "offset": len bytes were written and they end at file position "pos". */
2155 begin = len - (pos - offset);
2156 *start = buffer + begin;
2157 len -= begin;
2158 if (len > length)
2159 len = length;
2160 if (len < 0)
2161 len = 0;
2162 return len;
2163 }
2164
2165 struct proto tcp_prot = {
2166 name: "TCP",
2167 close: tcp_close,
2168 connect: tcp_v4_connect,
2169 disconnect: tcp_disconnect,
2170 accept: tcp_accept,
2171 ioctl: tcp_ioctl,
2172 init: tcp_v4_init_sock,
2173 destroy: tcp_v4_destroy_sock,
2174 shutdown: tcp_shutdown,
2175 setsockopt: tcp_setsockopt,
2176 getsockopt: tcp_getsockopt,
2177 sendmsg: tcp_sendmsg,
2178 recvmsg: tcp_recvmsg,
2179 backlog_rcv: tcp_v4_do_rcv,
2180 hash: tcp_v4_hash,
2181 unhash: tcp_unhash,
2182 get_port: tcp_v4_get_port,
2183 };
2184
2185
2186
2187 void __init tcp_v4_init(struct net_proto_family *ops)
2188 {
2189 int err;
2190
2191 tcp_inode.i_mode = S_IFSOCK;
2192 tcp_inode.i_sock = 1;
2193 tcp_inode.i_uid = 0;
2194 tcp_inode.i_gid = 0;
2195 init_waitqueue_head(&tcp_inode.i_wait);
2196 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2197
2198 tcp_socket->inode = &tcp_inode;
2199 tcp_socket->state = SS_UNCONNECTED;
2200 tcp_socket->type=SOCK_RAW;
2201
2202 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2203 panic("Failed to create the TCP control socket.\n");
2204 tcp_socket->sk->allocation=GFP_ATOMIC;
2205 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2206
2207 /* Unhash it so that IP input processing does not even
2208 * see it, we do not wish this socket to see incoming
2209 * packets.
2210 */
2211 tcp_socket->sk->prot->unhash(tcp_socket->sk);
2212 }
2213