File: /usr/src/linux/net/ipv4/tcp_ipv4.c

1     /*
2      * INET		An implementation of the TCP/IP protocol suite for the LINUX
3      *		operating system.  INET is implemented using the  BSD Socket
4      *		interface as the means of communication with the user level.
5      *
6      *		Implementation of the Transmission Control Protocol(TCP).
7      *
8      * Version:	$Id: tcp_ipv4.c,v 1.230 2001/09/01 00:31:50 davem Exp $
9      *
10      *		IPv4 specific functions
11      *
12      *
13      *		code split from:
14      *		linux/ipv4/tcp.c
15      *		linux/ipv4/tcp_input.c
16      *		linux/ipv4/tcp_output.c
17      *
18      *		See tcp.c for author information
19      *
20      *	This program is free software; you can redistribute it and/or
21      *      modify it under the terms of the GNU General Public License
22      *      as published by the Free Software Foundation; either version
23      *      2 of the License, or (at your option) any later version.
24      */
25     
26     /*
27      * Changes:
28      *		David S. Miller	:	New socket lookup architecture.
29      *					This code is dedicated to John Dyson.
30      *		David S. Miller :	Change semantics of established hash,
31      *					half is devoted to TIME_WAIT sockets
32      *					and the rest go in the other half.
33      *		Andi Kleen :		Add support for syncookies and fixed
34      *					some bugs: ip options weren't passed to
35      *					the TCP layer, missed a check for an ACK bit.
36      *		Andi Kleen :		Implemented fast path mtu discovery.
37      *	     				Fixed many serious bugs in the
38      *					open_request handling and moved
39      *					most of it into the af independent code.
40      *					Added tail drop and some other bugfixes.
41      *					Added new listen semantics.
42      *		Mike McLagan	:	Routing by source
43      *	Juan Jose Ciarlante:		ip_dynaddr bits
44      *		Andi Kleen:		various fixes.
45      *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
46      *	Andi Kleen		:	Fix new listen.
47      *	Andi Kleen		:	Fix accept error reporting.
48      */
49     
50     #include <linux/config.h>
51     #include <linux/types.h>
52     #include <linux/fcntl.h>
53     #include <linux/random.h>
54     #include <linux/cache.h>
55     #include <linux/init.h>
56     
57     #include <net/icmp.h>
58     #include <net/tcp.h>
59     #include <net/ipv6.h>
60     #include <net/inet_common.h>
61     
62     #include <linux/inet.h>
63     #include <linux/stddef.h>
64     #include <linux/ipsec.h>
65     
66     extern int sysctl_ip_dynaddr;
67     
68     /* Check TCP sequence numbers in ICMP packets. */
69     #define ICMP_MIN_LENGTH 8
70     
/* Socket used for sending RSTs.
 *
 * tcp_inode is a file-private inode whose only purpose here is to provide
 * storage for a struct socket; tcp_socket points at that embedded socket
 * (tcp_inode.u.socket_i).
 */
static struct inode tcp_inode;
static struct socket *tcp_socket=&tcp_inode.u.socket_i;
74     
75     void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 
76     		       struct sk_buff *skb);
77     
/*
 * Global TCP hash state: the established and bind hash tables with their
 * sizes, the listening hash, and the locks/wait queue that guard them.
 *
 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
 *
 * The table pointers and sizes start out as NULL/0 here.
 * NOTE(review): they are presumably filled in by init code elsewhere in
 * this file -- not visible from this section, confirm.
 */
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	__tcp_ehash:          NULL,
	__tcp_bhash:          NULL,
	__tcp_bhash_size:     0,
	__tcp_ehash_size:     0,
	__tcp_listening_hash: { NULL, },
	__tcp_lhash_lock:     RW_LOCK_UNLOCKED,
	__tcp_lhash_users:    ATOMIC_INIT(0),
	__tcp_lhash_wait:
	  __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	__tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
};
93     
/*
 * This array holds the first and last local port number:
 * element 0 is the lowest and element 1 the highest port that automatic
 * port selection (tcp_v4_get_port() with snum == 0) will hand out.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 *
 * tcp_port_rover remembers the last port tried by automatic selection;
 * the search pre-increments it, hence the initial value of one below the
 * default low bound.  tcp_v4_get_port() reads and writes it under
 * tcp_portalloc_lock.
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = (1024 - 1);
101     
102     static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
103     				 __u32 faddr, __u16 fport)
104     {
105     	int h = ((laddr ^ lport) ^ (faddr ^ fport));
106     	h ^= h>>16;
107     	h ^= h>>8;
108     	return h & (tcp_ehash_size - 1);
109     }
110     
111     static __inline__ int tcp_sk_hashfn(struct sock *sk)
112     {
113     	__u32 laddr = sk->rcv_saddr;
114     	__u16 lport = sk->num;
115     	__u32 faddr = sk->daddr;
116     	__u16 fport = sk->dport;
117     
118     	return tcp_hashfn(laddr, lport, faddr, fport);
119     }
120     
121     /* Allocate and initialize a new TCP local port bind bucket.
122      * The bindhash mutex for snum's hash chain must be held here.
123      */
124     struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
125     					  unsigned short snum)
126     {
127     	struct tcp_bind_bucket *tb;
128     
129     	tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
130     	if(tb != NULL) {
131     		tb->port = snum;
132     		tb->fastreuse = 0;
133     		tb->owners = NULL;
134     		if((tb->next = head->chain) != NULL)
135     			tb->next->pprev = &tb->next;
136     		head->chain = tb;
137     		tb->pprev = &head->chain;
138     	}
139     	return tb;
140     }
141     
142     /* Caller must disable local BH processing. */
143     static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
144     {
145     	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
146     	struct tcp_bind_bucket *tb;
147     
148     	spin_lock(&head->lock);
149     	tb = (struct tcp_bind_bucket *)sk->prev;
150     	if ((child->bind_next = tb->owners) != NULL)
151     		tb->owners->bind_pprev = &child->bind_next;
152     	tb->owners = child;
153     	child->bind_pprev = &tb->owners;
154     	child->prev = (struct sock *) tb;
155     	spin_unlock(&head->lock);
156     }
157     
/* BH-safe wrapper: let CHILD share the local port bucket owned by SK.
 * Disables local bottom halves around __tcp_inherit_port().
 */
__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}
164     
165     /* Obtain a reference to a local port for the given sock,
166      * if snum is zero it means select any available local port.
167      */
168     static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
169     {
170     	struct tcp_bind_hashbucket *head;
171     	struct tcp_bind_bucket *tb;
172     	int ret;
173     
174     	local_bh_disable();
175     	if (snum == 0) {
176     		int low = sysctl_local_port_range[0];
177     		int high = sysctl_local_port_range[1];
178     		int remaining = (high - low) + 1;
179     		int rover;
180     
181     		spin_lock(&tcp_portalloc_lock);
182     		rover = tcp_port_rover;
183     		do {	rover++;
184     			if ((rover < low) || (rover > high))
185     				rover = low;
186     			head = &tcp_bhash[tcp_bhashfn(rover)];
187     			spin_lock(&head->lock);
188     			for (tb = head->chain; tb; tb = tb->next)
189     				if (tb->port == rover)
190     					goto next;
191     			break;
192     		next:
193     			spin_unlock(&head->lock);
194     		} while (--remaining > 0);
195     		tcp_port_rover = rover;
196     		spin_unlock(&tcp_portalloc_lock);
197     
198     		/* Exhausted local port range during search? */
199     		ret = 1;
200     		if (remaining <= 0)
201     			goto fail;
202     
203     		/* OK, here is the one we will use.  HEAD is
204     		 * non-NULL and we hold it's mutex.
205     		 */
206     		snum = rover;
207     		tb = NULL;
208     	} else {
209     		head = &tcp_bhash[tcp_bhashfn(snum)];
210     		spin_lock(&head->lock);
211     		for (tb = head->chain; tb != NULL; tb = tb->next)
212     			if (tb->port == snum)
213     				break;
214     	}
215     	if (tb != NULL && tb->owners != NULL) {
216     		if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
217     			goto success;
218     		} else {
219     			struct sock *sk2 = tb->owners;
220     			int sk_reuse = sk->reuse;
221     
222     			for( ; sk2 != NULL; sk2 = sk2->bind_next) {
223     				if (sk != sk2 &&
224     				    sk->bound_dev_if == sk2->bound_dev_if) {
225     					if (!sk_reuse	||
226     					    !sk2->reuse	||
227     					    sk2->state == TCP_LISTEN) {
228     						if (!sk2->rcv_saddr	||
229     						    !sk->rcv_saddr	||
230     						    (sk2->rcv_saddr == sk->rcv_saddr))
231     							break;
232     					}
233     				}
234     			}
235     			/* If we found a conflict, fail. */
236     			ret = 1;
237     			if (sk2 != NULL)
238     				goto fail_unlock;
239     		}
240     	}
241     	ret = 1;
242     	if (tb == NULL &&
243     	    (tb = tcp_bucket_create(head, snum)) == NULL)
244     			goto fail_unlock;
245     	if (tb->owners == NULL) {
246     		if (sk->reuse && sk->state != TCP_LISTEN)
247     			tb->fastreuse = 1;
248     		else
249     			tb->fastreuse = 0;
250     	} else if (tb->fastreuse &&
251     		   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
252     		tb->fastreuse = 0;
253     success:
254     	sk->num = snum;
255     	if (sk->prev == NULL) {
256     		if ((sk->bind_next = tb->owners) != NULL)
257     			tb->owners->bind_pprev = &sk->bind_next;
258     		tb->owners = sk;
259     		sk->bind_pprev = &tb->owners;
260     		sk->prev = (struct sock *) tb;
261     	} else {
262     		BUG_TRAP(sk->prev == (struct sock *) tb);
263     	}
264     	ret = 0;
265     
266     fail_unlock:
267     	spin_unlock(&head->lock);
268     fail:
269     	local_bh_enable();
270     	return ret;
271     }
272     
273     /* Get rid of any references to a local port held by the
274      * given sock.
275      */
276     __inline__ void __tcp_put_port(struct sock *sk)
277     {
278     	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
279     	struct tcp_bind_bucket *tb;
280     
281     	spin_lock(&head->lock);
282     	tb = (struct tcp_bind_bucket *) sk->prev;
283     	if (sk->bind_next)
284     		sk->bind_next->bind_pprev = sk->bind_pprev;
285     	*(sk->bind_pprev) = sk->bind_next;
286     	sk->prev = NULL;
287     	sk->num = 0;
288     	if (tb->owners == NULL) {
289     		if (tb->next)
290     			tb->next->pprev = tb->pprev;
291     		*(tb->pprev) = tb->next;
292     		kmem_cache_free(tcp_bucket_cachep, tb);
293     	}
294     	spin_unlock(&head->lock);
295     }
296     
/* BH-safe wrapper around __tcp_put_port(): release sk's local port with
 * local bottom halves disabled.
 */
void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
303     
304     /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
305      * Look, when several writers sleep and reader wakes them up, all but one
306      * immediately hit write lock and grab all the cpus. Exclusive sleep solves
307      * this, _but_ remember, it adds useless work on UP machines (wake up each
308      * exclusive lock release). It should be ifdefed really.
309      */
310     
311     void tcp_listen_wlock(void)
312     {
313     	write_lock(&tcp_lhash_lock);
314     
315     	if (atomic_read(&tcp_lhash_users)) {
316     		DECLARE_WAITQUEUE(wait, current);
317     
318     		add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
319     		for (;;) {
320     			set_current_state(TASK_UNINTERRUPTIBLE);
321     			if (atomic_read(&tcp_lhash_users) == 0)
322     				break;
323     			write_unlock_bh(&tcp_lhash_lock);
324     			schedule();
325     			write_lock_bh(&tcp_lhash_lock);
326     		}
327     
328     		__set_current_state(TASK_RUNNING);
329     		remove_wait_queue(&tcp_lhash_wait, &wait);
330     	}
331     }
332     
333     static __inline__ void __tcp_v4_hash(struct sock *sk)
334     {
335     	struct sock **skp;
336     	rwlock_t *lock;
337     
338     	BUG_TRAP(sk->pprev==NULL);
339     	if(sk->state == TCP_LISTEN) {
340     		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
341     		lock = &tcp_lhash_lock;
342     		tcp_listen_wlock();
343     	} else {
344     		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
345     		lock = &tcp_ehash[sk->hashent].lock;
346     		write_lock(lock);
347     	}
348     	if((sk->next = *skp) != NULL)
349     		(*skp)->pprev = &sk->next;
350     	*skp = sk;
351     	sk->pprev = skp;
352     	sock_prot_inc_use(sk->prot);
353     	write_unlock(lock);
354     	if (sk->state == TCP_LISTEN)
355     		wake_up(&tcp_lhash_wait);
356     }
357     
358     static void tcp_v4_hash(struct sock *sk)
359     {
360     	if (sk->state != TCP_CLOSE) {
361     		local_bh_disable();
362     		__tcp_v4_hash(sk);
363     		local_bh_enable();
364     	}
365     }
366     
367     void tcp_unhash(struct sock *sk)
368     {
369     	rwlock_t *lock;
370     
371     	if (sk->state == TCP_LISTEN) {
372     		local_bh_disable();
373     		tcp_listen_wlock();
374     		lock = &tcp_lhash_lock;
375     	} else {
376     		struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
377     		lock = &head->lock;
378     		write_lock_bh(&head->lock);
379     	}
380     
381     	if(sk->pprev) {
382     		if(sk->next)
383     			sk->next->pprev = sk->pprev;
384     		*sk->pprev = sk->next;
385     		sk->pprev = NULL;
386     		sock_prot_dec_use(sk->prot);
387     	}
388     	write_unlock_bh(lock);
389     	if (sk->state == TCP_LISTEN)
390     		wake_up(&tcp_lhash_wait);
391     }
392     
393     /* Don't inline this cruft.  Here are some nice properties to
394      * exploit here.  The BSD API does not allow a listening TCP
395      * to specify the remote port nor the remote address for the
396      * connection.  So always assume those are both wildcarded
397      * during the search since they can never be otherwise.
398      */
399     static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
400     {
401     	struct sock *result = NULL;
402     	int score, hiscore;
403     
404     	hiscore=0;
405     	for(; sk; sk = sk->next) {
406     		if(sk->num == hnum) {
407     			__u32 rcv_saddr = sk->rcv_saddr;
408     
409     			score = 1;
410     			if(rcv_saddr) {
411     				if (rcv_saddr != daddr)
412     					continue;
413     				score++;
414     			}
415     			if (sk->bound_dev_if) {
416     				if (sk->bound_dev_if != dif)
417     					continue;
418     				score++;
419     			}
420     			if (score == 3)
421     				return sk;
422     			if (score > hiscore) {
423     				hiscore = score;
424     				result = sk;
425     			}
426     		}
427     	}
428     	return result;
429     }
430     
431     /* Optimize the common listener case. */
432     __inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
433     {
434     	struct sock *sk;
435     
436     	read_lock(&tcp_lhash_lock);
437     	sk = tcp_listening_hash[tcp_lhashfn(hnum)];
438     	if (sk) {
439     		if (sk->num == hnum &&
440     		    sk->next == NULL &&
441     		    (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
442     		    !sk->bound_dev_if)
443     			goto sherry_cache;
444     		sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
445     	}
446     	if (sk) {
447     sherry_cache:
448     		sock_hold(sk);
449     	}
450     	read_unlock(&tcp_lhash_lock);
451     	return sk;
452     }
453     
454     /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
455      * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
456      *
457      * Local BH must be disabled here.
458      */
459     
460     static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
461     						       u32 daddr, u16 hnum, int dif)
462     {
463     	struct tcp_ehash_bucket *head;
464     	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
465     	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
466     	struct sock *sk;
467     	int hash;
468     
469     	/* Optimize here for direct hit, only listening connections can
470     	 * have wildcards anyways.
471     	 */
472     	hash = tcp_hashfn(daddr, hnum, saddr, sport);
473     	head = &tcp_ehash[hash];
474     	read_lock(&head->lock);
475     	for(sk = head->chain; sk; sk = sk->next) {
476     		if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
477     			goto hit; /* You sunk my battleship! */
478     	}
479     
480     	/* Must check for a TIME_WAIT'er before going to listener hash. */
481     	for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
482     		if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
483     			goto hit;
484     	read_unlock(&head->lock);
485     
486     	return NULL;
487     
488     hit:
489     	sock_hold(sk);
490     	read_unlock(&head->lock);
491     	return sk;
492     }
493     
494     static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
495     					   u32 daddr, u16 hnum, int dif)
496     {
497     	struct sock *sk;
498     
499     	sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
500     
501     	if (sk)
502     		return sk;
503     		
504     	return tcp_v4_lookup_listener(daddr, hnum, dif);
505     }
506     
507     __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
508     {
509     	struct sock *sk;
510     
511     	local_bh_disable();
512     	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
513     	local_bh_enable();
514     
515     	return sk;
516     }
517     
518     static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
519     {
520     	return secure_tcp_sequence_number(skb->nh.iph->daddr,
521     					  skb->nh.iph->saddr,
522     					  skb->h.th->dest,
523     					  skb->h.th->source);
524     }
525     
526     static int tcp_v4_check_established(struct sock *sk)
527     {
528     	u32 daddr = sk->rcv_saddr;
529     	u32 saddr = sk->daddr;
530     	int dif = sk->bound_dev_if;
531     	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
532     	__u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
533     	int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
534     	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
535     	struct sock *sk2, **skp;
536     	struct tcp_tw_bucket *tw;
537     
538     	write_lock_bh(&head->lock);
539     
540     	/* Check TIME-WAIT sockets first. */
541     	for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
542     	    skp = &sk2->next) {
543     		tw = (struct tcp_tw_bucket*)sk2;
544     
545     		if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
546     			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
547     
548     			/* With PAWS, it is safe from the viewpoint
549     			   of data integrity. Even without PAWS it
550     			   is safe provided sequence spaces do not
551     			   overlap i.e. at data rates <= 80Mbit/sec.
552     
553     			   Actually, the idea is close to VJ's one,
554     			   only timestamp cache is held not per host,
555     			   but per port pair and TW bucket is used
556     			   as state holder.
557     
558     			   If TW bucket has been already destroyed we
559     			   fall back to VJ's scheme and use initial
560     			   timestamp retrieved from peer table.
561     			 */
562     			if (tw->ts_recent_stamp) {
563     				if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
564     					tp->write_seq = 1;
565     				tp->ts_recent = tw->ts_recent;
566     				tp->ts_recent_stamp = tw->ts_recent_stamp;
567     				sock_hold(sk2);
568     				skp = &head->chain;
569     				goto unique;
570     			} else
571     				goto not_unique;
572     		}
573     	}
574     	tw = NULL;
575     
576     	/* And established part... */
577     	for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
578     		if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
579     			goto not_unique;
580     	}
581     
582     unique:
583     	BUG_TRAP(sk->pprev==NULL);
584     	if ((sk->next = *skp) != NULL)
585     		(*skp)->pprev = &sk->next;
586     
587     	*skp = sk;
588     	sk->pprev = skp;
589     	sk->hashent = hash;
590     	sock_prot_inc_use(sk->prot);
591     	write_unlock_bh(&head->lock);
592     
593     	if (tw) {
594     		/* Silly. Should hash-dance instead... */
595     		local_bh_disable();
596     		tcp_tw_deschedule(tw);
597     		tcp_timewait_kill(tw);
598     		NET_INC_STATS_BH(TimeWaitRecycled);
599     		local_bh_enable();
600     
601     		tcp_tw_put(tw);
602     	}
603     
604     	return 0;
605     
606     not_unique:
607     	write_unlock_bh(&head->lock);
608     	return -EADDRNOTAVAIL;
609     }
610     
611     /* Hash SYN-SENT socket to established hash table after
612      * checking that it is unique. Note, that without kernel lock
613      * we MUST make these two operations atomically.
614      *
615      * Optimization: if it is bound and tcp_bind_bucket has the only
616      * owner (us), we need not to scan established bucket.
617      */
618     
619     int tcp_v4_hash_connecting(struct sock *sk)
620     {
621     	unsigned short snum = sk->num;
622     	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
623     	struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
624     
625     	spin_lock_bh(&head->lock);
626     	if (tb->owners == sk && sk->bind_next == NULL) {
627     		__tcp_v4_hash(sk);
628     		spin_unlock_bh(&head->lock);
629     		return 0;
630     	} else {
631     		spin_unlock_bh(&head->lock);
632     
633     		/* No definite answer... Walk to established hash table */
634     		return tcp_v4_check_established(sk);
635     	}
636     }
637     
638     /* This will initiate an outgoing connection. */
639     int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
640     {
641     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
642     	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
643     	struct sk_buff *buff;
644     	struct rtable *rt;
645     	u32 daddr, nexthop;
646     	int tmp;
647     	int err;
648     
649     	if (addr_len < sizeof(struct sockaddr_in))
650     		return(-EINVAL);
651     
652     	if (usin->sin_family != AF_INET)
653     		return(-EAFNOSUPPORT);
654     
655     	nexthop = daddr = usin->sin_addr.s_addr;
656     	if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
657     		if (daddr == 0)
658     			return -EINVAL;
659     		nexthop = sk->protinfo.af_inet.opt->faddr;
660     	}
661     
662     	tmp = ip_route_connect(&rt, nexthop, sk->saddr,
663     			       RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
664     	if (tmp < 0)
665     		return tmp;
666     
667     	if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
668     		ip_rt_put(rt);
669     		return -ENETUNREACH;
670     	}
671     
672     	__sk_dst_set(sk, &rt->u.dst);
673     	sk->route_caps = rt->u.dst.dev->features;
674     
675     	if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
676     		daddr = rt->rt_dst;
677     
678     	err = -ENOBUFS;
679     	buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);
680     
681     	if (buff == NULL)
682     		goto failure;
683     
684     	if (!sk->saddr)
685     		sk->saddr = rt->rt_src;
686     	sk->rcv_saddr = sk->saddr;
687     
688     	if (tp->ts_recent_stamp && sk->daddr != daddr) {
689     		/* Reset inherited state */
690     		tp->ts_recent = 0;
691     		tp->ts_recent_stamp = 0;
692     		tp->write_seq = 0;
693     	}
694     
695     	if (sysctl_tcp_tw_recycle &&
696     	    !tp->ts_recent_stamp &&
697     	    rt->rt_dst == daddr) {
698     		struct inet_peer *peer = rt_get_peer(rt);
699     
700     		/* VJ's idea. We save last timestamp seen from
701     		 * the destination in peer table, when entering state TIME-WAIT
702     		 * and initialize ts_recent from it, when trying new connection.
703     		 */
704     
705     		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
706     			tp->ts_recent_stamp = peer->tcp_ts_stamp;
707     			tp->ts_recent = peer->tcp_ts;
708     		}
709     	}
710     
711     	sk->dport = usin->sin_port;
712     	sk->daddr = daddr;
713     
714     	if (!tp->write_seq)
715     		tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
716     							   sk->sport, usin->sin_port);
717     
718     	tp->ext_header_len = 0;
719     	if (sk->protinfo.af_inet.opt)
720     		tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
721     	sk->protinfo.af_inet.id = tp->write_seq^jiffies;
722     
723     	tp->mss_clamp = 536;
724     
725     	err = tcp_connect(sk, buff);
726     	if (err == 0)
727     		return 0;
728     
729     failure:
730     	__sk_dst_reset(sk);
731     	sk->route_caps = 0;
732     	sk->dport = 0;
733     	return err;
734     }
735     
736     static __inline__ int tcp_v4_iif(struct sk_buff *skb)
737     {
738     	return ((struct rtable*)skb->dst)->rt_iif;
739     }
740     
741     static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
742     {
743     	unsigned h = raddr ^ rport;
744     	h ^= h>>16;
745     	h ^= h>>8;
746     	return h&(TCP_SYNQ_HSIZE-1);
747     }
748     
749     static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, 
750     					      struct iphdr *iph,
751     					      struct tcphdr *th,
752     					      struct open_request ***prevp)
753     {
754     	struct tcp_listen_opt *lopt = tp->listen_opt;
755     	struct open_request *req, **prev;  
756     	__u16 rport = th->source;
757     	__u32 raddr = iph->saddr;
758     
759     	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
760     	     (req = *prev) != NULL;
761     	     prev = &req->dl_next) {
762     		if (req->rmt_port == rport &&
763     		    req->af.v4_req.rmt_addr == raddr &&
764     		    req->af.v4_req.loc_addr == iph->daddr &&
765     		    TCP_INET_FAMILY(req->class->family)) {
766     			BUG_TRAP(req->sk == NULL);
767     			*prevp = prev;
768     			return req; 
769     		}
770     	}
771     
772     	return NULL;
773     }
774     
775     static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
776     {
777     	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
778     	struct tcp_listen_opt *lopt = tp->listen_opt;
779     	unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
780     
781     	req->expires = jiffies + TCP_TIMEOUT_INIT;
782     	req->retrans = 0;
783     	req->sk = NULL;
784     	req->index = h;
785     	req->dl_next = lopt->syn_table[h];
786     
787     	write_lock(&tp->syn_wait_lock);
788     	lopt->syn_table[h] = req;
789     	write_unlock(&tp->syn_wait_lock);
790     
791     	tcp_synq_added(sk);
792     }
793     
794     
795     /* 
796      * This routine does path mtu discovery as defined in RFC1191.
797      */
798     static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
799     {
800     	struct dst_entry *dst;
801     	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
802     
803     	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
804     	 * send out by Linux are always <576bytes so they should go through
805     	 * unfragmented).
806     	 */
807     	if (sk->state == TCP_LISTEN)
808     		return; 
809     
810     	/* We don't check in the destentry if pmtu discovery is forbidden
811     	 * on this route. We just assume that no packet_to_big packets
812     	 * are send back when pmtu discovery is not active.
813          	 * There is a small race when the user changes this flag in the
814     	 * route, but I think that's acceptable.
815     	 */
816     	if ((dst = __sk_dst_check(sk, 0)) == NULL)
817     		return;
818     
819     	ip_rt_update_pmtu(dst, mtu);
820     
821     	/* Something is about to be wrong... Remember soft error
822     	 * for the case, if this connection will not able to recover.
823     	 */
824     	if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
825     		sk->err_soft = EMSGSIZE;
826     
827     	if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
828     	    tp->pmtu_cookie > dst->pmtu) {
829     		tcp_sync_mss(sk, dst->pmtu);
830     
831     		/* Resend the TCP packet because it's  
832     		 * clear that the old packet has been
833     		 * dropped. This is the new "fast" path mtu
834     		 * discovery.
835     		 */
836     		tcp_simple_retransmit(sk);
837     	} /* else let the usual retransmit timer handle it */
838     }
839     
840     /*
841      * This routine is called by the ICMP module when it gets some
842      * sort of error condition.  If err < 0 then the socket should
843      * be closed and the error returned to the user.  If err > 0
844      * it's just the icmp type << 8 | icmp code.  After adjustment
845      * header points to the first 8 bytes of the tcp header.  We need
846      * to find the appropriate port.
847      *
848      * The locking strategy used here is very "optimistic". When
849      * someone else accesses the socket the ICMP is just dropped
850      * and for some paths there is no check at all.
851      * A more general error queue to queue errors for later handling
852      * is probably better.
853      *
854      */
855     
856     void tcp_v4_err(struct sk_buff *skb, u32 info)
857     {
858     	struct iphdr *iph = (struct iphdr*)skb->data;
859     	struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
860     	struct tcp_opt *tp;
861     	int type = skb->h.icmph->type;
862     	int code = skb->h.icmph->code;
863     	struct sock *sk;
864     	__u32 seq;
865     	int err;
866     
867     	if (skb->len < (iph->ihl << 2) + 8) {
868     		ICMP_INC_STATS_BH(IcmpInErrors); 
869     		return;
870     	}
871     
872     	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
873     	if (sk == NULL) {
874     		ICMP_INC_STATS_BH(IcmpInErrors);
875     		return;
876     	}
877     	if (sk->state == TCP_TIME_WAIT) {
878     		tcp_tw_put((struct tcp_tw_bucket*)sk);
879     		return;
880     	}
881     
882     	bh_lock_sock(sk);
883     	/* If too many ICMPs get dropped on busy
884     	 * servers this needs to be solved differently.
885     	 */
886     	if (sk->lock.users != 0)
887     		NET_INC_STATS_BH(LockDroppedIcmps);
888     
889     	if (sk->state == TCP_CLOSE)
890     		goto out;
891     
892     	tp = &sk->tp_pinfo.af_tcp;
893     	seq = ntohl(th->seq);
894     	if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
895     		NET_INC_STATS(OutOfWindowIcmps);
896     		goto out;
897     	}
898     
899     	switch (type) {
900     	case ICMP_SOURCE_QUENCH:
901     		/* This is deprecated, but if someone generated it,
902     		 * we have no reasons to ignore it.
903     		 */
904     		if (sk->lock.users == 0)
905     			tcp_enter_cwr(tp);
906     		goto out;
907     	case ICMP_PARAMETERPROB:
908     		err = EPROTO;
909     		break; 
910     	case ICMP_DEST_UNREACH:
911     		if (code > NR_ICMP_UNREACH)
912     			goto out;
913     
914     		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
915     			if (sk->lock.users == 0)
916     				do_pmtu_discovery(sk, iph, info);
917     			goto out;
918     		}
919     
920     		err = icmp_err_convert[code].errno;
921     		break;
922     	case ICMP_TIME_EXCEEDED:
923     		err = EHOSTUNREACH;
924     		break;
925     	default:
926     		goto out;
927     	}
928     
929     	switch (sk->state) {
930     		struct open_request *req, **prev;
931     	case TCP_LISTEN:
932     		if (sk->lock.users != 0)
933     			goto out;
934     
935     		req = tcp_v4_search_req(tp, iph, th, &prev); 
936     		if (!req)
937     			goto out;
938     
939     		/* ICMPs are not backlogged, hence we cannot get
940     		   an established socket here.
941     		 */
942     		BUG_TRAP(req->sk == NULL);
943     
944     		if (seq != req->snt_isn) {
945     			NET_INC_STATS_BH(OutOfWindowIcmps);
946     			goto out;
947     		}
948     
949     		/* 
950     		 * Still in SYN_RECV, just remove it silently.
951     		 * There is no good way to pass the error to the newly
952     		 * created socket, and POSIX does not want network
953     		 * errors returned from accept(). 
954     		 */ 
955     		tcp_synq_drop(sk, req, prev);
956     		goto out;
957     
958     	case TCP_SYN_SENT:
959     	case TCP_SYN_RECV:  /* Cannot happen.
960     			       It can f.e. if SYNs crossed.
961     			     */ 
962     		if (sk->lock.users == 0) {
963     			TCP_INC_STATS_BH(TcpAttemptFails);
964     			sk->err = err;
965     
966     			sk->error_report(sk);
967     
968     			tcp_done(sk);
969     		} else {
970     			sk->err_soft = err;
971     		}
972     		goto out;
973     	}
974     
975     	/* If we've already connected we will keep trying
976     	 * until we time out, or the user gives up.
977     	 *
978     	 * rfc1122 4.2.3.9 allows to consider as hard errors
979     	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
980     	 * but it is obsoleted by pmtu discovery).
981     	 *
982     	 * Note, that in modern internet, where routing is unreliable
983     	 * and in each dark corner broken firewalls sit, sending random
984     	 * errors ordered by their masters even this two messages finally lose
985     	 * their original sense (even Linux sends invalid PORT_UNREACHs)
986     	 *
987     	 * Now we are in compliance with RFCs.
988     	 *							--ANK (980905)
989     	 */
990     
991     	if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
992     		sk->err = err;
993     		sk->error_report(sk);
994     	} else	{ /* Only an error on timeout */
995     		sk->err_soft = err;
996     	}
997     
998     out:
999     	bh_unlock_sock(sk);
1000     	sock_put(sk);
1001     }
1002     
1003     /* This routine computes an IPv4 TCP checksum. */
1004     void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 
1005     		       struct sk_buff *skb)
1006     {
1007     	if (skb->ip_summed == CHECKSUM_HW) {
1008     		th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
1009     		skb->csum = offsetof(struct tcphdr, check);
1010     	} else {
1011     		th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1012     					 csum_partial((char *)th, th->doff<<2, skb->csum));
1013     	}
1014     }
1015     
1016     /*
1017      *	This routine will send an RST to the other tcp.
1018      *
1019      *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1020      *		      for reset.
1021      *	Answer: if a packet caused RST, it is not for a socket
1022      *		existing in our system, if it is matched to a socket,
1023      *		it is just duplicate segment or bug in other side's TCP.
1024      *		So that we build reply only basing on parameters
1025      *		arrived with segment.
1026      *	Exception: precedence violation. We do not implement it in any case.
1027      */
1028     
1029     static void tcp_v4_send_reset(struct sk_buff *skb)
1030     {
1031     	struct tcphdr *th = skb->h.th;
1032     	struct tcphdr rth;
1033     	struct ip_reply_arg arg;
1034     
1035     	/* Never send a reset in response to a reset. */
1036     	if (th->rst)
1037     		return;
1038     
1039     	if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1040     		return;
1041     
1042     	/* Swap the send and the receive. */
1043     	memset(&rth, 0, sizeof(struct tcphdr)); 
1044     	rth.dest = th->source;
1045     	rth.source = th->dest; 
1046     	rth.doff = sizeof(struct tcphdr)/4;
1047     	rth.rst = 1;
1048     
1049     	if (th->ack) {
1050     		rth.seq = th->ack_seq;
1051     	} else {
1052     		rth.ack = 1;
1053     		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1054     				    + skb->len - (th->doff<<2));
1055     	}
1056     
1057     	memset(&arg, 0, sizeof arg); 
1058     	arg.iov[0].iov_base = (unsigned char *)&rth; 
1059     	arg.iov[0].iov_len  = sizeof rth;
1060     	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 
1061     				      skb->nh.iph->saddr, /*XXX*/
1062     				      sizeof(struct tcphdr),
1063     				      IPPROTO_TCP,
1064     				      0); 
1065     	arg.n_iov = 1;
1066     	arg.csumoffset = offsetof(struct tcphdr, check) / 2; 
1067     
1068     	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1069     
1070     	TCP_INC_STATS_BH(TcpOutSegs);
1071     	TCP_INC_STATS_BH(TcpOutRsts);
1072     }
1073     
1074     /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1075        outside socket context is ugly, certainly. What can I do?
1076      */
1077     
1078     static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1079     {
1080     	struct tcphdr *th = skb->h.th;
1081     	struct {
1082     		struct tcphdr th;
1083     		u32 tsopt[3];
1084     	} rep;
1085     	struct ip_reply_arg arg;
1086     
1087     	memset(&rep.th, 0, sizeof(struct tcphdr));
1088     	memset(&arg, 0, sizeof arg);
1089     
1090     	arg.iov[0].iov_base = (unsigned char *)&rep; 
1091     	arg.iov[0].iov_len  = sizeof(rep.th);
1092     	arg.n_iov = 1;
1093     	if (ts) {
1094     		rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
1095     						(TCPOPT_NOP << 16) |
1096     						(TCPOPT_TIMESTAMP << 8) |
1097     						TCPOLEN_TIMESTAMP);
1098     		rep.tsopt[1] = htonl(tcp_time_stamp);
1099     		rep.tsopt[2] = htonl(ts);
1100     		arg.iov[0].iov_len = sizeof(rep);
1101     	}
1102     
1103     	/* Swap the send and the receive. */
1104     	rep.th.dest = th->source;
1105     	rep.th.source = th->dest; 
1106     	rep.th.doff = arg.iov[0].iov_len/4;
1107     	rep.th.seq = htonl(seq);
1108     	rep.th.ack_seq = htonl(ack);
1109     	rep.th.ack = 1;
1110     	rep.th.window = htons(win);
1111     
1112     	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 
1113     				      skb->nh.iph->saddr, /*XXX*/
1114     				      arg.iov[0].iov_len,
1115     				      IPPROTO_TCP,
1116     				      0);
1117     	arg.csumoffset = offsetof(struct tcphdr, check) / 2; 
1118     
1119     	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1120     
1121     	TCP_INC_STATS_BH(TcpOutSegs);
1122     }
1123     
1124     static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1125     {
1126     	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1127     
1128     	tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1129     			tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1130     
1131     	tcp_tw_put(tw);
1132     }
1133     
1134     static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1135     {
1136     	tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1137     			req->ts_recent);
1138     }
1139     
1140     static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1141     {
1142     	struct rtable *rt;
1143     	struct ip_options *opt;
1144     
1145     	opt = req->af.v4_req.opt;
1146     	if(ip_route_output(&rt, ((opt && opt->srr) ?
1147     				 opt->faddr :
1148     				 req->af.v4_req.rmt_addr),
1149     			   req->af.v4_req.loc_addr,
1150     			   RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1151     			   sk->bound_dev_if)) {
1152     		IP_INC_STATS_BH(IpOutNoRoutes);
1153     		return NULL;
1154     	}
1155     	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1156     		ip_rt_put(rt);
1157     		IP_INC_STATS_BH(IpOutNoRoutes);
1158     		return NULL;
1159     	}
1160     	return &rt->u.dst;
1161     }
1162     
1163     /*
1164      *	Send a SYN-ACK after having received an ACK. 
1165      *	This still operates on a open_request only, not on a big
1166      *	socket.
1167      */ 
1168     static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1169     			      struct dst_entry *dst)
1170     {
1171     	int err = -1;
1172     	struct sk_buff * skb;
1173     
1174     	/* First, grab a route. */
1175     	if (dst == NULL &&
1176     	    (dst = tcp_v4_route_req(sk, req)) == NULL)
1177     		goto out;
1178     
1179     	skb = tcp_make_synack(sk, dst, req);
1180     
1181     	if (skb) {
1182     		struct tcphdr *th = skb->h.th;
1183     
1184     		th->check = tcp_v4_check(th, skb->len,
1185     					 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1186     					 csum_partial((char *)th, skb->len, skb->csum));
1187     
1188     		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1189     					    req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1190     		if (err == NET_XMIT_CN)
1191     			err = 0;
1192     	}
1193     
1194     out:
1195     	dst_release(dst);
1196     	return err;
1197     }
1198     
1199     /*
1200      *	IPv4 open_request destructor.
1201      */ 
1202     static void tcp_v4_or_free(struct open_request *req)
1203     {
1204     	if (req->af.v4_req.opt)
1205     		kfree(req->af.v4_req.opt);
1206     }
1207     
1208     static inline void syn_flood_warning(struct sk_buff *skb)
1209     {
1210     	static unsigned long warntime;
1211     	
1212     	if (jiffies - warntime > HZ*60) {
1213     		warntime = jiffies;
1214     		printk(KERN_INFO 
1215     		       "possible SYN flooding on port %d. Sending cookies.\n",  
1216     		       ntohs(skb->h.th->dest));
1217     	}
1218     }
1219     
1220     /* 
1221      * Save and compile IPv4 options into the open_request if needed. 
1222      */
1223     static inline struct ip_options * 
1224     tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1225     {
1226     	struct ip_options *opt = &(IPCB(skb)->opt);
1227     	struct ip_options *dopt = NULL; 
1228     
1229     	if (opt && opt->optlen) {
1230     		int opt_size = optlength(opt); 
1231     		dopt = kmalloc(opt_size, GFP_ATOMIC);
1232     		if (dopt) {
1233     			if (ip_options_echo(dopt, skb)) {
1234     				kfree(dopt);
1235     				dopt = NULL;
1236     			}
1237     		}
1238     	}
1239     	return dopt;
1240     }
1241     
/* 
 * Maximum number of SYN_RECV sockets in the queue per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * It would be better to replace this with a global counter for all
 * sockets, but then some measure against one socket starving all the
 * others would be needed.
 *
 * It was 128 by default.  Experiments with real servers show that this
 * is absolutely not enough even at 100 conn/sec; 256 cures most of the
 * problems.  This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256; 
1256     
/*
 * open_request operations for IPv4, initialised positionally.
 * tcp_v4_conn_request() attaches this table to each request via req->class.
 * NOTE(review): slot meanings below are read off the initialiser values;
 * confirm against the struct or_calltable declaration.
 */
struct or_calltable or_ipv4 = {
	PF_INET,		/* protocol family */
	tcp_v4_send_synack,	/* send the SYN-ACK for a request */
	tcp_v4_or_send_ack,	/* send an ACK on behalf of a request */
	tcp_v4_or_free,		/* destructor: frees saved IP options */
	tcp_v4_send_reset	/* send an RST in reply to a segment */
};
1264     
1265     int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1266     {
1267     	struct tcp_opt tp;
1268     	struct open_request *req;
1269     	__u32 saddr = skb->nh.iph->saddr;
1270     	__u32 daddr = skb->nh.iph->daddr;
1271     	__u32 isn = TCP_SKB_CB(skb)->when;
1272     	struct dst_entry *dst = NULL;
1273     #ifdef CONFIG_SYN_COOKIES
1274     	int want_cookie = 0;
1275     #else
1276     #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1277     #endif
1278     
1279     	/* Never answer to SYNs send to broadcast or multicast */
1280     	if (((struct rtable *)skb->dst)->rt_flags & 
1281     	    (RTCF_BROADCAST|RTCF_MULTICAST))
1282     		goto drop; 
1283     
1284     	/* TW buckets are converted to open requests without
1285     	 * limitations, they conserve resources and peer is
1286     	 * evidently real one.
1287     	 */
1288     	if (tcp_synq_is_full(sk) && !isn) {
1289     #ifdef CONFIG_SYN_COOKIES
1290     		if (sysctl_tcp_syncookies) {
1291     			want_cookie = 1; 
1292     		} else
1293     #endif
1294     		goto drop;
1295     	}
1296     
1297     	/* Accept backlog is full. If we have already queued enough
1298     	 * of warm entries in syn queue, drop request. It is better than
1299     	 * clogging syn queue with openreqs with exponentially increasing
1300     	 * timeout.
1301     	 */
1302     	if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1303     		goto drop;
1304     
1305     	req = tcp_openreq_alloc();
1306     	if (req == NULL)
1307     		goto drop;
1308     
1309     	tcp_clear_options(&tp);
1310     	tp.mss_clamp = 536;
1311     	tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1312     
1313     	tcp_parse_options(skb, &tp, 0);
1314     
1315     	if (want_cookie) {
1316     		tcp_clear_options(&tp);
1317     		tp.saw_tstamp = 0;
1318     	}
1319     
1320     	if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1321     		/* Some OSes (unknown ones, but I see them on web server, which
1322     		 * contains information interesting only for windows'
1323     		 * users) do not send their stamp in SYN. It is easy case.
1324     		 * We simply do not advertise TS support.
1325     		 */
1326     		tp.saw_tstamp = 0;
1327     		tp.tstamp_ok = 0;
1328     	}
1329     	tp.tstamp_ok = tp.saw_tstamp;
1330     
1331     	tcp_openreq_init(req, &tp, skb);
1332     
1333     	req->af.v4_req.loc_addr = daddr;
1334     	req->af.v4_req.rmt_addr = saddr;
1335     	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1336     	req->class = &or_ipv4;
1337     	if (!want_cookie)
1338     		TCP_ECN_create_request(req, skb->h.th);
1339     
1340     	if (want_cookie) {
1341     #ifdef CONFIG_SYN_COOKIES
1342     		syn_flood_warning(skb);
1343     #endif
1344     		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1345     	} else if (isn == 0) {
1346     		struct inet_peer *peer = NULL;
1347     
1348     		/* VJ's idea. We save last timestamp seen
1349     		 * from the destination in peer table, when entering
1350     		 * state TIME-WAIT, and check against it before
1351     		 * accepting new connection request.
1352     		 *
1353     		 * If "isn" is not zero, this request hit alive
1354     		 * timewait bucket, so that all the necessary checks
1355     		 * are made in the function processing timewait state.
1356     		 */
1357     		if (tp.saw_tstamp &&
1358     		    sysctl_tcp_tw_recycle &&
1359     		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
1360     		    (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1361     		    peer->v4daddr == saddr) {
1362     			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1363     			    (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1364     				NET_INC_STATS_BH(PAWSPassiveRejected);
1365     				dst_release(dst);
1366     				goto drop_and_free;
1367     			}
1368     		}
1369     		/* Kill the following clause, if you dislike this way. */
1370     		else if (!sysctl_tcp_syncookies &&
1371     			 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1372     			  < (sysctl_max_syn_backlog>>2)) &&
1373     			 (!peer || !peer->tcp_ts_stamp) &&
1374     			 (!dst || !dst->rtt)) {
1375     			/* Without syncookies last quarter of
1376     			 * backlog is filled with destinations, proven to be alive.
1377     			 * It means that we continue to communicate
1378     			 * to destinations, already remembered
1379     			 * to the moment of synflood.
1380     			 */
1381     			NETDEBUG(if (net_ratelimit()) 
1382     				printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", 
1383     					NIPQUAD(saddr), ntohs(skb->h.th->source)));
1384     			TCP_INC_STATS_BH(TcpAttemptFails);
1385     			dst_release(dst);
1386     			goto drop_and_free;
1387     		}
1388     
1389     		isn = tcp_v4_init_sequence(sk, skb);
1390     	}
1391     	req->snt_isn = isn;
1392     
1393     	if (tcp_v4_send_synack(sk, req, dst))
1394     		goto drop_and_free;
1395     
1396     	if (want_cookie) {
1397     	   	tcp_openreq_free(req); 
1398     	} else {
1399     		tcp_v4_synq_add(sk, req);
1400     	}
1401     	return 0;
1402     
1403     drop_and_free:
1404     	tcp_openreq_free(req); 
1405     drop:
1406     	TCP_INC_STATS_BH(TcpAttemptFails);
1407     	return 0;
1408     }
1409     
1410     
1411     /* 
1412      * The three way handshake has completed - we got a valid synack - 
1413      * now create the new socket. 
1414      */
1415     struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1416     				   struct open_request *req,
1417     				   struct dst_entry *dst)
1418     {
1419     	struct tcp_opt *newtp;
1420     	struct sock *newsk;
1421     
1422     	if (tcp_acceptq_is_full(sk))
1423     		goto exit_overflow;
1424     
1425     	if (dst == NULL &&
1426     	    (dst = tcp_v4_route_req(sk, req)) == NULL)
1427     		goto exit;
1428     
1429     	newsk = tcp_create_openreq_child(sk, req, skb);
1430     	if (!newsk)
1431     		goto exit;
1432     
1433     	newsk->dst_cache = dst;
1434     	newsk->route_caps = dst->dev->features;
1435     
1436     	newtp = &(newsk->tp_pinfo.af_tcp);
1437     	newsk->daddr = req->af.v4_req.rmt_addr;
1438     	newsk->saddr = req->af.v4_req.loc_addr;
1439     	newsk->rcv_saddr = req->af.v4_req.loc_addr;
1440     	newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1441     	req->af.v4_req.opt = NULL;
1442     	newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1443     	newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1444     	newtp->ext_header_len = 0;
1445     	if (newsk->protinfo.af_inet.opt)
1446     		newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1447     	newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1448     
1449     	tcp_sync_mss(newsk, dst->pmtu);
1450     	newtp->advmss = dst->advmss;
1451     	tcp_initialize_rcv_mss(newsk);
1452     
1453     	__tcp_v4_hash(newsk);
1454     	__tcp_inherit_port(sk, newsk);
1455     
1456     	return newsk;
1457     
1458     exit_overflow:
1459     	NET_INC_STATS_BH(ListenOverflows);
1460     exit:
1461     	NET_INC_STATS_BH(ListenDrops);
1462     	dst_release(dst);
1463     	return NULL;
1464     }
1465     
1466     static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1467     {
1468     	struct open_request *req, **prev;
1469     	struct tcphdr *th = skb->h.th;
1470     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1471     	struct sock *nsk;
1472     
1473     	/* Find possible connection requests. */
1474     	req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
1475     	if (req)
1476     		return tcp_check_req(sk, skb, req, prev);
1477     
1478     	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1479     					  th->source,
1480     					  skb->nh.iph->daddr,
1481     					  ntohs(th->dest),
1482     					  tcp_v4_iif(skb));
1483     
1484     	if (nsk) {
1485     		if (nsk->state != TCP_TIME_WAIT) {
1486     			bh_lock_sock(nsk);
1487     			return nsk;
1488     		}
1489     		tcp_tw_put((struct tcp_tw_bucket*)sk);
1490     		return NULL;
1491     	}
1492     
1493     #ifdef CONFIG_SYN_COOKIES
1494     	if (!th->rst && !th->syn && th->ack)
1495     		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1496     #endif
1497     	return sk;
1498     }
1499     
1500     static int tcp_v4_checksum_init(struct sk_buff *skb)
1501     {
1502     	if (skb->ip_summed == CHECKSUM_HW) {
1503     		skb->ip_summed = CHECKSUM_UNNECESSARY;
1504     		if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1505     				  skb->nh.iph->daddr,skb->csum))
1506     			return 0;
1507     
1508     		NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1509     		skb->ip_summed = CHECKSUM_NONE;
1510     	}
1511     	if (skb->len <= 76) {
1512     		if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1513     				 skb->nh.iph->daddr,
1514     				 skb_checksum(skb, 0, skb->len, 0)))
1515     			return -1;
1516     		skb->ip_summed = CHECKSUM_UNNECESSARY;
1517     	} else {
1518     		skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1519     					  skb->nh.iph->daddr,0);
1520     	}
1521     	return 0;
1522     }
1523     
1524     
/* Process one segment for socket sk.
 *
 * The socket must have its spinlock held when we get here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Dispatch: ESTABLISHED sockets go to tcp_rcv_established(); for
 * all other states the length and checksum are verified first, a
 * LISTEN socket is resolved through tcp_v4_hnd_req() (possibly
 * to a different socket), and the rest is handled by
 * tcp_rcv_state_process().
 *
 * Returns 0 on every path.  On the reset/discard/csum_err paths
 * the skb is freed here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_FILTER
	/* A socket filter may reject the segment outright. */
	struct sk_filter *filter = sk->filter;
	if (filter && sk_filter(skb, filter))
		goto discard;
#endif /* CONFIG_FILTER */

  	IP_INC_STATS_BH(IpInDelivers);

	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0; 
	}

	/* Slow path: header length sanity and checksum completion. */
	if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->state == TCP_LISTEN) { 
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		/* A socket other than the listener was found: let it
		 * process the segment as a child of sk.
		 */
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
	/* falls through to discard */
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx) 
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TcpInErrs);
	goto discard;
}
1587     
/*
 *	From tcp_input.c
 *
 *	Entry point for a received TCP/IPv4 segment.
 *
 *	Validates the header, fills in the TCP control block of the skb,
 *	looks up the owning socket and then either processes the segment
 *	directly (via the prequeue or tcp_v4_do_rcv()), appends it to the
 *	socket backlog when the socket is locked by a user, or handles the
 *	TIME-WAIT and no-socket cases.
 *
 *	Returns the result of tcp_v4_do_rcv() when that was called, 0
 *	otherwise.
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	/* Only packets addressed to this host are processed. */
	if (skb->pkt_type!=PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TcpInSegs);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	/* doff below the minimal header size is a malformed packet. */
	if (th->doff < sizeof(struct tcphdr)/4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff*4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	/* Header pointer is re-read here (presumably because
	 * pskb_may_pull() may have moved the data - confirm).
	 */
	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff*4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when = 0;
	TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

	/* Also re-entered from do_time_wait with sk replaced by a listener. */
process:
	if(!ipsec_sk_policy(sk,skb))
		goto discard_and_relse;

	if (sk->state == TCP_TIME_WAIT)
		goto do_time_wait;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	/* Socket not locked by a user: try the prequeue, else process now.
	 * Socket locked: append the segment to the backlog.
	 */
	if (!sk->lock.users) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	/* Malformed or corrupted segments are only counted; anything else
	 * without a socket is answered with a reset.  Note that bad_packet
	 * is a label inside the if-body, so earlier "goto bad_packet" jumps
	 * land directly on the error counter.
	 */
	if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TcpInErrs);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
  	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	/* sk is really a tcp_tw_bucket from here on. */
	if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TcpInErrs);
		goto discard_and_relse;
	}
	switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					  skb, th, skb->len)) {
	case TCP_TW_SYN:
	{
		struct sock *sk2;

		/* A listener exists for this SYN: retire the TIME-WAIT
		 * bucket and restart processing on the listener.
		 */
		sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
		if (sk2 != NULL) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_timewait_kill((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1706     
/* Re-insert sk into the protocol hash tables by unhashing it and hashing
 * it again through the socket's protocol operations.
 *
 * With per-bucket locks this operation is not atomic, so that
 * this version is not worse.
 */
static void __tcp_v4_rehash(struct sock *sk)
{
	sk->prot->unhash(sk);
	sk->prot->hash(sk);
}
1715     
1716     static int tcp_v4_reselect_saddr(struct sock *sk)
1717     {
1718     	int err;
1719     	struct rtable *rt;
1720     	__u32 old_saddr = sk->saddr;
1721     	__u32 new_saddr;
1722     	__u32 daddr = sk->daddr;
1723     
1724     	if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1725     		daddr = sk->protinfo.af_inet.opt->faddr;
1726     
1727     	/* Query new route. */
1728     	err = ip_route_connect(&rt, daddr, 0,
1729     			       RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1730     			       sk->bound_dev_if);
1731     	if (err)
1732     		return err;
1733     
1734     	__sk_dst_set(sk, &rt->u.dst);
1735     	sk->route_caps = rt->u.dst.dev->features;
1736     
1737     	new_saddr = rt->rt_src;
1738     
1739     	if (new_saddr == old_saddr)
1740     		return 0;
1741     
1742     	if (sysctl_ip_dynaddr > 1) {
1743     		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1744     		       "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1745     		       NIPQUAD(old_saddr), 
1746     		       NIPQUAD(new_saddr));
1747     	}
1748     
1749     	sk->saddr = new_saddr;
1750     	sk->rcv_saddr = new_saddr;
1751     
1752     	/* XXX The only one ugly spot where we need to
1753     	 * XXX really change the sockets identity after
1754     	 * XXX it has entered the hashes. -DaveM
1755     	 *
1756     	 * Besides that, it does not check for connection
1757     	 * uniqueness. Wait for troubles.
1758     	 */
1759     	__tcp_v4_rehash(sk);
1760     	return 0;
1761     }
1762     
1763     int tcp_v4_rebuild_header(struct sock *sk)
1764     {
1765     	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1766     	u32 daddr;
1767     	int err;
1768     
1769     	/* Route is OK, nothing to do. */
1770     	if (rt != NULL)
1771     		return 0;
1772     
1773     	/* Reroute. */
1774     	daddr = sk->daddr;
1775     	if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1776     		daddr = sk->protinfo.af_inet.opt->faddr;
1777     
1778     	err = ip_route_output(&rt, daddr, sk->saddr,
1779     			      RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
1780     			      sk->bound_dev_if);
1781     	if (!err) {
1782     		__sk_dst_set(sk, &rt->u.dst);
1783     		sk->route_caps = rt->u.dst.dev->features;
1784     		return 0;
1785     	}
1786     
1787     	/* Routing failed... */
1788     	sk->route_caps = 0;
1789     
1790     	if (!sysctl_ip_dynaddr ||
1791     	    sk->state != TCP_SYN_SENT ||
1792     	    (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1793     	    (err = tcp_v4_reselect_saddr(sk)) != 0)
1794     		sk->err_soft=-err;
1795     
1796     	return err;
1797     }
1798     
1799     static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1800     {
1801     	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1802     
1803     	sin->sin_family		= AF_INET;
1804     	sin->sin_addr.s_addr	= sk->daddr;
1805     	sin->sin_port		= sk->dport;
1806     }
1807     
1808     /* VJ's idea. Save last timestamp seen from this destination
1809      * and hold it at least for normal timewait interval to use for duplicate
1810      * segment detection in subsequent connections, before they enter synchronized
1811      * state.
1812      */
1813     
1814     int tcp_v4_remember_stamp(struct sock *sk)
1815     {
1816     	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1817     	struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1818     	struct inet_peer *peer = NULL;
1819     	int release_it = 0;
1820     
1821     	if (rt == NULL || rt->rt_dst != sk->daddr) {
1822     		peer = inet_getpeer(sk->daddr, 1);
1823     		release_it = 1;
1824     	} else {
1825     		if (rt->peer == NULL)
1826     			rt_bind_peer(rt, 1);
1827     		peer = rt->peer;
1828     	}
1829     
1830     	if (peer) {
1831     		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1832     		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1833     		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1834     			peer->tcp_ts_stamp = tp->ts_recent_stamp;
1835     			peer->tcp_ts = tp->ts_recent;
1836     		}
1837     		if (release_it)
1838     			inet_putpeer(peer);
1839     		return 1;
1840     	}
1841     
1842     	return 0;
1843     }
1844     
1845     int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1846     {
1847     	struct inet_peer *peer = NULL;
1848     
1849     	peer = inet_getpeer(tw->daddr, 1);
1850     
1851     	if (peer) {
1852     		if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1853     		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1854     		     peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1855     			peer->tcp_ts_stamp = tw->ts_recent_stamp;
1856     			peer->tcp_ts = tw->ts_recent;
1857     		}
1858     		inet_putpeer(peer);
1859     		return 1;
1860     	}
1861     
1862     	return 0;
1863     }
1864     
/*
 * IPv4 set of address-family specific TCP operations, initialised
 * positionally.  tcp_v4_init_sock() installs it as the socket's
 * af_specific table.
 * NOTE(review): the per-slot notes below are read off the initialiser
 * values; confirm against the struct tcp_func declaration.
 */
struct tcp_func ipv4_specific = {
	ip_queue_xmit,			/* transmit a queued segment */
	tcp_v4_send_check,		/* fill in the TCP checksum */
	tcp_v4_rebuild_header,		/* revalidate / redo the route */
	tcp_v4_conn_request,		/* handle an incoming SYN */
	tcp_v4_syn_recv_sock,		/* create the child socket */
	tcp_v4_hash_connecting,
	tcp_v4_remember_stamp,		/* save peer timestamp */
	sizeof(struct iphdr),		/* network header size */

	ip_setsockopt,
	ip_getsockopt,
	v4_addr2sockaddr,		/* socket address -> sockaddr */
	sizeof(struct sockaddr_in)	/* size of that sockaddr */
};
1880     
1881     /* NOTE: A lot of things set to zero explicitly by call to
1882      *       sk_alloc() so need not be done here.
1883      */
1884     static int tcp_v4_init_sock(struct sock *sk)
1885     {
1886     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1887     
1888     	skb_queue_head_init(&tp->out_of_order_queue);
1889     	tcp_init_xmit_timers(sk);
1890     	tcp_prequeue_init(tp);
1891     
1892     	tp->rto  = TCP_TIMEOUT_INIT;
1893     	tp->mdev = TCP_TIMEOUT_INIT;
1894           
1895     	/* So many TCP implementations out there (incorrectly) count the
1896     	 * initial SYN frame in their delayed-ACK and congestion control
1897     	 * algorithms that we must have the following bandaid to talk
1898     	 * efficiently to them.  -DaveM
1899     	 */
1900     	tp->snd_cwnd = 2;
1901     
1902     	/* See draft-stevens-tcpca-spec-01 for discussion of the
1903     	 * initialization of these values.
1904     	 */
1905     	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1906     	tp->snd_cwnd_clamp = ~0;
1907     	tp->mss_cache = 536;
1908     
1909     	tp->reordering = sysctl_tcp_reordering;
1910     
1911     	sk->state = TCP_CLOSE;
1912     
1913     	sk->write_space = tcp_write_space;
1914     	sk->use_write_queue = 1;
1915     
1916     	sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
1917     
1918     	sk->sndbuf = sysctl_tcp_wmem[1];
1919     	sk->rcvbuf = sysctl_tcp_rmem[1];
1920     
1921     	atomic_inc(&tcp_sockets_allocated);
1922     
1923     	return 0;
1924     }
1925     
1926     static int tcp_v4_destroy_sock(struct sock *sk)
1927     {
1928     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1929     
1930     	tcp_clear_xmit_timers(sk);
1931     
1932     	/* Cleanup up the write buffer. */
1933       	tcp_writequeue_purge(sk);
1934     
1935     	/* Cleans up our, hopefully empty, out_of_order_queue. */
1936       	__skb_queue_purge(&tp->out_of_order_queue);
1937     
1938     	/* Clean prequeue, it must be empty really */
1939     	__skb_queue_purge(&tp->ucopy.prequeue);
1940     
1941     	/* Clean up a referenced TCP bind bucket. */
1942     	if(sk->prev != NULL)
1943     		tcp_put_port(sk);
1944     
1945     	/* If sendmsg cached page exists, toss it. */
1946     	if (tp->sndmsg_page != NULL)
1947     		__free_page(tp->sndmsg_page);
1948     
1949     	atomic_dec(&tcp_sockets_allocated);
1950     
1951     	return 0;
1952     }
1953     
1954     /* Proc filesystem TCP sock list dumping. */
1955     static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
1956     {
1957     	int ttd = req->expires - jiffies;
1958     
1959     	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1960     		" %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
1961     		i,
1962     		req->af.v4_req.loc_addr,
1963     		ntohs(sk->sport),
1964     		req->af.v4_req.rmt_addr,
1965     		ntohs(req->rmt_port),
1966     		TCP_SYN_RECV,
1967     		0,0, /* could print option size, but that is af dependent. */
1968     		1,   /* timers active (only the expire timer) */  
1969     		ttd, 
1970     		req->retrans,
1971     		uid,
1972     		0,  /* non standard timer */  
1973     		0, /* open_requests have no inode */
1974     		atomic_read(&sk->refcnt),
1975     		req
1976     		); 
1977     }
1978     
1979     static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
1980     {
1981     	unsigned int dest, src;
1982     	__u16 destp, srcp;
1983     	int timer_active;
1984     	unsigned long timer_expires;
1985     	struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
1986     
1987     	dest  = sp->daddr;
1988     	src   = sp->rcv_saddr;
1989     	destp = ntohs(sp->dport);
1990     	srcp  = ntohs(sp->sport);
1991     	if (tp->pending == TCP_TIME_RETRANS) {
1992     		timer_active	= 1;
1993     		timer_expires	= tp->timeout;
1994     	} else if (tp->pending == TCP_TIME_PROBE0) {
1995     		timer_active	= 4;
1996     		timer_expires	= tp->timeout;
1997     	} else if (timer_pending(&sp->timer)) {
1998     		timer_active	= 2;
1999     		timer_expires	= sp->timer.expires;
2000     	} else {
2001     		timer_active	= 0;
2002     		timer_expires = jiffies;
2003     	}
2004     
2005     	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2006     		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2007     		i, src, srcp, dest, destp, sp->state, 
2008     		tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2009     		timer_active, timer_expires-jiffies,
2010     		tp->retransmits,
2011     		sock_i_uid(sp),
2012     		tp->probes_out,
2013     		sock_i_ino(sp),
2014     		atomic_read(&sp->refcnt), sp,
2015     		tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2016     		tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2017     		);
2018     }
2019     
2020     static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2021     {
2022     	unsigned int dest, src;
2023     	__u16 destp, srcp;
2024     	int ttd = tw->ttd - jiffies;
2025     
2026     	if (ttd < 0)
2027     		ttd = 0;
2028     
2029     	dest  = tw->daddr;
2030     	src   = tw->rcv_saddr;
2031     	destp = ntohs(tw->dport);
2032     	srcp  = ntohs(tw->sport);
2033     
2034     	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2035     		" %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2036     		i, src, srcp, dest, destp, tw->substate, 0, 0,
2037     		3, ttd, 0, 0, 0, 0,
2038     		atomic_read(&tw->refcnt), tw);
2039     }
2040     
/* Size of one record of the dump.  The header and every socket line are
 * written with "%-*s\n" and a field width of TMPSZ-1, i.e. padded to
 * TMPSZ-1 characters plus a newline, which lets tcp_get_info() compute
 * file positions by counting records.
 */
2041     #define TMPSZ 150
2042     
/* Produce the text dump of TCP sockets, one line per entry: first the
 * listening sockets together with their pending open requests, then, for
 * each established-hash bucket, its sockets followed by the TIME-WAIT
 * buckets of the matching chain in the upper half of tcp_ehash.  Entries
 * are filtered with TCP_INET_FAMILY().
 *
 *   buffer:  where the lines are written
 *   start:   set to the address inside buffer that corresponds to offset
 *   offset:  position in the generated text the caller wants to read from
 *   length:  number of bytes the caller asked for
 *
 * Returns the number of bytes available at *start, clamped to 0..length.
 *
 * NOTE(review): the parameter list looks like the procfs get_info read
 * convention -- confirm against the code that registers this function.
 */
2043     int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2044     {
         	/* len: bytes written to buffer so far; num: running slot number
         	 * passed to the line formatters; pos: position in the generated
         	 * text just past the last record accounted for.
         	 */
2045     	int len = 0, num = 0, i;
2046     	off_t begin, pos = 0;
2047     	char tmpbuf[TMPSZ+1];
2048     
         	/* The header record is only formatted when the read starts inside it. */
2049     	if (offset < TMPSZ)
2050     		len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2051     			       "  sl  local_address rem_address   st tx_queue "
2052     			       "rx_queue tr tm->when retrnsmt   uid  timeout inode");
2053     
2054     	pos = TMPSZ;
2055     
2056     	/* First, walk listening socket table. */
2057     	tcp_listen_lock();
2058     	for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2059     		struct sock *sk = tcp_listening_hash[i];
2060     		struct tcp_listen_opt *lopt;
2061     		int k;
2062     
2063     		for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2064     			struct open_request *req;
2065     			int uid;
2066     			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2067     
         			/* A listener of another family gets no line of its
         			 * own, but its pending requests are still walked.
         			 */
2068     			if (!TCP_INET_FAMILY(sk->family))
2069     				goto skip_listen;
2070     
         			/* NOTE(review): this test formats the record when
         			 * pos == offset, whereas the request, established and
         			 * TIME-WAIT walks below skip a record when
         			 * pos <= offset.
         			 */
2071     			pos += TMPSZ;
2072     			if (pos >= offset) {
2073     				get_tcp_sock(sk, tmpbuf, num);
2074     				len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
         				/* Enough generated: release the lock and finish. */
2075     				if (pos >= offset + length) {
2076     					tcp_listen_unlock();
2077     					goto out_no_bh;
2078     				}
2079     			}
2080     
2081     skip_listen:
         			/* Pending open requests of this listener, read under
         			 * its syn_wait_lock.
         			 */
2082     			uid = sock_i_uid(sk);
2083     			read_lock_bh(&tp->syn_wait_lock);
2084     			lopt = tp->listen_opt;
2085     			if (lopt && lopt->qlen != 0) {
2086     				for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2087     					for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2088     						if (!TCP_INET_FAMILY(req->class->family))
2089     							continue;
2090     
2091     						pos += TMPSZ;
2092     						if (pos <= offset)
2093     							continue;
2094     						get_openreq(sk, req, tmpbuf, num, uid);
2095     						len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
         						/* Both locks are held here; drop them
         						 * before leaving.
         						 */
2096     						if (pos >= offset + length) {
2097     							read_unlock_bh(&tp->syn_wait_lock);
2098     							tcp_listen_unlock();
2099     							goto out_no_bh;
2100     						}
2101     					}
2102     				}
2103     			}
2104     			read_unlock_bh(&tp->syn_wait_lock);
2105     
2106     			/* Completed requests are in normal socket hash table */
2107     		}
2108     	}
2109     	tcp_listen_unlock();
2110     
         	/* Bottom halves stay disabled for the whole hash walk; the
         	 * matching local_bh_enable() is at the "out" label.
         	 */
2111     	local_bh_disable();
2112     
2113     	/* Next, walk established hash chain. */
2114     	for (i = 0; i < tcp_ehash_size; i++) {
2115     		struct tcp_ehash_bucket *head = &tcp_ehash[i];
2116     		struct sock *sk;
2117     		struct tcp_tw_bucket *tw;
2118     
         		/* head->lock is held across both chains of this index. */
2119     		read_lock(&head->lock);
2120     		for(sk = head->chain; sk; sk = sk->next, num++) {
2121     			if (!TCP_INET_FAMILY(sk->family))
2122     				continue;
2123     			pos += TMPSZ;
2124     			if (pos <= offset)
2125     				continue;
2126     			get_tcp_sock(sk, tmpbuf, num);
2127     			len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2128     			if (pos >= offset + length) {
2129     				read_unlock(&head->lock);
2130     				goto out;
2131     			}
2132     		}
         		/* TIME-WAIT buckets: the chain at index i + tcp_ehash_size. */
2133     		for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2134     		     tw != NULL;
2135     		     tw = (struct tcp_tw_bucket *)tw->next, num++) {
2136     			if (!TCP_INET_FAMILY(tw->family))
2137     				continue;
2138     			pos += TMPSZ;
2139     			if (pos <= offset)
2140     				continue;
2141     			get_timewait_sock(tw, tmpbuf, num);
2142     			len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2143     			if (pos >= offset + length) {
2144     				read_unlock(&head->lock);
2145     				goto out;
2146     			}
2147     		}
2148     		read_unlock(&head->lock);
2149     	}
2150     
2151     out:
2152     	local_bh_enable();
2153     out_no_bh:
2154     
         	/* The data in buffer ends at text position pos, so the byte for
         	 * position offset sits (pos - offset) bytes before the end of
         	 * what was written.  Point *start there and report what follows,
         	 * limited to the requested length and never negative.
         	 */
2155     	begin = len - (pos - offset);
2156     	*start = buffer + begin;
2157     	len -= begin;
2158     	if (len > length)
2159     		len = length;
2160     	if (len < 0)
2161     		len = 0; 
2162     	return len;
2163     }
2164     
2165     struct proto tcp_prot = {
2166     	name:		"TCP",
2167     	close:		tcp_close,
2168     	connect:	tcp_v4_connect,
2169     	disconnect:	tcp_disconnect,
2170     	accept:		tcp_accept,
2171     	ioctl:		tcp_ioctl,
2172     	init:		tcp_v4_init_sock,
2173     	destroy:	tcp_v4_destroy_sock,
2174     	shutdown:	tcp_shutdown,
2175     	setsockopt:	tcp_setsockopt,
2176     	getsockopt:	tcp_getsockopt,
2177     	sendmsg:	tcp_sendmsg,
2178     	recvmsg:	tcp_recvmsg,
2179     	backlog_rcv:	tcp_v4_do_rcv,
2180     	hash:		tcp_v4_hash,
2181     	unhash:		tcp_unhash,
2182     	get_port:	tcp_v4_get_port,
2183     };
2184     
2185     
2186     
2187     void __init tcp_v4_init(struct net_proto_family *ops)
2188     {
2189     	int err;
2190     
2191     	tcp_inode.i_mode = S_IFSOCK;
2192     	tcp_inode.i_sock = 1;
2193     	tcp_inode.i_uid = 0;
2194     	tcp_inode.i_gid = 0;
2195     	init_waitqueue_head(&tcp_inode.i_wait);
2196     	init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2197     
2198     	tcp_socket->inode = &tcp_inode;
2199     	tcp_socket->state = SS_UNCONNECTED;
2200     	tcp_socket->type=SOCK_RAW;
2201     
2202     	if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2203     		panic("Failed to create the TCP control socket.\n");
2204     	tcp_socket->sk->allocation=GFP_ATOMIC;
2205     	tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2206     
2207     	/* Unhash it so that IP input processing does not even
2208     	 * see it, we do not wish this socket to see incoming
2209     	 * packets.
2210     	 */
2211     	tcp_socket->sk->prot->unhash(tcp_socket->sk);
2212     }
2213