File: /usr/src/linux/net/ipv4/tcp_minisocks.c

1     /*
2      * INET		An implementation of the TCP/IP protocol suite for the LINUX
3      *		operating system.  INET is implemented using the  BSD Socket
4      *		interface as the means of communication with the user level.
5      *
6      *		Implementation of the Transmission Control Protocol(TCP).
7      *
8      * Version:	$Id: tcp_minisocks.c,v 1.13 2001/09/18 22:29:10 davem Exp $
9      *
10      * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
11      *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12      *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13      *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14      *		Florian La Roche, <flla@stud.uni-sb.de>
15      *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16      *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17      *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18      *		Matthew Dillon, <dillon@apollo.west.oic.com>
19      *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20      *		Jorge Cwik, <jorge@laser.satlink.net>
21      */
22     
23     #include <linux/config.h>
24     #include <linux/mm.h>
25     #include <linux/sysctl.h>
26     #include <net/tcp.h>
27     #include <net/inet_common.h>
28     
/* Initial value for sysctl_tcp_syncookies: 0 when CONFIG_SYSCTL is set,
 * 1 otherwise.
 */
#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

/* Non-zero enables the shortened TIME-WAIT timeout; it is consulted in
 * tcp_time_wait() and tcp_timewait_state_process() together with the
 * socket's ts_recent_stamp.
 */
int sysctl_tcp_tw_recycle = 0;
/* tcp_time_wait() allocates a new bucket only while tcp_tw_count is
 * below this limit.
 */
int sysctl_tcp_max_tw_buckets = NR_FILE*2;

/* Starts out as SYNC_INIT; not referenced by the code visible in this file. */
int sysctl_tcp_syncookies = SYNC_INIT; 
/* Consulted by tcp_check_req() when creating the child socket fails:
 * zero -> mark the request acked and keep it, non-zero -> reset and drop it.
 */
int sysctl_tcp_abort_on_overflow = 0;
40     
41     static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
42     {
43     	if (seq == s_win)
44     		return 1;
45     	if (after(end_seq, s_win) && before(seq, e_win))
46     		return 1;
47     	return (seq == e_win && seq == end_seq);
48     }
49     
/* New-style handling of TIME_WAIT sockets. */

/* Number of timewait buckets currently queued for death.  It is updated
 * under tw_death_lock by tcp_tw_schedule(), tcp_tw_deschedule(),
 * tcp_twkill() and tcp_twcal_tick(); tcp_time_wait() reads it without
 * the lock to enforce sysctl_tcp_max_tw_buckets.
 */
int tcp_tw_count = 0;
53     
54     
55     /* Must be called with locally disabled BHs. */
56     void tcp_timewait_kill(struct tcp_tw_bucket *tw)
57     {
58     	struct tcp_ehash_bucket *ehead;
59     	struct tcp_bind_hashbucket *bhead;
60     	struct tcp_bind_bucket *tb;
61     
62     	/* Unlink from established hashes. */
63     	ehead = &tcp_ehash[tw->hashent];
64     	write_lock(&ehead->lock);
65     	if (!tw->pprev) {
66     		write_unlock(&ehead->lock);
67     		return;
68     	}
69     	if(tw->next)
70     		tw->next->pprev = tw->pprev;
71     	*(tw->pprev) = tw->next;
72     	tw->pprev = NULL;
73     	write_unlock(&ehead->lock);
74     
75     	/* Disassociate with bind bucket. */
76     	bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
77     	spin_lock(&bhead->lock);
78     	if ((tb = tw->tb) != NULL) {
79     		if(tw->bind_next)
80     			tw->bind_next->bind_pprev = tw->bind_pprev;
81     		*(tw->bind_pprev) = tw->bind_next;
82     		tw->tb = NULL;
83     		if (tb->owners == NULL) {
84     			if (tb->next)
85     				tb->next->pprev = tb->pprev;
86     			*(tb->pprev) = tb->next;
87     			kmem_cache_free(tcp_bucket_cachep, tb);
88     		}
89     	}
90     	spin_unlock(&bhead->lock);
91     
92     #ifdef INET_REFCNT_DEBUG
93     	if (atomic_read(&tw->refcnt) != 1) {
94     		printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
95     	}
96     #endif
97     	tcp_tw_put(tw);
98     }
99     
100     /* 
101      * * Main purpose of TIME-WAIT state is to close connection gracefully,
102      *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
103      *   (and, probably, tail of data) and one or more our ACKs are lost.
104      * * What is TIME-WAIT timeout? It is associated with maximal packet
105      *   lifetime in the internet, which results in wrong conclusion, that
106      *   it is set to catch "old duplicate segments" wandering out of their path.
107      *   It is not quite correct. This timeout is calculated so that it exceeds
108      *   maximal retransmission timeout enough to allow to lose one (or more)
109      *   segments sent by peer and our ACKs. This time may be calculated from RTO.
110      * * When TIME-WAIT socket receives RST, it means that another end
111      *   finally closed and we are allowed to kill TIME-WAIT too.
112      * * Second purpose of TIME-WAIT is catching old duplicate segments.
113      *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
114      *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
115      * * If we invented some more clever way to catch duplicates
116      *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
117      *
118      * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
119      * When you compare it to RFCs, please, read section SEGMENT ARRIVES
120      * from the very beginning.
121      *
122      * NOTE. With recycling (and later with fin-wait-2) TW bucket
123      * is _not_ stateless. It means, that strictly speaking we must
124      * spinlock it. I do not want! Well, probability of misbehaviour
125      * is ridiculously low and, seems, we could use some mb() tricks
126      * to avoid misread sequence numbers, states etc.  --ANK
127      */
/* Handle a segment that was matched to a timewait bucket, which is either
 * in true TIME-WAIT or in the "dead" FIN-WAIT-2 substate.
 *
 * tw  - the timewait bucket
 * skb - the segment; TCP_SKB_CB(skb)->seq/end_seq are examined, and for
 *       TCP_TW_SYN the ->when field is overwritten with the proposed ISN
 * th  - TCP header of the segment
 * len - not referenced in this function
 *
 * Return value and what happened to the bucket reference:
 *   TCP_TW_SUCCESS - segment fully handled; tcp_tw_put(tw) was called here
 *                    (on the kill path the bucket was also descheduled
 *                    and unhashed).
 *   TCP_TW_ACK     - an ACK is to be sent; the bucket is NOT put here,
 *                    it is released by the caller.
 *   TCP_TW_RST     - bucket descheduled, unhashed and put here.
 *   TCP_TW_SYN     - acceptable new SYN; no put is done here.
 */
enum tcp_tw_status
tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
			   struct tcphdr *th, unsigned len)
{
	/* Scratch option block: only the fields set below are meaningful. */
	struct tcp_opt tp;
	int paws_reject = 0;

	tp.saw_tstamp = 0;
	/* Options are parsed only when the header carries some and the
	 * bucket remembered a timestamp from the old connection.
	 */
	if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
		tcp_parse_options(skb, &tp, 0);

		if (tp.saw_tstamp) {
			tp.ts_recent = tw->ts_recent;
			tp.ts_recent_stamp = tw->ts_recent_stamp;
			paws_reject = tcp_paws_check(&tp, th->rst);
		}
	}

	if (tw->substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd))
			return TCP_TW_ACK;

		/* Jumps into the TIME-WAIT branch further down. */
		if (th->rst)
			goto kill;

		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			tcp_tw_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after half-duplex close,
		 * reset.  Only a bare FIN (end_seq == rcv_nxt+1) is accepted.
		 */
		if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) {
kill_with_rst:
			tcp_tw_deschedule(tw);
			tcp_timewait_kill(tw);
			tcp_tw_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->substate = TCP_TIME_WAIT;
		tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tp.saw_tstamp) {
			tw->ts_recent_stamp = xtime.tv_sec;
			tw->ts_recent = tp.rcv_tsval;
		}

		/* I am ashamed, but failed to make it more elegant.
		 * Yes, it is a direct reference to IP, which is impossible
		 * to generalize to IPv6. Taking into account that IPv6
		 * does not understand recycling in any case, it is not
		 * a big problem in practice. --ANK */
		if (tw->family == AF_INET &&
		    sysctl_tcp_tw_recycle && tw->ts_recent_stamp &&
		    tcp_v4_tw_remember_stamp(tw))
			tcp_tw_schedule(tw, tw->timeout);
		else
			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 *	Now real TIME-WAIT state.
	 *
	 *	RFC 1122:
	 *	"When a connection is [...] on TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it:
	 *
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out
	 *	to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tw->rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* In window segment, it may be only reset or bare ack. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				tcp_tw_deschedule(tw);
				tcp_timewait_kill(tw);
				tcp_tw_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		/* Bare ACK, or RST with sysctl_tcp_rfc1337 set: restart
		 * the full TIME-WAIT period.
		 */
		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

		if (tp.saw_tstamp) {
			tw->ts_recent = tp.rcv_tsval;
			tw->ts_recent_stamp = xtime.tv_sec;
		}

		tcp_tw_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.

	   All the segments are ACKed immediately.

	   The only exception is new SYN. We accept it, if it is
	   not old duplicate and we are not in danger to be killed
	   by delayed old duplicates. RFC check is that it has
	   newer sequence number works at rates <40Mbit/sec.
	   However, if paws works, it is reliable AND even more,
	   we even may relax silly seq space cutoff.

	   RED-PEN: we violate main RFC requirement, if this SYN will appear
	   old duplicate (i.e. we receive RST in reply to SYN-ACK),
	   we must return socket to time-wait state. It is not good,
	   but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
	     (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) {
		/* Proposed ISN for the new incarnation, passed on through
		 * the skb control block; zero is avoided.
		 */
		u32 isn = tw->snd_nxt+65535+2;
		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->when = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		NET_INC_STATS_BH(PAWSEstabRejected);

	if(!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is ACKless SYN it may be both old duplicate
		 * and new good SYN with random sequence number <rcv_nxt.
		 * Do not reschedule in the last case.
		 */
		if (paws_reject || th->ack)
			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by caller.
		 */
		return TCP_TW_ACK;
	}
	/* Out-of-window RST: dropped without a reply. */
	tcp_tw_put(tw);
	return TCP_TW_SUCCESS;
}
295     
296     /* Enter the time wait state.  This is called with locally disabled BH.
297      * Essentially we whip up a timewait bucket, copy the
298      * relevant info into it from the SK, and mess with hash chains
299      * and list linkage.
300      */
301     static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
302     {
303     	struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
304     	struct tcp_bind_hashbucket *bhead;
305     	struct sock **head, *sktw;
306     
307     	write_lock(&ehead->lock);
308     
309     	/* Step 1: Remove SK from established hash. */
310     	if (sk->pprev) {
311     		if(sk->next)
312     			sk->next->pprev = sk->pprev;
313     		*sk->pprev = sk->next;
314     		sk->pprev = NULL;
315     		sock_prot_dec_use(sk->prot);
316     	}
317     
318     	/* Step 2: Hash TW into TIMEWAIT half of established hash table. */
319     	head = &(ehead + tcp_ehash_size)->chain;
320     	sktw = (struct sock *)tw;
321     	if((sktw->next = *head) != NULL)
322     		(*head)->pprev = &sktw->next;
323     	*head = sktw;
324     	sktw->pprev = head;
325     	atomic_inc(&tw->refcnt);
326     
327     	write_unlock(&ehead->lock);
328     
329     	/* Step 3: Put TW into bind hash. Original socket stays there too.
330     	   Note, that any socket with sk->num!=0 MUST be bound in binding
331     	   cache, even if it is closed.
332     	 */
333     	bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
334     	spin_lock(&bhead->lock);
335     	tw->tb = (struct tcp_bind_bucket *)sk->prev;
336     	BUG_TRAP(sk->prev!=NULL);
337     	if ((tw->bind_next = tw->tb->owners) != NULL)
338     		tw->tb->owners->bind_pprev = &tw->bind_next;
339     	tw->tb->owners = (struct sock*)tw;
340     	tw->bind_pprev = &tw->tb->owners;
341     	spin_unlock(&bhead->lock);
342     }
343     
344     /* 
345      * Move a socket to time-wait or dead fin-wait-2 state.
346      */ 
347     void tcp_time_wait(struct sock *sk, int state, int timeo)
348     {
349     	struct tcp_tw_bucket *tw = NULL;
350     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
351     	int recycle_ok = 0;
352     
353     	if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp)
354     		recycle_ok = tp->af_specific->remember_stamp(sk);
355     
356     	if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
357     		tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
358     
359     	if(tw != NULL) {
360     		int rto = (tp->rto<<2) - (tp->rto>>1);
361     
362     		/* Give us an identity. */
363     		tw->daddr	= sk->daddr;
364     		tw->rcv_saddr	= sk->rcv_saddr;
365     		tw->bound_dev_if= sk->bound_dev_if;
366     		tw->num		= sk->num;
367     		tw->state	= TCP_TIME_WAIT;
368     		tw->substate	= state;
369     		tw->sport	= sk->sport;
370     		tw->dport	= sk->dport;
371     		tw->family	= sk->family;
372     		tw->reuse	= sk->reuse;
373     		tw->rcv_wscale	= tp->rcv_wscale;
374     		atomic_set(&tw->refcnt, 1);
375     
376     		tw->hashent	= sk->hashent;
377     		tw->rcv_nxt	= tp->rcv_nxt;
378     		tw->snd_nxt	= tp->snd_nxt;
379     		tw->rcv_wnd	= tcp_receive_window(tp);
380     		tw->ts_recent	= tp->ts_recent;
381     		tw->ts_recent_stamp= tp->ts_recent_stamp;
382     		tw->pprev_death = NULL;
383     
384     #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
385     		if(tw->family == PF_INET6) {
386     			memcpy(&tw->v6_daddr,
387     			       &sk->net_pinfo.af_inet6.daddr,
388     			       sizeof(struct in6_addr));
389     			memcpy(&tw->v6_rcv_saddr,
390     			       &sk->net_pinfo.af_inet6.rcv_saddr,
391     			       sizeof(struct in6_addr));
392     		}
393     #endif
394     		/* Linkage updates. */
395     		__tcp_tw_hashdance(sk, tw);
396     
397     		/* Get the TIME_WAIT timeout firing. */
398     		if (timeo < rto)
399     			timeo = rto;
400     
401     		if (recycle_ok) {
402     			tw->timeout = rto;
403     		} else {
404     			tw->timeout = TCP_TIMEWAIT_LEN;
405     			if (state == TCP_TIME_WAIT)
406     				timeo = TCP_TIMEWAIT_LEN;
407     		}
408     
409     		tcp_tw_schedule(tw, timeo);
410     		tcp_tw_put(tw);
411     	} else {
412     		/* Sorry, if we're out of memory, just CLOSE this
413     		 * socket up.  We've got bigger problems than
414     		 * non-graceful socket closings.
415     		 */
416     		if (net_ratelimit())
417     			printk(KERN_INFO "TCP: time wait bucket table overflow\n");
418     	}
419     
420     	tcp_update_metrics(sk);
421     	tcp_done(sk);
422     }
423     
/* Kill off TIME_WAIT sockets once their lifetime has expired. */

/* Index of the death-row slot that the next tcp_twkill() run empties;
 * tcp_tw_schedule() places slow-timer buckets relative to it.
 */
static int tcp_tw_death_row_slot = 0;

static void tcp_twkill(unsigned long);

/* Slow timer wheel: one singly headed list (linked through
 * next_death/pprev_death) per slot.
 */
static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS];
/* Taken around every access to the death-row and recycle-calendar lists
 * and to tcp_tw_count in the scheduling/reaping routines below.
 */
static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
/* Periodic timer driving tcp_twkill(). */
static struct timer_list tcp_tw_timer = { function: tcp_twkill };
433     static void SMP_TIMER_NAME(tcp_twkill)(unsigned long dummy)
434     {
435     	struct tcp_tw_bucket *tw;
436     	int killed = 0;
437     
438     	/* NOTE: compare this to previous version where lock
439     	 * was released after detaching chain. It was racy,
440     	 * because tw buckets are scheduled in not serialized context
441     	 * in 2.3 (with netfilter), and with softnet it is common, because
442     	 * soft irqs are not sequenced.
443     	 */
444     	spin_lock(&tw_death_lock);
445     
446     	if (tcp_tw_count == 0)
447     		goto out;
448     
449     	while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) {
450     		tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death;
451     		tw->pprev_death = NULL;
452     		spin_unlock(&tw_death_lock);
453     
454     		tcp_timewait_kill(tw);
455     		tcp_tw_put(tw);
456     
457     		killed++;
458     
459     		spin_lock(&tw_death_lock);
460     	}
461     	tcp_tw_death_row_slot =
462     		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
463     
464     	if ((tcp_tw_count -= killed) != 0)
465     		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
466     	net_statistics[smp_processor_id()*2].TimeWaited += killed;
467     out:
468     	spin_unlock(&tw_death_lock);
469     }
470     
471     SMP_TIMER_DEFINE(tcp_twkill, tcp_twkill_task);
472     
473     /* These are always called from BH context.  See callers in
474      * tcp_input.c to verify this.
475      */
476     
477     /* This is for handling early-kills of TIME_WAIT sockets. */
478     void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
479     {
480     	spin_lock(&tw_death_lock);
481     	if (tw->pprev_death) {
482     		if(tw->next_death)
483     			tw->next_death->pprev_death = tw->pprev_death;
484     		*tw->pprev_death = tw->next_death;
485     		tw->pprev_death = NULL;
486     		tcp_tw_put(tw);
487     		if (--tcp_tw_count == 0)
488     			del_timer(&tcp_tw_timer);
489     	}
490     	spin_unlock(&tw_death_lock);
491     }
492     
/* Short-time timewait calendar */

/* Calendar slot that corresponds to tcp_twcal_jiffie; negative while the
 * calendar is idle (tcp_tw_schedule() starts it, tcp_twcal_tick() sets
 * it back to -1 once a full scan finds nothing pending).
 */
static int tcp_twcal_hand = -1;
/* jiffies value associated with the slot at tcp_twcal_hand. */
static int tcp_twcal_jiffie;
static void tcp_twcal_tick(unsigned long);
/* Timer driving tcp_twcal_tick(); re-armed for the earliest pending slot. */
static struct timer_list tcp_twcal_timer = {function: tcp_twcal_tick};
/* One list of buckets per calendar slot; slots are
 * (1<<TCP_TW_RECYCLE_TICK) jiffies apart.
 */
static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
500     
501     void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
502     {
503     	struct tcp_tw_bucket **tpp;
504     	int slot;
505     
506     	/* timeout := RTO * 3.5
507     	 *
508     	 * 3.5 = 1+2+0.5 to wait for two retransmits.
509     	 *
510     	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
511     	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
512     	 * FINs (or previous seqments) are lost (probability of such event
513     	 * is p^(N+1), where p is probability to lose single packet and
514     	 * time to detect the loss is about RTO*(2^N - 1) with exponential
515     	 * backoff). Normal timewait length is calculated so, that we
516     	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
517     	 * [ BTW Linux. following BSD, violates this requirement waiting
518     	 *   only for 60sec, we should wait at least for 240 secs.
519     	 *   Well, 240 consumes too much of resources 8)
520     	 * ]
521     	 * This interval is not reduced to catch old duplicate and
522     	 * responces to our wandering segments living for two MSLs.
523     	 * However, if we use PAWS to detect
524     	 * old duplicates, we can reduce the interval to bounds required
525     	 * by RTO, rather than MSL. So, if peer understands PAWS, we
526     	 * kill tw bucket after 3.5*RTO (it is important that this number
527     	 * is greater than TS tick!) and detect old duplicates with help
528     	 * of PAWS.
529     	 */
530     	slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
531     
532     	spin_lock(&tw_death_lock);
533     
534     	/* Unlink it, if it was scheduled */
535     	if (tw->pprev_death) {
536     		if(tw->next_death)
537     			tw->next_death->pprev_death = tw->pprev_death;
538     		*tw->pprev_death = tw->next_death;
539     		tw->pprev_death = NULL;
540     		tcp_tw_count--;
541     	} else
542     		atomic_inc(&tw->refcnt);
543     
544     	if (slot >= TCP_TW_RECYCLE_SLOTS) {
545     		/* Schedule to slow timer */
546     		if (timeo >= TCP_TIMEWAIT_LEN) {
547     			slot = TCP_TWKILL_SLOTS-1;
548     		} else {
549     			slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
550     			if (slot >= TCP_TWKILL_SLOTS)
551     				slot = TCP_TWKILL_SLOTS-1;
552     		}
553     		tw->ttd = jiffies + timeo;
554     		slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
555     		tpp = &tcp_tw_death_row[slot];
556     	} else {
557     		tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK);
558     
559     		if (tcp_twcal_hand < 0) {
560     			tcp_twcal_hand = 0;
561     			tcp_twcal_jiffie = jiffies;
562     			tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
563     			add_timer(&tcp_twcal_timer);
564     		} else {
565     			if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK))
566     				mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
567     			slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
568     		}
569     		tpp = &tcp_twcal_row[slot];
570     	}
571     
572     	if((tw->next_death = *tpp) != NULL)
573     		(*tpp)->pprev_death = &tw->next_death;
574     	*tpp = tw;
575     	tw->pprev_death = tpp;
576     
577     	if (tcp_tw_count++ == 0)
578     		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
579     	spin_unlock(&tw_death_lock);
580     }
581     
582     void SMP_TIMER_NAME(tcp_twcal_tick)(unsigned long dummy)
583     {
584     	int n, slot;
585     	unsigned long j;
586     	unsigned long now = jiffies;
587     	int killed = 0;
588     	int adv = 0;
589     
590     	spin_lock(&tw_death_lock);
591     	if (tcp_twcal_hand < 0)
592     		goto out;
593     
594     	slot = tcp_twcal_hand;
595     	j = tcp_twcal_jiffie;
596     
597     	for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
598     		if ((long)(j - now) <= 0) {
599     			struct tcp_tw_bucket *tw;
600     
601     			while((tw = tcp_twcal_row[slot]) != NULL) {
602     				tcp_twcal_row[slot] = tw->next_death;
603     				tw->pprev_death = NULL;
604     
605     				tcp_timewait_kill(tw);
606     				tcp_tw_put(tw);
607     				killed++;
608     			}
609     		} else {
610     			if (!adv) {
611     				adv = 1;
612     				tcp_twcal_jiffie = j;
613     				tcp_twcal_hand = slot;
614     			}
615     
616     			if (tcp_twcal_row[slot] != NULL) {
617     				mod_timer(&tcp_twcal_timer, j);
618     				goto out;
619     			}
620     		}
621     		j += (1<<TCP_TW_RECYCLE_TICK);
622     		slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
623     	}
624     	tcp_twcal_hand = -1;
625     
626     out:
627     	if ((tcp_tw_count -= killed) == 0)
628     		del_timer(&tcp_tw_timer);
629     	net_statistics[smp_processor_id()*2].TimeWaitKilled += killed;
630     	spin_unlock(&tw_death_lock);
631     }
632     
633     SMP_TIMER_DEFINE(tcp_twcal_tick, tcp_twcal_tasklet);
634     
635     
/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could save lots of memory writes here. tp of listening
 * socket contains all necessary default parameters.
 *
 * Build the child socket for an open request.
 *
 * sk  - parent socket; the child starts out as a byte copy of it
 * req - open request supplying the sequence numbers, negotiated options,
 *       window parameters and remote port
 * skb - segment being processed; its TCP window field and length are used
 *
 * Returns the new socket in TCP_SYN_RECV state, bh-locked and with a
 * reference count of 2, or NULL if sk_alloc() failed.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
{
	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);

	if(newsk != NULL) {
		struct tcp_opt *newtp;
#ifdef CONFIG_FILTER
		struct sk_filter *filter;
#endif

		/* Clone the parent wholesale; everything below resets the
		 * fields that must not be shared with it.
		 */
		memcpy(newsk, sk, sizeof(*newsk));
		newsk->state = TCP_SYN_RECV;

		/* SANITY */
		newsk->pprev = NULL;
		newsk->prev = NULL;

		/* Clone the TCP header template */
		newsk->dport = req->rmt_port;

		/* The child is handed back locked. */
		sock_lock_init(newsk);
		bh_lock_sock(newsk);

		/* Fresh locks, queues and memory accounting. */
		newsk->dst_lock	= RW_LOCK_UNLOCKED;
		atomic_set(&newsk->rmem_alloc, 0);
		skb_queue_head_init(&newsk->receive_queue);
		atomic_set(&newsk->wmem_alloc, 0);
		skb_queue_head_init(&newsk->write_queue);
		atomic_set(&newsk->omem_alloc, 0);
		newsk->wmem_queued = 0;
		newsk->forward_alloc = 0;

		newsk->done = 0;
		/* The copied userlocks lose the SOCK_BINDPORT_LOCK bit. */
		newsk->userlocks = sk->userlocks & ~SOCK_BINDPORT_LOCK;
		newsk->proc = 0;
		newsk->backlog.head = newsk->backlog.tail = NULL;
		newsk->callback_lock = RW_LOCK_UNLOCKED;
		skb_queue_head_init(&newsk->error_queue);
		newsk->write_space = tcp_write_space;
#ifdef CONFIG_FILTER
		/* The filter pointer came over with the memcpy; charge it
		 * to the child as well.
		 */
		if ((filter = newsk->filter) != NULL)
			sk_filter_charge(newsk, filter);
#endif

		/* Now setup tcp_opt */
		newtp = &(newsk->tp_pinfo.af_tcp);
		newtp->pred_flags = 0;
		/* Both SYNs consume one sequence number. */
		newtp->rcv_nxt = req->rcv_isn + 1;
		newtp->snd_nxt = req->snt_isn + 1;
		newtp->snd_una = req->snt_isn + 1;
		newtp->snd_sml = req->snt_isn + 1;

		tcp_prequeue_init(newtp);

		tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);

		/* Retransmission state starts from the initial timeout. */
		newtp->retransmits = 0;
		newtp->backoff = 0;
		newtp->srtt = 0;
		newtp->mdev = TCP_TIMEOUT_INIT;
		newtp->rto = TCP_TIMEOUT_INIT;

		newtp->packets_out = 0;
		newtp->left_out = 0;
		newtp->retrans_out = 0;
		newtp->sacked_out = 0;
		newtp->fackets_out = 0;
		newtp->snd_ssthresh = 0x7fffffff;

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = 2;
		newtp->snd_cwnd_cnt = 0;

		newtp->ca_state = TCP_CA_Open;
		tcp_init_xmit_timers(newsk);
		skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->send_head = NULL;
		newtp->rcv_wup = req->rcv_isn + 1;
		newtp->write_seq = req->snt_isn + 1;
		newtp->pushed_seq = newtp->write_seq;
		newtp->copied_seq = req->rcv_isn + 1;

		newtp->saw_tstamp = 0;

		newtp->dsack = 0;
		newtp->eff_sacks = 0;

		newtp->probes_out = 0;
		newtp->num_sacks = 0;
		newtp->urg_data = 0;
		/* The child does not inherit the parent's listen state. */
		newtp->listen_opt = NULL;
		newtp->accept_queue = newtp->accept_queue_tail = NULL;
		/* Deinitialize syn_wait_lock to trap illegal accesses. */
		memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));

		/* Back to base struct sock members. */
		newsk->err = 0;
		newsk->priority = 0;
		atomic_set(&newsk->refcnt, 2);
#ifdef INET_REFCNT_DEBUG
		atomic_inc(&inet_sock_nr);
#endif
		atomic_inc(&tcp_sockets_allocated);

		if (newsk->keepopen)
			tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
		newsk->socket = NULL;
		newsk->sleep = NULL;

		/* Options recorded in the open request. */
		newtp->tstamp_ok = req->tstamp_ok;
		if((newtp->sack_ok = req->sack_ok) != 0) {
			/* Value 2 is OR-ed in when sysctl_tcp_fack is on. */
			if (sysctl_tcp_fack)
				newtp->sack_ok |= 2;
		}
		newtp->window_clamp = req->window_clamp;
		newtp->rcv_ssthresh = req->rcv_wnd;
		newtp->rcv_wnd = req->rcv_wnd;
		newtp->wscale_ok = req->wscale_ok;
		if (newtp->wscale_ok) {
			newtp->snd_wscale = req->snd_wscale;
			newtp->rcv_wscale = req->rcv_wscale;
		} else {
			/* No window scaling: clamp to what 16 bits can carry. */
			newtp->snd_wscale = newtp->rcv_wscale = 0;
			newtp->window_clamp = min_t(u32, newtp->window_clamp, 65535);
		}
		/* Peer's advertised window from this segment, scaled. */
		newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale;
		newtp->max_window = newtp->snd_wnd;

		if (newtp->tstamp_ok) {
			newtp->ts_recent = req->ts_recent;
			newtp->ts_recent_stamp = xtime.tv_sec;
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->ts_recent_stamp = 0;
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
		/* Record the payload size when the segment is large enough. */
		if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
			newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
		newtp->mss_clamp = req->mss;
		TCP_ECN_openreq_child(newtp, req);
	}
	return newsk;
}
789     
/*
 *	Process an incoming packet for SYN_RECV sockets represented
 *	as an open_request.
 *
 *	sk   - socket owning the open request
 *	skb  - the incoming segment
 *	req  - the open request the segment was matched to
 *	prev - handed on to tcp_synq_unlink()/tcp_synq_drop() when the
 *	       request leaves the SYN queue
 *
 *	Returns:
 *	  the new child socket when the handshake-completing ACK is
 *	    accepted (the request is moved to the accept queue);
 *	  sk itself when the ACK number is invalid, so that the reset is
 *	    sent by the listening socket;
 *	  NULL in every other case (SYN-ACK retransmitted, ACK sent,
 *	    segment dropped, or request reset and dropped).
 */

struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
			   struct open_request *req,
			   struct open_request **prev)
{
	struct tcphdr *th = skb->h.th;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	/* Only RST, SYN and ACK matter for the decisions below. */
	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	int paws_reject = 0;
	/* Scratch option block used for option parsing and the PAWS check. */
	struct tcp_opt ttp;
	struct sock *child;

	ttp.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
		tcp_parse_options(skb, &ttp, 0);

		if (ttp.saw_tstamp) {
			ttp.ts_recent = req->ts_recent;
			/* We do not store true stamp, but it is not required,
			 * it can be estimated (approximately)
			 * from another data.
			 */
			ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
			paws_reject = tcp_paws_check(&ttp, th->rst);
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 *  describe SYN-RECV state. All the description
		 *  is wrong, we cannot believe to it and should
		 *  rely only on common sense and implementation
		 *  experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 */
		req->class->rtx_syn_ack(sk, req, NULL);
		return NULL;
	}

	/* Further reproduces section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken, however, it does not work only
	   when SYNs are crossed, which is impossible in our
	   case.

	   But generally, we should (RFC lies!) to accept ACK
	   from SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence, we do not too.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating protocol. All the checks must be made
	   before attempt to create socket.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->class->send_ack(skb, req);
		if (paws_reject)
			NET_INC_STATS_BH(PAWSEstabRejected);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	/* Remember the peer's timestamp only from a segment that starts
	 * no later than rcv_isn+1.
	 */
	if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
		req->ts_recent = ttp.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at req->rcv_isn+1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
		goto embryonic_reset;

	/* RFC793: "fifth check the ACK field" */

	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* Invalid ACK: reset will be sent by listening socket */
	if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
		return sk;
	/* Also, it would be not so bad idea to check rcv_tsecr, which
	 * is essentially ACK extension and too early or too late values
	 * should cause reset in unsynchronized states.
	 */

	/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
	if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
		req->acked = 1;
		return NULL;
	}

	/* OK, ACK is valid, create big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
	 * ESTABLISHED STATE. If it will be dropped after
	 * socket is created, wait for troubles.
	 */
	child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		goto listen_overflow;

	/* Move the request from the SYN queue to the accept queue. */
	tcp_synq_unlink(tp, req, prev);
	tcp_synq_removed(sk, req);

	tcp_acceptq_queue(sk, req, child);
	return child;

listen_overflow:
	/* Child could not be created.  Unless told to abort, keep the
	 * request and just note that the ACK has been seen.
	 */
	if (!sysctl_tcp_abort_on_overflow) {
		req->acked = 1;
		return NULL;
	}

	/* Reached by goto, and by fall-through when aborting on overflow. */
embryonic_reset:
	NET_INC_STATS_BH(EmbryonicRsts);
	/* Never answer a RST with a RST. */
	if (!(flg & TCP_FLAG_RST))
		req->class->send_reset(skb);

	tcp_synq_drop(sk, req, prev);
	return NULL;
}
940     
941     /*
942      * Queue segment on the new socket if the new socket is active,
943      * otherwise we just shortcircuit this and continue with
944      * the new socket.
945      */
946     
947     int tcp_child_process(struct sock *parent, struct sock *child,
948     		      struct sk_buff *skb)
949     {
950     	int ret = 0;
951     	int state = child->state;
952     
953     	if (child->lock.users == 0) {
954     		ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
955     
956     		/* Wakeup parent, send SIGIO */
957     		if (state == TCP_SYN_RECV && child->state != state)
958     			parent->data_ready(parent, 0);
959     	} else {
960     		/* Alas, it is possible again, because we do lookup
961     		 * in main socket hash table and lock on listening
962     		 * socket does not protect us more.
963     		 */
964     		sk_add_backlog(child, skb);
965     	}
966     
967     	bh_unlock_sock(child);
968     	sock_put(child);
969     	return ret;
970     }
971