File: /usr/src/linux/net/ipv4/tcp_output.c

1     /*
2      * INET		An implementation of the TCP/IP protocol suite for the LINUX
3      *		operating system.  INET is implemented using the  BSD Socket
4      *		interface as the means of communication with the user level.
5      *
6      *		Implementation of the Transmission Control Protocol(TCP).
7      *
8      * Version:	$Id: tcp_output.c,v 1.141 2001/09/18 22:29:10 davem Exp $
9      *
10      * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
11      *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12      *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13      *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14      *		Florian La Roche, <flla@stud.uni-sb.de>
15      *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16      *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17      *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18      *		Matthew Dillon, <dillon@apollo.west.oic.com>
19      *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20      *		Jorge Cwik, <jorge@laser.satlink.net>
21      */
22     
23     /*
24      * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
25      *				:	Fragmentation on mtu decrease
26      *				:	Segment collapse on retransmit
27      *				:	AF independence
28      *
29      *		Linus Torvalds	:	send_delayed_ack
30      *		David S. Miller	:	Charge memory using the right skb
31      *					during syn/ack processing.
32      *		David S. Miller :	Output engine completely rewritten.
33      *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
34      *		Cacophonix Gaul :	draft-minshall-nagle-01
35      *		J Hadi Salim	:	ECN support
36      *
37      */
38     
39     #include <net/tcp.h>
40     
41     #include <linux/smp_lock.h>
42     
43     /* People can turn this off for buggy TCP's found in printers etc.
        * When non-zero (the default), tcp_retransmit_skb() may try to merge a
        * small segment with the one queued after it before retransmitting;
        * setting it to zero disables that merging.
        */
44     int sysctl_tcp_retrans_collapse = 1;
45     
46     static __inline__
47     void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
48     {
49     	tp->send_head = skb->next;
50     	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
51     		tp->send_head = NULL;
52     	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
53     	if (tp->packets_out++ == 0)
54     		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
55     }
56     
57     /* SND.NXT, if window was not shrunk.
58      * If window has been shrunk, what should we make? It is not clear at all.
59      * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
60      * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
61      * invalid. OK, let's make this for now:
62      */
63     static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
64     {
65     	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
66     		return tp->snd_nxt;
67     	else
68     		return tp->snd_una+tp->snd_wnd;
69     }
70     
71     /* Calculate mss to advertise in SYN segment.
72      * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
73      *
74      * 1. It is independent of path mtu.
75      * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
76      * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
77      *    attached devices, because some buggy hosts are confused by
78      *    large MSS.
79      * 4. We do not make 3, we advertise MSS, calculated from first
80      *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
80      *    This may be overridden via information stored in routing table.
82      * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
83      *    probably even Jumbo".
84      */
85     static __u16 tcp_advertise_mss(struct sock *sk)
86     {
87     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
88     	struct dst_entry *dst = __sk_dst_get(sk);
89     	int mss = tp->advmss;
90     
91     	if (dst && dst->advmss < mss) {
92     		mss = dst->advmss;
93     		tp->advmss = mss;
94     	}
95     
96     	return (__u16)mss;
97     }
98     
99     /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
100      * This is the first part of cwnd validation mechanism. */
101     static void tcp_cwnd_restart(struct tcp_opt *tp)
102     {
103     	s32 delta = tcp_time_stamp - tp->lsndtime;
104     	u32 restart_cwnd = tcp_init_cwnd(tp);
105     	u32 cwnd = tp->snd_cwnd;
106     
107     	tp->snd_ssthresh = tcp_current_ssthresh(tp);
108     	restart_cwnd = min_t(u32, restart_cwnd, cwnd);
109     
110     	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
111     		cwnd >>= 1;
112     	tp->snd_cwnd = max_t(u32, cwnd, restart_cwnd);
113     	tp->snd_cwnd_stamp = tcp_time_stamp;
114     	tp->snd_cwnd_used = 0;
115     }
116     
117     static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
118     {
119     	u32 now = tcp_time_stamp;
120     
121     	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
122     		tcp_cwnd_restart(tp);
123     
124     	tp->lsndtime = now;
125     
126     	/* If it is a reply for ato after last received
127     	 * packet, enter pingpong mode.
128     	 */
129     	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
130     		tp->ack.pingpong = 1;
131     }
132     
133     static __inline__ void tcp_event_ack_sent(struct sock *sk)
134     {
135     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
136     
137     	tcp_dec_quickack_mode(tp);
138     	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
139     }
140     
141     /* Chose a new window to advertise, update state in tcp_opt for the
142      * socket, and return result with RFC1323 scaling applied.  The return
143      * value can be stuffed directly into th->window for an outgoing
144      * frame.
145      */
146     static __inline__ u16 tcp_select_window(struct sock *sk)
147     {
148     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
149     	u32 cur_win = tcp_receive_window(tp);
150     	u32 new_win = __tcp_select_window(sk);
151     
152     	/* Never shrink the offered window */
153     	if(new_win < cur_win) {
154     		/* Danger Will Robinson!
155     		 * Don't update rcv_wup/rcv_wnd here or else
156     		 * we will not be able to advertise a zero
157     		 * window in time.  --DaveM
158     		 *
159     		 * Relax Will Robinson.
160     		 */
161     		new_win = cur_win;
162     	}
163     	tp->rcv_wnd = new_win;
164     	tp->rcv_wup = tp->rcv_nxt;
165     
166     	/* RFC1323 scaling applied */
167     	new_win >>= tp->rcv_wscale;
168     
169     	/* If we advertise zero window, disable fast path. */
170     	if (new_win == 0)
171     		tp->pred_flags = 0;
172     
173     	return new_win;
174     }
175     
176     
177     /* This routine actually transmits TCP packets queued in by
178      * tcp_do_sendmsg().  This is used by both the initial
179      * transmission and possible later retransmissions.
180      * All SKB's seen here are completely headerless.  It is our
181      * job to build the TCP header, and pass the packet down to
182      * IP so it can do the same plus pass the packet off to the
183      * device.
184      *
185      * We are working here with either a clone of the original
186      * SKB, or a fresh unique copy made by the retransmit engine.
187      */
188     int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
189     {
190     	if(skb != NULL) {
191     		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
192     		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
193     		int tcp_header_size = tp->tcp_header_len;
194     		struct tcphdr *th;
195     		int sysctl_flags;
196     		int err;
197     
198     #define SYSCTL_FLAG_TSTAMPS	0x1
199     #define SYSCTL_FLAG_WSCALE	0x2
200     #define SYSCTL_FLAG_SACK	0x4
201     
202     		sysctl_flags = 0;
203     		if (tcb->flags & TCPCB_FLAG_SYN) {
204     			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
205     			if(sysctl_tcp_timestamps) {
206     				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
207     				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
208     			}
209     			if(sysctl_tcp_window_scaling) {
210     				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
211     				sysctl_flags |= SYSCTL_FLAG_WSCALE;
212     			}
213     			if(sysctl_tcp_sack) {
214     				sysctl_flags |= SYSCTL_FLAG_SACK;
215     				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
216     					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
217     			}
218     		} else if (tp->eff_sacks) {
219     			/* A SACK is 2 pad bytes, a 2 byte header, plus
220     			 * 2 32-bit sequence numbers for each SACK block.
221     			 */
222     			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
223     					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
224     		}
225     		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
226     		skb->h.th = th;
227     		skb_set_owner_w(skb, sk);
228     
229     		/* Build TCP header and checksum it. */
230     		th->source		= sk->sport;
231     		th->dest		= sk->dport;
232     		th->seq			= htonl(tcb->seq);
233     		th->ack_seq		= htonl(tp->rcv_nxt);
234     		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
235     		if (tcb->flags & TCPCB_FLAG_SYN) {
236     			/* RFC1323: The window in SYN & SYN/ACK segments
237     			 * is never scaled.
238     			 */
239     			th->window	= htons(tp->rcv_wnd);
240     		} else {
241     			th->window	= htons(tcp_select_window(sk));
242     		}
243     		th->check		= 0;
244     		th->urg_ptr		= 0;
245     
246     		if (tp->urg_mode &&
247     		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
248     			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
249     			th->urg			= 1;
250     		}
251     
252     		if (tcb->flags & TCPCB_FLAG_SYN) {
253     			tcp_syn_build_options((__u32 *)(th + 1),
254     					      tcp_advertise_mss(sk),
255     					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
256     					      (sysctl_flags & SYSCTL_FLAG_SACK),
257     					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
258     					      tp->rcv_wscale,
259     					      tcb->when,
260     		      			      tp->ts_recent);
261     		} else {
262     			tcp_build_and_update_options((__u32 *)(th + 1),
263     						     tp, tcb->when);
264     
265     			TCP_ECN_send(sk, tp, skb, tcp_header_size);
266     		}
267     		tp->af_specific->send_check(sk, th, skb->len, skb);
268     
269     		if (tcb->flags & TCPCB_FLAG_ACK)
270     			tcp_event_ack_sent(sk);
271     
272     		if (skb->len != tcp_header_size)
273     			tcp_event_data_sent(tp, skb);
274     
275     		TCP_INC_STATS(TcpOutSegs);
276     
277     		err = tp->af_specific->queue_xmit(skb);
278     		if (err <= 0)
279     			return err;
280     
281     		tcp_enter_cwr(tp);
282     
283     		/* NET_XMIT_CN is special. It does not guarantee,
284     		 * that this packet is lost. It tells that device
285     		 * is about to start to drop packets or already
286     		 * drops some packets of the same priority and
287     		 * invokes us to send less aggressively.
288     		 */
289     		return err == NET_XMIT_CN ? 0 : err;
290     	}
291     	return -ENOBUFS;
292     #undef SYSCTL_FLAG_TSTAMPS
293     #undef SYSCTL_FLAG_WSCALE
294     #undef SYSCTL_FLAG_SACK
295     }
296     
297     
298     /* This is the main buffer sending routine. We queue the buffer
299      * and decide whether to queue or transmit now.
300      *
301      * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
302      * otherwise socket can stall.
303      */
304     void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
305     {
306     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
307     
308     	/* Advance write_seq and place onto the write_queue. */
309     	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
310     	__skb_queue_tail(&sk->write_queue, skb);
311     	tcp_charge_skb(sk, skb);
312     
313     	if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
314     		/* Send it out now. */
315     		TCP_SKB_CB(skb)->when = tcp_time_stamp;
316     		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
317     			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
318     			tcp_minshall_update(tp, cur_mss, skb);
319     			if (tp->packets_out++ == 0)
320     				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
321     			return;
322     		}
323     	}
324     	/* Queue it, remembering where we must start sending. */
325     	if (tp->send_head == NULL)
326     		tp->send_head = skb;
327     }
328     
329     /* Send _single_ skb sitting at the send head. This function requires
330      * true push pending frames to setup probe timer etc.
331      */
332     void tcp_push_one(struct sock *sk, unsigned cur_mss)
333     {
334     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
335     	struct sk_buff *skb = tp->send_head;
336     
337     	if (tcp_snd_test(tp, skb, cur_mss, 1)) {
338     		/* Send it out now. */
339     		TCP_SKB_CB(skb)->when = tcp_time_stamp;
340     		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
341     			tp->send_head = NULL;
342     			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
343     			if (tp->packets_out++ == 0)
344     				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
345     			return;
346     		}
347     	}
348     }
349     
350     /* Split fragmented skb to two parts at length len. */
351     
352     static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
353     {
354     	int i;
355     	int pos = skb->len - skb->data_len;
356     
357     	if (len < pos) {
358     		/* Split line is inside header. */
359     		memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);
360     
361     		/* And move data appendix as is. */
362     		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
363     			skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
364     
365     		skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
366     		skb_shinfo(skb)->nr_frags = 0;
367     
368     		skb1->data_len = skb->data_len;
369     		skb1->len += skb1->data_len;
370     		skb->data_len = 0;
371     		skb->len = len;
372     		skb->tail = skb->data+len;
373     	} else {
374     		int k = 0;
375     		int nfrags = skb_shinfo(skb)->nr_frags;
376     
377     		/* Second chunk has no header, nothing to copy. */
378     
379     		skb_shinfo(skb)->nr_frags = 0;
380     		skb1->len = skb1->data_len = skb->len - len;
381     		skb->len = len;
382     		skb->data_len = len - pos;
383     
384     		for (i=0; i<nfrags; i++) {
385     			int size = skb_shinfo(skb)->frags[i].size;
386     			if (pos + size > len) {
387     				skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
388     
389     				if (pos < len) {
390     					/* Split frag.
391     					 * We have to variants in this case:
392     					 * 1. Move all the frag to the second
393     					 *    part, if it is possible. F.e.
394     					 *    this approach is mandatory for TUX,
395     					 *    where splitting is expensive.
396     					 * 2. Split is accurately. We make this.
397     					 */
398     					get_page(skb_shinfo(skb)->frags[i].page);
399     					skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
400     					skb_shinfo(skb1)->frags[0].size -= (len-pos);
401     					skb_shinfo(skb)->frags[i].size = len-pos;
402     					skb_shinfo(skb)->nr_frags++;
403     				}
404     				k++;
405     			} else {
406     				skb_shinfo(skb)->nr_frags++;
407     			}
408     			pos += size;
409     		}
410     		skb_shinfo(skb1)->nr_frags = k;
411     	}
412     }
413     
414     /* Function to create two new TCP segments.  Shrinks the given segment
415      * to the specified size and appends a new segment with the rest of the
416      * packet to the list.  This won't be called frequently, I hope. 
417      * Remember, these are still headerless SKBs at this point.
418      */
419     static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
420     {
421     	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
422     	struct sk_buff *buff;
423     	int nsize = skb->len - len;
424     	u16 flags;
425     
426     	if (skb_cloned(skb) &&
427     	    skb_is_nonlinear(skb) &&
428     	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
429     		return -ENOMEM;
430     
431     	/* Get a new skb... force flag on. */
432     	buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
433     	if (buff == NULL)
434     		return -ENOMEM; /* We'll just try again later. */
435     	tcp_charge_skb(sk, buff);
436     
437     	/* Correct the sequence numbers. */
438     	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
439     	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
440     	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
441     
442     	/* PSH and FIN should only be set in the second packet. */
443     	flags = TCP_SKB_CB(skb)->flags;
444     	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
445     	TCP_SKB_CB(buff)->flags = flags;
446     	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
447     	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
448     		tp->lost_out++;
449     		tp->left_out++;
450     	}
451     	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
452     
453     	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
454     		/* Copy and checksum data tail into the new buffer. */
455     		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
456     						       nsize, 0);
457     
458     		skb_trim(skb, len);
459     
460     		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
461     	} else {
462     		skb->ip_summed = CHECKSUM_HW;
463     		skb_split(skb, buff, len);
464     	}
465     
466     	buff->ip_summed = skb->ip_summed;
467     
468     	/* Looks stupid, but our code really uses when of
469     	 * skbs, which it never sent before. --ANK
470     	 */
471     	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
472     
473     	/* Link BUFF into the send queue. */
474     	__skb_append(skb, buff);
475     
476     	return 0;
477     }
478     
479     /* This function synchronizes snd mss to current pmtu/exthdr set.
480     
481        tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count
482        for TCP options, but includes only bare TCP header.
483     
484        tp->mss_clamp is mss negotiated at connection setup.
485        It is minimum of user_mss and mss received with SYN.
486        It also does not include TCP options.
487     
488        tp->pmtu_cookie is last pmtu, seen by this function.
489     
490        tp->mss_cache is current effective sending mss, including
491        all tcp options except for SACKs. It is evaluated,
492        taking into account current pmtu, but never exceeds
493        tp->mss_clamp.
494     
495        NOTE1. rfc1122 clearly states that advertised MSS
496        DOES NOT include either tcp or ip options.
497     
498        NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
499        this function.			--ANK (980731)
500      */
501     
502     int tcp_sync_mss(struct sock *sk, u32 pmtu)
503     {
504     	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
505     	int mss_now;
506     
507     	/* Calculate base mss without TCP options:
508     	   It is MMS_S - sizeof(tcphdr) of rfc1122
509     	 */
510     
511     	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
512     
513     	/* Clamp it (mss_clamp does not include tcp options) */
514     	if (mss_now > tp->mss_clamp)
515     		mss_now = tp->mss_clamp;
516     
517     	/* Now subtract optional transport overhead */
518     	mss_now -= tp->ext_header_len;
519     
520     	/* Then reserve room for full set of TCP options and 8 bytes of data */
521     	if (mss_now < 48)
522     		mss_now = 48;
523     
524     	/* Now subtract TCP options size, not including SACKs */
525     	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
526     
527     	/* Bound mss with half of window */
528     	if (tp->max_window && mss_now > (tp->max_window>>1))
529     		mss_now = max_t(u32, (tp->max_window>>1), 68 - tp->tcp_header_len);
530     
531     	/* And store cached results */
532     	tp->pmtu_cookie = pmtu;
533     	tp->mss_cache = mss_now;
534     	return mss_now;
535     }
536     
537     
538     /* This routine writes packets to the network.  It advances the
539      * send_head.  This happens as incoming acks open up the remote
540      * window for us.
541      *
542      * Returns 1, if no segments are in flight and we have queued segments, but
543      * cannot send anything now because of SWS or another problem.
544      */
545     int tcp_write_xmit(struct sock *sk, int nonagle)
546     {
547     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
548     	unsigned int mss_now;
549     
550     	/* If we are closed, the bytes will have to remain here.
551     	 * In time closedown will finish, we empty the write queue and all
552     	 * will be happy.
553     	 */
554     	if(sk->state != TCP_CLOSE) {
555     		struct sk_buff *skb;
556     		int sent_pkts = 0;
557     
558     		/* Account for SACKS, we may need to fragment due to this.
559     		 * It is just like the real MSS changing on us midstream.
560     		 * We also handle things correctly when the user adds some
561     		 * IP options mid-stream.  Silly to do, but cover it.
562     		 */
563     		mss_now = tcp_current_mss(sk); 
564     
565     		while((skb = tp->send_head) &&
566     		      tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) {
567     			if (skb->len > mss_now) {
568     				if (tcp_fragment(sk, skb, mss_now))
569     					break;
570     			}
571     
572     			TCP_SKB_CB(skb)->when = tcp_time_stamp;
573     			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
574     				break;
575     			/* Advance the send_head.  This one is sent out. */
576     			update_send_head(sk, tp, skb);
577     			tcp_minshall_update(tp, mss_now, skb);
578     			sent_pkts = 1;
579     		}
580     
581     		if (sent_pkts) {
582     			tcp_cwnd_validate(sk, tp);
583     			return 0;
584     		}
585     
586     		return !tp->packets_out && tp->send_head;
587     	}
588     	return 0;
589     }
590     
591     /* This function returns the amount that we can raise the
592      * usable window based on the following constraints
593      *  
594      * 1. The window can never be shrunk once it is offered (RFC 793)
595      * 2. We limit memory per socket
596      *
597      * RFC 1122:
598      * "the suggested [SWS] avoidance algorithm for the receiver is to keep
599      *  RECV.NEXT + RCV.WIN fixed until:
600      *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
601      *
602      * i.e. don't raise the right edge of the window until you can raise
603      * it at least MSS bytes.
604      *
605      * Unfortunately, the recommended algorithm breaks header prediction,
606      * since header prediction assumes th->window stays fixed.
607      *
608      * Strictly speaking, keeping th->window fixed violates the receiver
609      * side SWS prevention criteria. The problem is that under this rule
610      * a stream of single byte packets will cause the right side of the
611      * window to always advance by a single byte.
612      * 
613      * Of course, if the sender implements sender side SWS prevention
614      * then this will not be a problem.
615      * 
616      * BSD seems to make the following compromise:
617      * 
618      *	If the free space is less than the 1/4 of the maximum
619      *	space available and the free space is less than 1/2 mss,
620      *	then set the window to 0.
621      *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
622      *	Otherwise, just prevent the window from shrinking
623      *	and from being larger than the largest representable value.
624      *
625      * This prevents incremental opening of the window in the regime
626      * where TCP is limited by the speed of the reader side taking
627      * data out of the TCP receive queue. It does nothing about
628      * those cases where the window is constrained on the sender side
629      * because the pipeline is full.
630      *
631      * BSD also seems to "accidentally" limit itself to windows that are a
632      * multiple of MSS, at least until the free space gets quite small.
633      * This would appear to be a side effect of the mbuf implementation.
634      * Combining these two algorithms results in the observed behavior
635      * of having a fixed window size at almost all times.
636      *
637      * Below we obtain similar behavior by forcing the offered window to
638      * a multiple of the mss when it is feasible to do so.
639      *
640      * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
641      * Regular options like TIMESTAMP are taken into account.
642      */
643     u32 __tcp_select_window(struct sock *sk)
644     {
645     	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
646     	/* MSS for the peer's data.  Previous verions used mss_clamp
647     	 * here.  I don't know if the value based on our guesses
648     	 * of peer's MSS is better for the performance.  It's more correct
649     	 * but may be worse for the performance because of rcv_mss
650     	 * fluctuations.  --SAW  1998/11/1
651     	 */
652     	int mss = tp->ack.rcv_mss;
653     	int free_space = tcp_space(sk);
654     	int full_space = min_t(unsigned int, tp->window_clamp, tcp_full_space(sk));
655     	int window;
656     
657     	if (mss > full_space)
658     		mss = full_space; 
659     
660     	if (free_space < full_space/2) {
661     		tp->ack.quick = 0;
662     
663     		if (tcp_memory_pressure)
664     			tp->rcv_ssthresh = min_t(u32, tp->rcv_ssthresh, 4*tp->advmss);
665     
666     		if (free_space < mss)
667     			return 0;
668     	}
669     
670     	if (free_space > tp->rcv_ssthresh)
671     		free_space = tp->rcv_ssthresh;
672     
673     	/* Get the largest window that is a nice multiple of mss.
674     	 * Window clamp already applied above.
675     	 * If our current window offering is within 1 mss of the
676     	 * free space we just keep it. This prevents the divide
677     	 * and multiply from happening most of the time.
678     	 * We also don't do any window rounding when the free space
679     	 * is too small.
680     	 */
681     	window = tp->rcv_wnd;
682     	if (window <= free_space - mss || window > free_space)
683     		window = (free_space/mss)*mss;
684     
685     	return window;
686     }
687     
688     /* Attempt to collapse two adjacent SKB's during retransmission. */
689     static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
690     {
691     	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
692     	struct sk_buff *next_skb = skb->next;
693     
694     	/* The first test we must make is that neither of these two
695     	 * SKB's are still referenced by someone else.
696     	 */
697     	if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
698     		int skb_size = skb->len, next_skb_size = next_skb->len;
699     		u16 flags = TCP_SKB_CB(skb)->flags;
700     
701     		/* Also punt if next skb has been SACK'd. */
702     		if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
703     			return;
704     
705     		/* Next skb is out of window. */
706     		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
707     			return;
708     
709     		/* Punt if not enough space exists in the first SKB for
710     		 * the data in the second, or the total combined payload
711     		 * would exceed the MSS.
712     		 */
713     		if ((next_skb_size > skb_tailroom(skb)) ||
714     		    ((skb_size + next_skb_size) > mss_now))
715     			return;
716     
717     		/* Ok.  We will be able to collapse the packet. */
718     		__skb_unlink(next_skb, next_skb->list);
719     
720     		if (next_skb->ip_summed == CHECKSUM_HW)
721     			skb->ip_summed = CHECKSUM_HW;
722     
723     		if (skb->ip_summed != CHECKSUM_HW) {
724     			memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
725     			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
726     		}
727     
728     		/* Update sequence range on original skb. */
729     		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
730     
731     		/* Merge over control information. */
732     		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
733     		TCP_SKB_CB(skb)->flags = flags;
734     
735     		/* All done, get rid of second SKB and account for it so
736     		 * packet counting does not break.
737     		 */
738     		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
739     		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
740     			tp->retrans_out--;
741     		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
742     			tp->lost_out--;
743     			tp->left_out--;
744     		}
745     		/* Reno case is special. Sigh... */
746     		if (!tp->sack_ok && tp->sacked_out) {
747     			tp->sacked_out--;
748     			tp->left_out--;
749     		}
750     
751     		/* Not quite right: it can be > snd.fack, but
752     		 * it is better to underestimate fackets.
753     		 */
754     		if (tp->fackets_out)
755     			tp->fackets_out--;
756     		tcp_free_skb(sk, next_skb);
757     		tp->packets_out--;
758     	}
759     }
760     
761     /* Do a simple retransmit without using the backoff mechanisms in
762      * tcp_timer. This is used for path mtu discovery. 
763      * The socket is already locked here.
764      */ 
765     void tcp_simple_retransmit(struct sock *sk)
766     {
767     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
768     	struct sk_buff *skb;
769     	unsigned int mss = tcp_current_mss(sk);
770     	int lost = 0;
771     
772     	for_retrans_queue(skb, sk, tp) {
773     		if (skb->len > mss && 
774     		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
775     			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
776     				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
777     				tp->retrans_out--;
778     			}
779     			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
780     				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
781     				tp->lost_out++;
782     				lost = 1;
783     			}
784     		}
785     	}
786     
787     	if (!lost)
788     		return;
789     
790     	tcp_sync_left_out(tp);
791     
792      	/* Don't muck with the congestion window here.
793     	 * Reason is that we do not increase amount of _data_
794     	 * in network, but units changed and effective
795     	 * cwnd/ssthresh really reduced now.
796     	 */
797     	if (tp->ca_state != TCP_CA_Loss) {
798     		tp->high_seq = tp->snd_nxt;
799     		tp->snd_ssthresh = tcp_current_ssthresh(tp);
800     		tp->prior_ssthresh = 0;
801     		tp->undo_marker = 0;
802     		tp->ca_state = TCP_CA_Loss;
803     	}
804     	tcp_xmit_retransmit_queue(sk);
805     }
806     
807     /* This retransmits one SKB.  Policy decisions and retransmit queue
808      * state updates are done by the caller.  Returns non-zero if an
809      * error occurred which prevented the send.
810      */
811     int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
812     {
813     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
814     	unsigned int cur_mss = tcp_current_mss(sk);
815     	int err;
816     
817     	/* Do not sent more than we queued. 1/4 is reserved for possible
818     	 * copying overhead: frgagmentation, tunneling, mangling etc.
819     	 */
820     	if (atomic_read(&sk->wmem_alloc) > min_t(int, sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
821     		return -EAGAIN;
822     
823     	/* If receiver has shrunk his window, and skb is out of
824     	 * new window, do not retransmit it. The exception is the
825     	 * case, when window is shrunk to zero. In this case
826     	 * our retransmit serves as a zero window probe.
827     	 */
828     	if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
829     	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
830     		return -EAGAIN;
831     
832     	if(skb->len > cur_mss) {
833     		if(tcp_fragment(sk, skb, cur_mss))
834     			return -ENOMEM; /* We'll try again later. */
835     
836     		/* New SKB created, account for it. */
837     		tp->packets_out++;
838     	}
839     
840     	/* Collapse two adjacent packets if worthwhile and we can. */
841     	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
842     	   (skb->len < (cur_mss >> 1)) &&
843     	   (skb->next != tp->send_head) &&
844     	   (skb->next != (struct sk_buff *)&sk->write_queue) &&
845     	   (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
846     	   (sysctl_tcp_retrans_collapse != 0))
847     		tcp_retrans_try_collapse(sk, skb, cur_mss);
848     
849     	if(tp->af_specific->rebuild_header(sk))
850     		return -EHOSTUNREACH; /* Routing failure or similar. */
851     
852     	/* Some Solaris stacks overoptimize and ignore the FIN on a
853     	 * retransmit when old data is attached.  So strip it off
854     	 * since it is cheap to do so and saves bytes on the network.
855     	 */
856     	if(skb->len > 0 &&
857     	   (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
858     	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
859     		if (!pskb_trim(skb, 0)) {
860     			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
861     			skb->ip_summed = CHECKSUM_NONE;
862     			skb->csum = 0;
863     		}
864     	}
865     
866     	/* Make a copy, if the first transmission SKB clone we made
867     	 * is still in somebody's hands, else make a clone.
868     	 */
869     	TCP_SKB_CB(skb)->when = tcp_time_stamp;
870     
871     	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
872     				    pskb_copy(skb, GFP_ATOMIC):
873     				    skb_clone(skb, GFP_ATOMIC)));
874     
875     	if (err == 0) {
876     		/* Update global TCP statistics. */
877     		TCP_INC_STATS(TcpRetransSegs);
878     
879     #if FASTRETRANS_DEBUG > 0
880     		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
881     			if (net_ratelimit())
882     				printk(KERN_DEBUG "retrans_out leaked.\n");
883     		}
884     #endif
885     		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
886     		tp->retrans_out++;
887     
888     		/* Save stamp of the first retransmit. */
889     		if (!tp->retrans_stamp)
890     			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
891     
892     		tp->undo_retrans++;
893     
894     		/* snd_nxt is stored to detect loss of retransmitted segment,
895     		 * see tcp_input.c tcp_sacktag_write_queue().
896     		 */
897     		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
898     	}
899     	return err;
900     }
901     
902     /* This gets called after a retransmit timeout, and the initially
903      * retransmitted data is acknowledged.  It tries to continue
904      * resending the rest of the retransmit queue, until either
905      * we've sent it all or the congestion window limit is reached.
906      * If doing SACK, the first ACK which comes back for a timeout
907      * based retransmit packet might feed us FACK information again.
908      * If so, we use it to avoid unnecessarily retransmissions.
909      */
910     void tcp_xmit_retransmit_queue(struct sock *sk)
911     {
912     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
913     	struct sk_buff *skb;
914     	int packet_cnt = tp->lost_out;
915     
916     	/* First pass: retransmit lost packets. */
917     	if (packet_cnt) {
918     		for_retrans_queue(skb, sk, tp) {
919     			__u8 sacked = TCP_SKB_CB(skb)->sacked;
920     
921     			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
922     				return;
923     
924     			if (sacked&TCPCB_LOST) {
925     				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
926     					if (tcp_retransmit_skb(sk, skb))
927     						return;
928     					if (tp->ca_state != TCP_CA_Loss)
929     						NET_INC_STATS_BH(TCPFastRetrans);
930     					else
931     						NET_INC_STATS_BH(TCPSlowStartRetrans);
932     
933     					if (skb == skb_peek(&sk->write_queue))
934     						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
935     				}
936     
937     				if (--packet_cnt <= 0)
938     					break;
939     			}
940     		}
941     	}
942     
943     	/* OK, demanded retransmission is finished. */
944     
945     	/* Forward retransmissions are possible only during Recovery. */
946     	if (tp->ca_state != TCP_CA_Recovery)
947     		return;
948     
949     	/* No forward retransmissions in Reno are possible. */
950     	if (!tp->sack_ok)
951     		return;
952     
953     	/* Yeah, we have to make difficult choice between forward transmission
954     	 * and retransmission... Both ways have their merits...
955     	 *
956     	 * For now we do not retrnamsit anything, while we have some new
957     	 * segments to send.
958     	 */
959     
960     	if (tcp_may_send_now(sk, tp))
961     		return;
962     
963     	packet_cnt = 0;
964     
965     	for_retrans_queue(skb, sk, tp) {
966     		if(++packet_cnt > tp->fackets_out)
967     			break;
968     
969     		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
970     			break;
971     
972     		if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
973     			continue;
974     
975     		/* Ok, retransmit it. */
976     		if(tcp_retransmit_skb(sk, skb))
977     			break;
978     
979     		if (skb == skb_peek(&sk->write_queue))
980     			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
981     
982     		NET_INC_STATS_BH(TCPForwardRetrans);
983     	}
984     }
985     
986     
987     /* Send a fin.  The caller locks the socket for us.  This cannot be
988      * allowed to fail queueing a FIN frame under any circumstances.
989      */
990     void tcp_send_fin(struct sock *sk)
991     {
992     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);	
993     	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
994     	unsigned int mss_now;
995     	
996     	/* Optimization, tack on the FIN if we have a queue of
997     	 * unsent frames.  But be careful about outgoing SACKS
998     	 * and IP options.
999     	 */
1000     	mss_now = tcp_current_mss(sk); 
1001     
1002     	if(tp->send_head != NULL) {
1003     		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1004     		TCP_SKB_CB(skb)->end_seq++;
1005     		tp->write_seq++;
1006     	} else {
1007     		/* Socket is locked, keep trying until memory is available. */
1008     		for (;;) {
1009     			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1010     			if (skb)
1011     				break;
1012     			current->policy |= SCHED_YIELD;
1013     			schedule();
1014     		}
1015     
1016     		/* Reserve space for headers and prepare control bits. */
1017     		skb_reserve(skb, MAX_TCP_HEADER);
1018     		skb->csum = 0;
1019     		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1020     		TCP_SKB_CB(skb)->sacked = 0;
1021     
1022     		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
1023     		TCP_SKB_CB(skb)->seq = tp->write_seq;
1024     		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1025     		tcp_send_skb(sk, skb, 1, mss_now);
1026     	}
1027     	__tcp_push_pending_frames(sk, tp, mss_now, 1);
1028     }
1029     
1030     /* We get here when a process closes a file descriptor (either due to
1031      * an explicit close() or as a byproduct of exit()'ing) and there
1032      * was unread data in the receive queue.  This behavior is recommended
1033      * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
1034      */
1035     void tcp_send_active_reset(struct sock *sk, int priority)
1036     {
1037     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1038     	struct sk_buff *skb;
1039     
1040     	/* NOTE: No TCP options attached and we never retransmit this. */
1041     	skb = alloc_skb(MAX_TCP_HEADER, priority);
1042     	if (!skb) {
1043     		NET_INC_STATS(TCPAbortFailed);
1044     		return;
1045     	}
1046     
1047     	/* Reserve space for headers and prepare control bits. */
1048     	skb_reserve(skb, MAX_TCP_HEADER);
1049     	skb->csum = 0;
1050     	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1051     	TCP_SKB_CB(skb)->sacked = 0;
1052     
1053     	/* Send it off. */
1054     	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1055     	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1056     	TCP_SKB_CB(skb)->when = tcp_time_stamp;
1057     	if (tcp_transmit_skb(sk, skb))
1058     		NET_INC_STATS(TCPAbortFailed);
1059     }
1060     
1061     /* WARNING: This routine must only be called when we have already sent
1062      * a SYN packet that crossed the incoming SYN that caused this routine
1063      * to get called. If this assumption fails then the initial rcv_wnd
1064      * and rcv_wscale values will not be correct.
1065      */
1066     int tcp_send_synack(struct sock *sk)
1067     {
1068     	struct sk_buff* skb;
1069     
1070     	skb = skb_peek(&sk->write_queue);
1071     	if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1072     		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1073     		return -EFAULT;
1074     	}
1075     	if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1076     		if (skb_cloned(skb)) {
1077     			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1078     			if (nskb == NULL)
1079     				return -ENOMEM;
1080     			__skb_unlink(skb, &sk->write_queue);
1081     			__skb_queue_head(&sk->write_queue, nskb);
1082     			tcp_free_skb(sk, skb);
1083     			tcp_charge_skb(sk, nskb);
1084     			skb = nskb;
1085     		}
1086     
1087     		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1088     		TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
1089     	}
1090     	TCP_SKB_CB(skb)->when = tcp_time_stamp;
1091     	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1092     }
1093     
1094     /*
1095      * Prepare a SYN-ACK.
1096      */
1097     struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1098     				 struct open_request *req)
1099     {
1100     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1101     	struct tcphdr *th;
1102     	int tcp_header_size;
1103     	struct sk_buff *skb;
1104     
1105     	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1106     	if (skb == NULL)
1107     		return NULL;
1108     
1109     	/* Reserve space for headers. */
1110     	skb_reserve(skb, MAX_TCP_HEADER);
1111     
1112     	skb->dst = dst_clone(dst);
1113     
1114     	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1115     			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1116     			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1117     			   /* SACK_PERM is in the place of NOP NOP of TS */
1118     			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1119     	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1120     
1121     	memset(th, 0, sizeof(struct tcphdr));
1122     	th->syn = 1;
1123     	th->ack = 1;
1124     	TCP_ECN_make_synack(req, th);
1125     	th->source = sk->sport;
1126     	th->dest = req->rmt_port;
1127     	TCP_SKB_CB(skb)->seq = req->snt_isn;
1128     	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1129     	th->seq = htonl(TCP_SKB_CB(skb)->seq);
1130     	th->ack_seq = htonl(req->rcv_isn + 1);
1131     	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1132     		__u8 rcv_wscale; 
1133     		/* Set this up on the first call only */
1134     		req->window_clamp = tp->window_clamp ? : dst->window;
1135     		/* tcp_full_space because it is guaranteed to be the first packet */
1136     		tcp_select_initial_window(tcp_full_space(sk), 
1137     			dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1138     			&req->rcv_wnd,
1139     			&req->window_clamp,
1140     			req->wscale_ok,
1141     			&rcv_wscale);
1142     		req->rcv_wscale = rcv_wscale; 
1143     	}
1144     
1145     	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1146     	th->window = htons(req->rcv_wnd);
1147     
1148     	TCP_SKB_CB(skb)->when = tcp_time_stamp;
1149     	tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
1150     			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
1151     			      TCP_SKB_CB(skb)->when,
1152     			      req->ts_recent);
1153     
1154     	skb->csum = 0;
1155     	th->doff = (tcp_header_size >> 2);
1156     	TCP_INC_STATS(TcpOutSegs);
1157     	return skb;
1158     }
1159     
1160     int tcp_connect(struct sock *sk, struct sk_buff *buff)
1161     {
1162     	struct dst_entry *dst = __sk_dst_get(sk);
1163     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1164     
1165     	/* Reserve space for headers. */
1166     	skb_reserve(buff, MAX_TCP_HEADER);
1167     
1168     	/* We'll fix this up when we get a response from the other end.
1169     	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1170     	 */
1171     	tp->tcp_header_len = sizeof(struct tcphdr) +
1172     		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1173     
1174     	/* If user gave his TCP_MAXSEG, record it to clamp */
1175     	if (tp->user_mss)
1176     		tp->mss_clamp = tp->user_mss;
1177     	tp->max_window = 0;
1178     	tcp_sync_mss(sk, dst->pmtu);
1179     
1180     	if (!tp->window_clamp)
1181     		tp->window_clamp = dst->window;
1182     	tp->advmss = dst->advmss;
1183     	tcp_initialize_rcv_mss(sk);
1184     
1185     	tcp_select_initial_window(tcp_full_space(sk),
1186     				  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1187     				  &tp->rcv_wnd,
1188     				  &tp->window_clamp,
1189     				  sysctl_tcp_window_scaling,
1190     				  &tp->rcv_wscale);
1191     
1192     	tp->rcv_ssthresh = tp->rcv_wnd;
1193     
1194     	/* Socket identity change complete, no longer
1195     	 * in TCP_CLOSE, so enter ourselves into the
1196     	 * hash tables.
1197     	 */
1198     	tcp_set_state(sk,TCP_SYN_SENT);
1199     	if (tp->af_specific->hash_connecting(sk))
1200     		goto err_out;
1201     
1202     	sk->err = 0;
1203     	sk->done = 0;
1204     	tp->snd_wnd = 0;
1205     	tcp_init_wl(tp, tp->write_seq, 0);
1206     	tp->snd_una = tp->write_seq;
1207     	tp->snd_sml = tp->write_seq;
1208     	tp->rcv_nxt = 0;
1209     	tp->rcv_wup = 0;
1210     	tp->copied_seq = 0;
1211     
1212     	tp->rto = TCP_TIMEOUT_INIT;
1213     	tp->retransmits = 0;
1214     	tcp_clear_retrans(tp);
1215     
1216     	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1217     	TCP_ECN_send_syn(tp, buff);
1218     	TCP_SKB_CB(buff)->sacked = 0;
1219     	buff->csum = 0;
1220     	TCP_SKB_CB(buff)->seq = tp->write_seq++;
1221     	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1222     	tp->snd_nxt = tp->write_seq;
1223     	tp->pushed_seq = tp->write_seq;
1224     
1225     	/* Send it off. */
1226     	TCP_SKB_CB(buff)->when = tcp_time_stamp;
1227     	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1228     	__skb_queue_tail(&sk->write_queue, buff);
1229     	tcp_charge_skb(sk, buff);
1230     	tp->packets_out++;
1231     	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1232     	TCP_INC_STATS(TcpActiveOpens);
1233     
1234     	/* Timer for repeating the SYN until an answer. */
1235     	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1236     	return 0;
1237     
1238     err_out:
1239     	tcp_set_state(sk,TCP_CLOSE);
1240     	kfree_skb(buff);
1241     	return -EADDRNOTAVAIL;
1242     }
1243     
1244     /* Send out a delayed ack, the caller does the policy checking
1245      * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
1246      * for details.
1247      */
1248     void tcp_send_delayed_ack(struct sock *sk)
1249     {
1250     	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1251     	int ato = tp->ack.ato;
1252     	unsigned long timeout;
1253     
1254     	if (ato > TCP_DELACK_MIN) {
1255     		int max_ato = HZ/2;
1256     
1257     		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1258     			max_ato = TCP_DELACK_MAX;
1259     
1260     		/* Slow path, intersegment interval is "high". */
1261     
1262     		/* If some rtt estimate is known, use it to bound delayed ack.
1263     		 * Do not use tp->rto here, use results of rtt measurements
1264     		 * directly.
1265     		 */
1266     		if (tp->srtt) {
1267     			int rtt = max_t(unsigned int, tp->srtt>>3, TCP_DELACK_MIN);
1268     
1269     			if (rtt < max_ato)
1270     				max_ato = rtt;
1271     		}
1272     
1273     		ato = min_t(int, ato, max_ato);
1274     	}
1275     
1276     	/* Stay within the limit we were given */
1277     	timeout = jiffies + ato;
1278     
1279     	/* Use new timeout only if there wasn't a older one earlier. */
1280     	if (tp->ack.pending&TCP_ACK_TIMER) {
1281     		/* If delack timer was blocked or is about to expire,
1282     		 * send ACK now.
1283     		 */
1284     		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1285     			tcp_send_ack(sk);
1286     			return;
1287     		}
1288     
1289     		if (!time_before(timeout, tp->ack.timeout))
1290     			timeout = tp->ack.timeout;
1291     	}
1292     	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1293     	tp->ack.timeout = timeout;
1294     	if (!mod_timer(&tp->delack_timer, timeout))
1295     		sock_hold(sk);
1296     }
1297     
1298     /* This routine sends an ack and also updates the window. */
1299     void tcp_send_ack(struct sock *sk)
1300     {
1301     	/* If we have been reset, we may not send again. */
1302     	if(sk->state != TCP_CLOSE) {
1303     		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1304     		struct sk_buff *buff;
1305     
1306     		/* We are not putting this on the write queue, so
1307     		 * tcp_transmit_skb() will set the ownership to this
1308     		 * sock.
1309     		 */
1310     		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1311     		if (buff == NULL) {
1312     			tcp_schedule_ack(tp);
1313     			tp->ack.ato = TCP_ATO_MIN;
1314     			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1315     			return;
1316     		}
1317     
1318     		/* Reserve space for headers and prepare control bits. */
1319     		skb_reserve(buff, MAX_TCP_HEADER);
1320     		buff->csum = 0;
1321     		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1322     		TCP_SKB_CB(buff)->sacked = 0;
1323     
1324     		/* Send it off, this clears delayed acks for us. */
1325     		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1326     		TCP_SKB_CB(buff)->when = tcp_time_stamp;
1327     		tcp_transmit_skb(sk, buff);
1328     	}
1329     }
1330     
1331     /* This routine sends a packet with an out of date sequence
1332      * number. It assumes the other end will try to ack it.
1333      *
1334      * Question: what should we make while urgent mode?
1335      * 4.4BSD forces sending single byte of data. We cannot send
1336      * out of window data, because we have SND.NXT==SND.MAX...
1337      *
1338      * Current solution: to send TWO zero-length segments in urgent mode:
1339      * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
1340      * out-of-date with SND.UNA-1 to probe window.
1341      */
1342     static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1343     {
1344     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1345     	struct sk_buff *skb;
1346     
1347     	/* We don't queue it, tcp_transmit_skb() sets ownership. */
1348     	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1349     	if (skb == NULL) 
1350     		return -1;
1351     
1352     	/* Reserve space for headers and set control bits. */
1353     	skb_reserve(skb, MAX_TCP_HEADER);
1354     	skb->csum = 0;
1355     	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1356     	TCP_SKB_CB(skb)->sacked = urgent;
1357     
1358     	/* Use a previous sequence.  This should cause the other
1359     	 * end to send an ack.  Don't queue or clone SKB, just
1360     	 * send it.
1361     	 */
1362     	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1363     	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1364     	TCP_SKB_CB(skb)->when = tcp_time_stamp;
1365     	return tcp_transmit_skb(sk, skb);
1366     }
1367     
1368     int tcp_write_wakeup(struct sock *sk)
1369     {
1370     	if (sk->state != TCP_CLOSE) {
1371     		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1372     		struct sk_buff *skb;
1373     
1374     		if ((skb = tp->send_head) != NULL &&
1375     		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1376     			int err;
1377     			int mss = tcp_current_mss(sk);
1378     			int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1379     
1380     			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1381     				tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1382     
1383     			/* We are probing the opening of a window
1384     			 * but the window size is != 0
1385     			 * must have been a result SWS avoidance ( sender )
1386     			 */
1387     			if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1388     			    skb->len > mss) {
1389     				seg_size = min_t(int, seg_size, mss);
1390     				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1391     				if (tcp_fragment(sk, skb, seg_size))
1392     					return -1;
1393     			}
1394     			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1395     			TCP_SKB_CB(skb)->when = tcp_time_stamp;
1396     			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1397     			if (!err) {
1398     				update_send_head(sk, tp, skb);
1399     			}
1400     			return err;
1401     		} else {
1402     			if (tp->urg_mode &&
1403     			    between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1404     				tcp_xmit_probe_skb(sk, TCPCB_URG);
1405     			return tcp_xmit_probe_skb(sk, 0);
1406     		}
1407     	}
1408     	return -1;
1409     }
1410     
1411     /* A window probe timeout has occurred.  If window is not closed send
1412      * a partial packet else a zero probe.
1413      */
1414     void tcp_send_probe0(struct sock *sk)
1415     {
1416     	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1417     	int err;
1418     
1419     	err = tcp_write_wakeup(sk);
1420     
1421     	if (tp->packets_out || !tp->send_head) {
1422     		/* Cancel probe timer, if it is not required. */
1423     		tp->probes_out = 0;
1424     		tp->backoff = 0;
1425     		return;
1426     	}
1427     
1428     	if (err <= 0) {
1429     		tp->backoff++;
1430     		tp->probes_out++;
1431     		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
1432     				      min_t(u32, tp->rto << tp->backoff, TCP_RTO_MAX));
1433     	} else {
1434     		/* If packet was not sent due to local congestion,
1435     		 * do not backoff and do not remember probes_out.
1436     		 * Let local senders to fight for local resources.
1437     		 *
1438     		 * Use accumulated backoff yet.
1439     		 */
1440     		if (!tp->probes_out)
1441     			tp->probes_out=1;
1442     		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
1443     				      min_t(unsigned int, tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
1444     	}
1445     }
1446