File: /usr/src/linux/net/ipv4/tcp_output.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_output.c,v 1.141 2001/09/18 22:29:10 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23 /*
24 * Changes: Pedro Roque : Retransmit queue handled by TCP.
25 * : Fragmentation on mtu decrease
26 * : Segment collapse on retransmit
27 * : AF independence
28 *
29 * Linus Torvalds : send_delayed_ack
30 * David S. Miller : Charge memory using the right skb
31 * during syn/ack processing.
32 * David S. Miller : Output engine completely rewritten.
33 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
34 * Cacophonix Gaul : draft-minshall-nagle-01
35 * J Hadi Salim : ECN support
36 *
37 */
38
39 #include <net/tcp.h>
40
41 #include <linux/smp_lock.h>
42
43 /* People can turn this off for buggy TCP's found in printers etc. */
/* Non-zero (the default) lets tcp_retransmit_skb() try to merge a small
 * segment with its successor before retransmitting it; zero disables
 * that attempt.
 */
int sysctl_tcp_retrans_collapse = 1;
45
46 static __inline__
47 void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
48 {
49 tp->send_head = skb->next;
50 if (tp->send_head == (struct sk_buff *) &sk->write_queue)
51 tp->send_head = NULL;
52 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
53 if (tp->packets_out++ == 0)
54 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
55 }
56
57 /* SND.NXT, if window was not shrunk.
58 * If window has been shrunk, what should we make? It is not clear at all.
59 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
60 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
61 * invalid. OK, let's make this for now:
62 */
63 static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
64 {
65 if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
66 return tp->snd_nxt;
67 else
68 return tp->snd_una+tp->snd_wnd;
69 }
70
71 /* Calculate mss to advertise in SYN segment.
72 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
73 *
74 * 1. It is independent of path mtu.
75 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
76 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
77 * attached devices, because some buggy hosts are confused by
78 * large MSS.
79 * 4. We do not make 3, we advertise MSS, calculated from first
80 * hop device mtu, but allow to raise it to ip_rt_min_advmss.
81 * This may be overridden via information stored in the routing table.
82 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
83 * probably even Jumbo".
84 */
85 static __u16 tcp_advertise_mss(struct sock *sk)
86 {
87 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
88 struct dst_entry *dst = __sk_dst_get(sk);
89 int mss = tp->advmss;
90
91 if (dst && dst->advmss < mss) {
92 mss = dst->advmss;
93 tp->advmss = mss;
94 }
95
96 return (__u16)mss;
97 }
98
99 /* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
100 * This is the first part of cwnd validation mechanism. */
101 static void tcp_cwnd_restart(struct tcp_opt *tp)
102 {
103 s32 delta = tcp_time_stamp - tp->lsndtime;
104 u32 restart_cwnd = tcp_init_cwnd(tp);
105 u32 cwnd = tp->snd_cwnd;
106
107 tp->snd_ssthresh = tcp_current_ssthresh(tp);
108 restart_cwnd = min_t(u32, restart_cwnd, cwnd);
109
110 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
111 cwnd >>= 1;
112 tp->snd_cwnd = max_t(u32, cwnd, restart_cwnd);
113 tp->snd_cwnd_stamp = tcp_time_stamp;
114 tp->snd_cwnd_used = 0;
115 }
116
117 static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
118 {
119 u32 now = tcp_time_stamp;
120
121 if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
122 tcp_cwnd_restart(tp);
123
124 tp->lsndtime = now;
125
126 /* If it is a reply for ato after last received
127 * packet, enter pingpong mode.
128 */
129 if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
130 tp->ack.pingpong = 1;
131 }
132
133 static __inline__ void tcp_event_ack_sent(struct sock *sk)
134 {
135 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
136
137 tcp_dec_quickack_mode(tp);
138 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
139 }
140
141 /* Choose a new window to advertise, update state in tcp_opt for the
142 * socket, and return result with RFC1323 scaling applied. The return
143 * value can be stuffed directly into th->window for an outgoing
144 * frame.
145 */
146 static __inline__ u16 tcp_select_window(struct sock *sk)
147 {
148 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
149 u32 cur_win = tcp_receive_window(tp);
150 u32 new_win = __tcp_select_window(sk);
151
152 /* Never shrink the offered window */
153 if(new_win < cur_win) {
154 /* Danger Will Robinson!
155 * Don't update rcv_wup/rcv_wnd here or else
156 * we will not be able to advertise a zero
157 * window in time. --DaveM
158 *
159 * Relax Will Robinson.
160 */
161 new_win = cur_win;
162 }
163 tp->rcv_wnd = new_win;
164 tp->rcv_wup = tp->rcv_nxt;
165
166 /* RFC1323 scaling applied */
167 new_win >>= tp->rcv_wscale;
168
169 /* If we advertise zero window, disable fast path. */
170 if (new_win == 0)
171 tp->pred_flags = 0;
172
173 return new_win;
174 }
175
176
177 /* This routine actually transmits TCP packets queued in by
178 * tcp_do_sendmsg(). This is used by both the initial
179 * transmission and possible later retransmissions.
180 * All SKB's seen here are completely headerless. It is our
181 * job to build the TCP header, and pass the packet down to
182 * IP so it can do the same plus pass the packet off to the
183 * device.
184 *
185 * We are working here with either a clone of the original
186 * SKB, or a fresh unique copy made by the retransmit engine.
187 */
188 int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
189 {
190 if(skb != NULL) {
191 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
192 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
193 int tcp_header_size = tp->tcp_header_len;
194 struct tcphdr *th;
195 int sysctl_flags;
196 int err;
197
198 #define SYSCTL_FLAG_TSTAMPS 0x1
199 #define SYSCTL_FLAG_WSCALE 0x2
200 #define SYSCTL_FLAG_SACK 0x4
201
202 sysctl_flags = 0;
203 if (tcb->flags & TCPCB_FLAG_SYN) {
204 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
205 if(sysctl_tcp_timestamps) {
206 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
207 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
208 }
209 if(sysctl_tcp_window_scaling) {
210 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
211 sysctl_flags |= SYSCTL_FLAG_WSCALE;
212 }
213 if(sysctl_tcp_sack) {
214 sysctl_flags |= SYSCTL_FLAG_SACK;
215 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
216 tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
217 }
218 } else if (tp->eff_sacks) {
219 /* A SACK is 2 pad bytes, a 2 byte header, plus
220 * 2 32-bit sequence numbers for each SACK block.
221 */
222 tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
223 (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
224 }
225 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
226 skb->h.th = th;
227 skb_set_owner_w(skb, sk);
228
229 /* Build TCP header and checksum it. */
230 th->source = sk->sport;
231 th->dest = sk->dport;
232 th->seq = htonl(tcb->seq);
233 th->ack_seq = htonl(tp->rcv_nxt);
234 *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
235 if (tcb->flags & TCPCB_FLAG_SYN) {
236 /* RFC1323: The window in SYN & SYN/ACK segments
237 * is never scaled.
238 */
239 th->window = htons(tp->rcv_wnd);
240 } else {
241 th->window = htons(tcp_select_window(sk));
242 }
243 th->check = 0;
244 th->urg_ptr = 0;
245
246 if (tp->urg_mode &&
247 between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
248 th->urg_ptr = htons(tp->snd_up-tcb->seq);
249 th->urg = 1;
250 }
251
252 if (tcb->flags & TCPCB_FLAG_SYN) {
253 tcp_syn_build_options((__u32 *)(th + 1),
254 tcp_advertise_mss(sk),
255 (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
256 (sysctl_flags & SYSCTL_FLAG_SACK),
257 (sysctl_flags & SYSCTL_FLAG_WSCALE),
258 tp->rcv_wscale,
259 tcb->when,
260 tp->ts_recent);
261 } else {
262 tcp_build_and_update_options((__u32 *)(th + 1),
263 tp, tcb->when);
264
265 TCP_ECN_send(sk, tp, skb, tcp_header_size);
266 }
267 tp->af_specific->send_check(sk, th, skb->len, skb);
268
269 if (tcb->flags & TCPCB_FLAG_ACK)
270 tcp_event_ack_sent(sk);
271
272 if (skb->len != tcp_header_size)
273 tcp_event_data_sent(tp, skb);
274
275 TCP_INC_STATS(TcpOutSegs);
276
277 err = tp->af_specific->queue_xmit(skb);
278 if (err <= 0)
279 return err;
280
281 tcp_enter_cwr(tp);
282
283 /* NET_XMIT_CN is special. It does not guarantee,
284 * that this packet is lost. It tells that device
285 * is about to start to drop packets or already
286 * drops some packets of the same priority and
287 * invokes us to send less aggressively.
288 */
289 return err == NET_XMIT_CN ? 0 : err;
290 }
291 return -ENOBUFS;
292 #undef SYSCTL_FLAG_TSTAMPS
293 #undef SYSCTL_FLAG_WSCALE
294 #undef SYSCTL_FLAG_SACK
295 }
296
297
298 /* This is the main buffer sending routine. We append the buffer to the
299 * write queue and then decide whether to transmit it now or leave it queued.
300 *
301 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
302 * otherwise socket can stall.
303 */
304 void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
305 {
306 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
307
308 /* Advance write_seq and place onto the write_queue. */
309 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
310 __skb_queue_tail(&sk->write_queue, skb);
311 tcp_charge_skb(sk, skb);
312
313 if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
314 /* Send it out now. */
315 TCP_SKB_CB(skb)->when = tcp_time_stamp;
316 if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
317 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
318 tcp_minshall_update(tp, cur_mss, skb);
319 if (tp->packets_out++ == 0)
320 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
321 return;
322 }
323 }
324 /* Queue it, remembering where we must start sending. */
325 if (tp->send_head == NULL)
326 tp->send_head = skb;
327 }
328
329 /* Send _single_ skb sitting at the send head. This function requires
330 * true push pending frames to setup probe timer etc.
331 */
332 void tcp_push_one(struct sock *sk, unsigned cur_mss)
333 {
334 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
335 struct sk_buff *skb = tp->send_head;
336
337 if (tcp_snd_test(tp, skb, cur_mss, 1)) {
338 /* Send it out now. */
339 TCP_SKB_CB(skb)->when = tcp_time_stamp;
340 if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
341 tp->send_head = NULL;
342 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
343 if (tp->packets_out++ == 0)
344 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
345 return;
346 }
347 }
348 }
349
350 /* Split fragmented skb to two parts at length len. */
351
352 static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
353 {
354 int i;
355 int pos = skb->len - skb->data_len;
356
357 if (len < pos) {
358 /* Split line is inside header. */
359 memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);
360
361 /* And move data appendix as is. */
362 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
363 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
364
365 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
366 skb_shinfo(skb)->nr_frags = 0;
367
368 skb1->data_len = skb->data_len;
369 skb1->len += skb1->data_len;
370 skb->data_len = 0;
371 skb->len = len;
372 skb->tail = skb->data+len;
373 } else {
374 int k = 0;
375 int nfrags = skb_shinfo(skb)->nr_frags;
376
377 /* Second chunk has no header, nothing to copy. */
378
379 skb_shinfo(skb)->nr_frags = 0;
380 skb1->len = skb1->data_len = skb->len - len;
381 skb->len = len;
382 skb->data_len = len - pos;
383
384 for (i=0; i<nfrags; i++) {
385 int size = skb_shinfo(skb)->frags[i].size;
386 if (pos + size > len) {
387 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
388
389 if (pos < len) {
390 /* Split frag.
391 * We have to variants in this case:
392 * 1. Move all the frag to the second
393 * part, if it is possible. F.e.
394 * this approach is mandatory for TUX,
395 * where splitting is expensive.
396 * 2. Split is accurately. We make this.
397 */
398 get_page(skb_shinfo(skb)->frags[i].page);
399 skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
400 skb_shinfo(skb1)->frags[0].size -= (len-pos);
401 skb_shinfo(skb)->frags[i].size = len-pos;
402 skb_shinfo(skb)->nr_frags++;
403 }
404 k++;
405 } else {
406 skb_shinfo(skb)->nr_frags++;
407 }
408 pos += size;
409 }
410 skb_shinfo(skb1)->nr_frags = k;
411 }
412 }
413
414 /* Function to create two new TCP segments. Shrinks the given segment
415 * to the specified size and appends a new segment with the rest of the
416 * packet to the list. This won't be called frequently, I hope.
417 * Remember, these are still headerless SKBs at this point.
418 */
419 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
420 {
421 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
422 struct sk_buff *buff;
423 int nsize = skb->len - len;
424 u16 flags;
425
426 if (skb_cloned(skb) &&
427 skb_is_nonlinear(skb) &&
428 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
429 return -ENOMEM;
430
431 /* Get a new skb... force flag on. */
432 buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
433 if (buff == NULL)
434 return -ENOMEM; /* We'll just try again later. */
435 tcp_charge_skb(sk, buff);
436
437 /* Correct the sequence numbers. */
438 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
439 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
440 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
441
442 /* PSH and FIN should only be set in the second packet. */
443 flags = TCP_SKB_CB(skb)->flags;
444 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
445 TCP_SKB_CB(buff)->flags = flags;
446 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
447 if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
448 tp->lost_out++;
449 tp->left_out++;
450 }
451 TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
452
453 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
454 /* Copy and checksum data tail into the new buffer. */
455 buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
456 nsize, 0);
457
458 skb_trim(skb, len);
459
460 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
461 } else {
462 skb->ip_summed = CHECKSUM_HW;
463 skb_split(skb, buff, len);
464 }
465
466 buff->ip_summed = skb->ip_summed;
467
468 /* Looks stupid, but our code really uses when of
469 * skbs, which it never sent before. --ANK
470 */
471 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
472
473 /* Link BUFF into the send queue. */
474 __skb_append(skb, buff);
475
476 return 0;
477 }
478
479 /* This function synchronizes snd mss to the current pmtu/exthdr set.
480
481 tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT account
482 for TCP options, but includes only the bare TCP header.
483
484 tp->mss_clamp is mss negotiated at connection setup.
485 It is the minimum of user_mss and the mss received with SYN.
486 It also does not include TCP options.
487
488 tp->pmtu_cookie is last pmtu, seen by this function.
489
490 tp->mss_cache is current effective sending mss, including
491 all tcp options except for SACKs. It is evaluated,
492 taking into account current pmtu, but never exceeds
493 tp->mss_clamp.
494
495 NOTE1. rfc1122 clearly states that advertised MSS
496 DOES NOT include either tcp or ip options.
497
498 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
499 this function. --ANK (980731)
500 */
501
502 int tcp_sync_mss(struct sock *sk, u32 pmtu)
503 {
504 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
505 int mss_now;
506
507 /* Calculate base mss without TCP options:
508 It is MMS_S - sizeof(tcphdr) of rfc1122
509 */
510
511 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
512
513 /* Clamp it (mss_clamp does not include tcp options) */
514 if (mss_now > tp->mss_clamp)
515 mss_now = tp->mss_clamp;
516
517 /* Now subtract optional transport overhead */
518 mss_now -= tp->ext_header_len;
519
520 /* Then reserve room for full set of TCP options and 8 bytes of data */
521 if (mss_now < 48)
522 mss_now = 48;
523
524 /* Now subtract TCP options size, not including SACKs */
525 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
526
527 /* Bound mss with half of window */
528 if (tp->max_window && mss_now > (tp->max_window>>1))
529 mss_now = max_t(u32, (tp->max_window>>1), 68 - tp->tcp_header_len);
530
531 /* And store cached results */
532 tp->pmtu_cookie = pmtu;
533 tp->mss_cache = mss_now;
534 return mss_now;
535 }
536
537
538 /* This routine writes packets to the network. It advances the
539 * send_head. This happens as incoming acks open up the remote
540 * window for us.
541 *
542 * Returns 1, if no segments are in flight and we have queued segments, but
543 * cannot send anything now because of SWS or another problem.
544 */
545 int tcp_write_xmit(struct sock *sk, int nonagle)
546 {
547 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
548 unsigned int mss_now;
549
550 /* If we are closed, the bytes will have to remain here.
551 * In time closedown will finish, we empty the write queue and all
552 * will be happy.
553 */
554 if(sk->state != TCP_CLOSE) {
555 struct sk_buff *skb;
556 int sent_pkts = 0;
557
558 /* Account for SACKS, we may need to fragment due to this.
559 * It is just like the real MSS changing on us midstream.
560 * We also handle things correctly when the user adds some
561 * IP options mid-stream. Silly to do, but cover it.
562 */
563 mss_now = tcp_current_mss(sk);
564
565 while((skb = tp->send_head) &&
566 tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) {
567 if (skb->len > mss_now) {
568 if (tcp_fragment(sk, skb, mss_now))
569 break;
570 }
571
572 TCP_SKB_CB(skb)->when = tcp_time_stamp;
573 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
574 break;
575 /* Advance the send_head. This one is sent out. */
576 update_send_head(sk, tp, skb);
577 tcp_minshall_update(tp, mss_now, skb);
578 sent_pkts = 1;
579 }
580
581 if (sent_pkts) {
582 tcp_cwnd_validate(sk, tp);
583 return 0;
584 }
585
586 return !tp->packets_out && tp->send_head;
587 }
588 return 0;
589 }
590
591 /* This function returns the amount that we can raise the
592 * usable window based on the following constraints
593 *
594 * 1. The window can never be shrunk once it is offered (RFC 793)
595 * 2. We limit memory per socket
596 *
597 * RFC 1122:
598 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
599 * RECV.NEXT + RCV.WIN fixed until:
600 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
601 *
602 * i.e. don't raise the right edge of the window until you can raise
603 * it at least MSS bytes.
604 *
605 * Unfortunately, the recommended algorithm breaks header prediction,
606 * since header prediction assumes th->window stays fixed.
607 *
608 * Strictly speaking, keeping th->window fixed violates the receiver
609 * side SWS prevention criteria. The problem is that under this rule
610 * a stream of single byte packets will cause the right side of the
611 * window to always advance by a single byte.
612 *
613 * Of course, if the sender implements sender side SWS prevention
614 * then this will not be a problem.
615 *
616 * BSD seems to make the following compromise:
617 *
618 * If the free space is less than the 1/4 of the maximum
619 * space available and the free space is less than 1/2 mss,
620 * then set the window to 0.
621 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
622 * Otherwise, just prevent the window from shrinking
623 * and from being larger than the largest representable value.
624 *
625 * This prevents incremental opening of the window in the regime
626 * where TCP is limited by the speed of the reader side taking
627 * data out of the TCP receive queue. It does nothing about
628 * those cases where the window is constrained on the sender side
629 * because the pipeline is full.
630 *
631 * BSD also seems to "accidentally" limit itself to windows that are a
632 * multiple of MSS, at least until the free space gets quite small.
633 * This would appear to be a side effect of the mbuf implementation.
634 * Combining these two algorithms results in the observed behavior
635 * of having a fixed window size at almost all times.
636 *
637 * Below we obtain similar behavior by forcing the offered window to
638 * a multiple of the mss when it is feasible to do so.
639 *
640 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
641 * Regular options like TIMESTAMP are taken into account.
642 */
643 u32 __tcp_select_window(struct sock *sk)
644 {
645 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
646 /* MSS for the peer's data. Previous verions used mss_clamp
647 * here. I don't know if the value based on our guesses
648 * of peer's MSS is better for the performance. It's more correct
649 * but may be worse for the performance because of rcv_mss
650 * fluctuations. --SAW 1998/11/1
651 */
652 int mss = tp->ack.rcv_mss;
653 int free_space = tcp_space(sk);
654 int full_space = min_t(unsigned int, tp->window_clamp, tcp_full_space(sk));
655 int window;
656
657 if (mss > full_space)
658 mss = full_space;
659
660 if (free_space < full_space/2) {
661 tp->ack.quick = 0;
662
663 if (tcp_memory_pressure)
664 tp->rcv_ssthresh = min_t(u32, tp->rcv_ssthresh, 4*tp->advmss);
665
666 if (free_space < mss)
667 return 0;
668 }
669
670 if (free_space > tp->rcv_ssthresh)
671 free_space = tp->rcv_ssthresh;
672
673 /* Get the largest window that is a nice multiple of mss.
674 * Window clamp already applied above.
675 * If our current window offering is within 1 mss of the
676 * free space we just keep it. This prevents the divide
677 * and multiply from happening most of the time.
678 * We also don't do any window rounding when the free space
679 * is too small.
680 */
681 window = tp->rcv_wnd;
682 if (window <= free_space - mss || window > free_space)
683 window = (free_space/mss)*mss;
684
685 return window;
686 }
687
688 /* Attempt to collapse two adjacent SKB's during retransmission. */
689 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
690 {
691 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
692 struct sk_buff *next_skb = skb->next;
693
694 /* The first test we must make is that neither of these two
695 * SKB's are still referenced by someone else.
696 */
697 if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
698 int skb_size = skb->len, next_skb_size = next_skb->len;
699 u16 flags = TCP_SKB_CB(skb)->flags;
700
701 /* Also punt if next skb has been SACK'd. */
702 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
703 return;
704
705 /* Next skb is out of window. */
706 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
707 return;
708
709 /* Punt if not enough space exists in the first SKB for
710 * the data in the second, or the total combined payload
711 * would exceed the MSS.
712 */
713 if ((next_skb_size > skb_tailroom(skb)) ||
714 ((skb_size + next_skb_size) > mss_now))
715 return;
716
717 /* Ok. We will be able to collapse the packet. */
718 __skb_unlink(next_skb, next_skb->list);
719
720 if (next_skb->ip_summed == CHECKSUM_HW)
721 skb->ip_summed = CHECKSUM_HW;
722
723 if (skb->ip_summed != CHECKSUM_HW) {
724 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
725 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
726 }
727
728 /* Update sequence range on original skb. */
729 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
730
731 /* Merge over control information. */
732 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
733 TCP_SKB_CB(skb)->flags = flags;
734
735 /* All done, get rid of second SKB and account for it so
736 * packet counting does not break.
737 */
738 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
739 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
740 tp->retrans_out--;
741 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
742 tp->lost_out--;
743 tp->left_out--;
744 }
745 /* Reno case is special. Sigh... */
746 if (!tp->sack_ok && tp->sacked_out) {
747 tp->sacked_out--;
748 tp->left_out--;
749 }
750
751 /* Not quite right: it can be > snd.fack, but
752 * it is better to underestimate fackets.
753 */
754 if (tp->fackets_out)
755 tp->fackets_out--;
756 tcp_free_skb(sk, next_skb);
757 tp->packets_out--;
758 }
759 }
760
761 /* Do a simple retransmit without using the backoff mechanisms in
762 * tcp_timer. This is used for path mtu discovery.
763 * The socket is already locked here.
764 */
765 void tcp_simple_retransmit(struct sock *sk)
766 {
767 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
768 struct sk_buff *skb;
769 unsigned int mss = tcp_current_mss(sk);
770 int lost = 0;
771
772 for_retrans_queue(skb, sk, tp) {
773 if (skb->len > mss &&
774 !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
775 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
776 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
777 tp->retrans_out--;
778 }
779 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
780 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
781 tp->lost_out++;
782 lost = 1;
783 }
784 }
785 }
786
787 if (!lost)
788 return;
789
790 tcp_sync_left_out(tp);
791
792 /* Don't muck with the congestion window here.
793 * Reason is that we do not increase amount of _data_
794 * in network, but units changed and effective
795 * cwnd/ssthresh really reduced now.
796 */
797 if (tp->ca_state != TCP_CA_Loss) {
798 tp->high_seq = tp->snd_nxt;
799 tp->snd_ssthresh = tcp_current_ssthresh(tp);
800 tp->prior_ssthresh = 0;
801 tp->undo_marker = 0;
802 tp->ca_state = TCP_CA_Loss;
803 }
804 tcp_xmit_retransmit_queue(sk);
805 }
806
807 /* This retransmits one SKB. Policy decisions and retransmit queue
808 * state updates are done by the caller. Returns non-zero if an
809 * error occurred which prevented the send.
810 */
811 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
812 {
813 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
814 unsigned int cur_mss = tcp_current_mss(sk);
815 int err;
816
817 /* Do not sent more than we queued. 1/4 is reserved for possible
818 * copying overhead: frgagmentation, tunneling, mangling etc.
819 */
820 if (atomic_read(&sk->wmem_alloc) > min_t(int, sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
821 return -EAGAIN;
822
823 /* If receiver has shrunk his window, and skb is out of
824 * new window, do not retransmit it. The exception is the
825 * case, when window is shrunk to zero. In this case
826 * our retransmit serves as a zero window probe.
827 */
828 if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
829 && TCP_SKB_CB(skb)->seq != tp->snd_una)
830 return -EAGAIN;
831
832 if(skb->len > cur_mss) {
833 if(tcp_fragment(sk, skb, cur_mss))
834 return -ENOMEM; /* We'll try again later. */
835
836 /* New SKB created, account for it. */
837 tp->packets_out++;
838 }
839
840 /* Collapse two adjacent packets if worthwhile and we can. */
841 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
842 (skb->len < (cur_mss >> 1)) &&
843 (skb->next != tp->send_head) &&
844 (skb->next != (struct sk_buff *)&sk->write_queue) &&
845 (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
846 (sysctl_tcp_retrans_collapse != 0))
847 tcp_retrans_try_collapse(sk, skb, cur_mss);
848
849 if(tp->af_specific->rebuild_header(sk))
850 return -EHOSTUNREACH; /* Routing failure or similar. */
851
852 /* Some Solaris stacks overoptimize and ignore the FIN on a
853 * retransmit when old data is attached. So strip it off
854 * since it is cheap to do so and saves bytes on the network.
855 */
856 if(skb->len > 0 &&
857 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
858 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
859 if (!pskb_trim(skb, 0)) {
860 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
861 skb->ip_summed = CHECKSUM_NONE;
862 skb->csum = 0;
863 }
864 }
865
866 /* Make a copy, if the first transmission SKB clone we made
867 * is still in somebody's hands, else make a clone.
868 */
869 TCP_SKB_CB(skb)->when = tcp_time_stamp;
870
871 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
872 pskb_copy(skb, GFP_ATOMIC):
873 skb_clone(skb, GFP_ATOMIC)));
874
875 if (err == 0) {
876 /* Update global TCP statistics. */
877 TCP_INC_STATS(TcpRetransSegs);
878
879 #if FASTRETRANS_DEBUG > 0
880 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
881 if (net_ratelimit())
882 printk(KERN_DEBUG "retrans_out leaked.\n");
883 }
884 #endif
885 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
886 tp->retrans_out++;
887
888 /* Save stamp of the first retransmit. */
889 if (!tp->retrans_stamp)
890 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
891
892 tp->undo_retrans++;
893
894 /* snd_nxt is stored to detect loss of retransmitted segment,
895 * see tcp_input.c tcp_sacktag_write_queue().
896 */
897 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
898 }
899 return err;
900 }
901
902 /* This gets called after a retransmit timeout, and the initially
903 * retransmitted data is acknowledged. It tries to continue
904 * resending the rest of the retransmit queue, until either
905 * we've sent it all or the congestion window limit is reached.
906 * If doing SACK, the first ACK which comes back for a timeout
907 * based retransmit packet might feed us FACK information again.
908 * If so, we use it to avoid unnecessarily retransmissions.
909 */
910 void tcp_xmit_retransmit_queue(struct sock *sk)
911 {
912 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
913 struct sk_buff *skb;
914 int packet_cnt = tp->lost_out;
915
916 /* First pass: retransmit lost packets. */
917 if (packet_cnt) {
918 for_retrans_queue(skb, sk, tp) {
919 __u8 sacked = TCP_SKB_CB(skb)->sacked;
920
921 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
922 return;
923
924 if (sacked&TCPCB_LOST) {
925 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
926 if (tcp_retransmit_skb(sk, skb))
927 return;
928 if (tp->ca_state != TCP_CA_Loss)
929 NET_INC_STATS_BH(TCPFastRetrans);
930 else
931 NET_INC_STATS_BH(TCPSlowStartRetrans);
932
933 if (skb == skb_peek(&sk->write_queue))
934 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
935 }
936
937 if (--packet_cnt <= 0)
938 break;
939 }
940 }
941 }
942
943 /* OK, demanded retransmission is finished. */
944
945 /* Forward retransmissions are possible only during Recovery. */
946 if (tp->ca_state != TCP_CA_Recovery)
947 return;
948
949 /* No forward retransmissions in Reno are possible. */
950 if (!tp->sack_ok)
951 return;
952
953 /* Yeah, we have to make difficult choice between forward transmission
954 * and retransmission... Both ways have their merits...
955 *
956 * For now we do not retrnamsit anything, while we have some new
957 * segments to send.
958 */
959
960 if (tcp_may_send_now(sk, tp))
961 return;
962
963 packet_cnt = 0;
964
965 for_retrans_queue(skb, sk, tp) {
966 if(++packet_cnt > tp->fackets_out)
967 break;
968
969 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
970 break;
971
972 if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
973 continue;
974
975 /* Ok, retransmit it. */
976 if(tcp_retransmit_skb(sk, skb))
977 break;
978
979 if (skb == skb_peek(&sk->write_queue))
980 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
981
982 NET_INC_STATS_BH(TCPForwardRetrans);
983 }
984 }
985
986
987 /* Send a fin. The caller locks the socket for us. This cannot be
988 * allowed to fail queueing a FIN frame under any circumstances.
989 */
990 void tcp_send_fin(struct sock *sk)
991 {
992 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
993 struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
994 unsigned int mss_now;
995
996 /* Optimization, tack on the FIN if we have a queue of
997 * unsent frames. But be careful about outgoing SACKS
998 * and IP options.
999 */
1000 mss_now = tcp_current_mss(sk);
1001
1002 if(tp->send_head != NULL) {
1003 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1004 TCP_SKB_CB(skb)->end_seq++;
1005 tp->write_seq++;
1006 } else {
1007 /* Socket is locked, keep trying until memory is available. */
1008 for (;;) {
1009 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1010 if (skb)
1011 break;
1012 current->policy |= SCHED_YIELD;
1013 schedule();
1014 }
1015
1016 /* Reserve space for headers and prepare control bits. */
1017 skb_reserve(skb, MAX_TCP_HEADER);
1018 skb->csum = 0;
1019 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1020 TCP_SKB_CB(skb)->sacked = 0;
1021
1022 /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
1023 TCP_SKB_CB(skb)->seq = tp->write_seq;
1024 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1025 tcp_send_skb(sk, skb, 1, mss_now);
1026 }
1027 __tcp_push_pending_frames(sk, tp, mss_now, 1);
1028 }
1029
1030 /* We get here when a process closes a file descriptor (either due to
1031 * an explicit close() or as a byproduct of exit()'ing) and there
1032 * was unread data in the receive queue. This behavior is recommended
1033 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1034 */
1035 void tcp_send_active_reset(struct sock *sk, int priority)
1036 {
1037 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1038 struct sk_buff *skb;
1039
1040 /* NOTE: No TCP options attached and we never retransmit this. */
1041 skb = alloc_skb(MAX_TCP_HEADER, priority);
1042 if (!skb) {
1043 NET_INC_STATS(TCPAbortFailed);
1044 return;
1045 }
1046
1047 /* Reserve space for headers and prepare control bits. */
1048 skb_reserve(skb, MAX_TCP_HEADER);
1049 skb->csum = 0;
1050 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1051 TCP_SKB_CB(skb)->sacked = 0;
1052
1053 /* Send it off. */
1054 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1055 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1056 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1057 if (tcp_transmit_skb(sk, skb))
1058 NET_INC_STATS(TCPAbortFailed);
1059 }
1060
1061 /* WARNING: This routine must only be called when we have already sent
1062 * a SYN packet that crossed the incoming SYN that caused this routine
1063 * to get called. If this assumption fails then the initial rcv_wnd
1064 * and rcv_wscale values will not be correct.
1065 */
1066 int tcp_send_synack(struct sock *sk)
1067 {
1068 struct sk_buff* skb;
1069
1070 skb = skb_peek(&sk->write_queue);
1071 if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1072 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1073 return -EFAULT;
1074 }
1075 if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1076 if (skb_cloned(skb)) {
1077 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1078 if (nskb == NULL)
1079 return -ENOMEM;
1080 __skb_unlink(skb, &sk->write_queue);
1081 __skb_queue_head(&sk->write_queue, nskb);
1082 tcp_free_skb(sk, skb);
1083 tcp_charge_skb(sk, nskb);
1084 skb = nskb;
1085 }
1086
1087 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1088 TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
1089 }
1090 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1091 return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1092 }
1093
1094 /*
1095 * Prepare a SYN-ACK.
1096 */
1097 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1098 struct open_request *req)
1099 {
1100 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1101 struct tcphdr *th;
1102 int tcp_header_size;
1103 struct sk_buff *skb;
1104
1105 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1106 if (skb == NULL)
1107 return NULL;
1108
1109 /* Reserve space for headers. */
1110 skb_reserve(skb, MAX_TCP_HEADER);
1111
1112 skb->dst = dst_clone(dst);
1113
1114 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1115 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1116 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1117 /* SACK_PERM is in the place of NOP NOP of TS */
1118 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1119 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1120
1121 memset(th, 0, sizeof(struct tcphdr));
1122 th->syn = 1;
1123 th->ack = 1;
1124 TCP_ECN_make_synack(req, th);
1125 th->source = sk->sport;
1126 th->dest = req->rmt_port;
1127 TCP_SKB_CB(skb)->seq = req->snt_isn;
1128 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1129 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1130 th->ack_seq = htonl(req->rcv_isn + 1);
1131 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1132 __u8 rcv_wscale;
1133 /* Set this up on the first call only */
1134 req->window_clamp = tp->window_clamp ? : dst->window;
1135 /* tcp_full_space because it is guaranteed to be the first packet */
1136 tcp_select_initial_window(tcp_full_space(sk),
1137 dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1138 &req->rcv_wnd,
1139 &req->window_clamp,
1140 req->wscale_ok,
1141 &rcv_wscale);
1142 req->rcv_wscale = rcv_wscale;
1143 }
1144
1145 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1146 th->window = htons(req->rcv_wnd);
1147
1148 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1149 tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
1150 req->sack_ok, req->wscale_ok, req->rcv_wscale,
1151 TCP_SKB_CB(skb)->when,
1152 req->ts_recent);
1153
1154 skb->csum = 0;
1155 th->doff = (tcp_header_size >> 2);
1156 TCP_INC_STATS(TcpOutSegs);
1157 return skb;
1158 }
1159
1160 int tcp_connect(struct sock *sk, struct sk_buff *buff)
1161 {
1162 struct dst_entry *dst = __sk_dst_get(sk);
1163 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1164
1165 /* Reserve space for headers. */
1166 skb_reserve(buff, MAX_TCP_HEADER);
1167
1168 /* We'll fix this up when we get a response from the other end.
1169 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1170 */
1171 tp->tcp_header_len = sizeof(struct tcphdr) +
1172 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1173
1174 /* If user gave his TCP_MAXSEG, record it to clamp */
1175 if (tp->user_mss)
1176 tp->mss_clamp = tp->user_mss;
1177 tp->max_window = 0;
1178 tcp_sync_mss(sk, dst->pmtu);
1179
1180 if (!tp->window_clamp)
1181 tp->window_clamp = dst->window;
1182 tp->advmss = dst->advmss;
1183 tcp_initialize_rcv_mss(sk);
1184
1185 tcp_select_initial_window(tcp_full_space(sk),
1186 tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1187 &tp->rcv_wnd,
1188 &tp->window_clamp,
1189 sysctl_tcp_window_scaling,
1190 &tp->rcv_wscale);
1191
1192 tp->rcv_ssthresh = tp->rcv_wnd;
1193
1194 /* Socket identity change complete, no longer
1195 * in TCP_CLOSE, so enter ourselves into the
1196 * hash tables.
1197 */
1198 tcp_set_state(sk,TCP_SYN_SENT);
1199 if (tp->af_specific->hash_connecting(sk))
1200 goto err_out;
1201
1202 sk->err = 0;
1203 sk->done = 0;
1204 tp->snd_wnd = 0;
1205 tcp_init_wl(tp, tp->write_seq, 0);
1206 tp->snd_una = tp->write_seq;
1207 tp->snd_sml = tp->write_seq;
1208 tp->rcv_nxt = 0;
1209 tp->rcv_wup = 0;
1210 tp->copied_seq = 0;
1211
1212 tp->rto = TCP_TIMEOUT_INIT;
1213 tp->retransmits = 0;
1214 tcp_clear_retrans(tp);
1215
1216 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1217 TCP_ECN_send_syn(tp, buff);
1218 TCP_SKB_CB(buff)->sacked = 0;
1219 buff->csum = 0;
1220 TCP_SKB_CB(buff)->seq = tp->write_seq++;
1221 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1222 tp->snd_nxt = tp->write_seq;
1223 tp->pushed_seq = tp->write_seq;
1224
1225 /* Send it off. */
1226 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1227 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1228 __skb_queue_tail(&sk->write_queue, buff);
1229 tcp_charge_skb(sk, buff);
1230 tp->packets_out++;
1231 tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1232 TCP_INC_STATS(TcpActiveOpens);
1233
1234 /* Timer for repeating the SYN until an answer. */
1235 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1236 return 0;
1237
1238 err_out:
1239 tcp_set_state(sk,TCP_CLOSE);
1240 kfree_skb(buff);
1241 return -EADDRNOTAVAIL;
1242 }
1243
1244 /* Send out a delayed ack, the caller does the policy checking
1245 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
1246 * for details.
1247 */
1248 void tcp_send_delayed_ack(struct sock *sk)
1249 {
1250 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1251 int ato = tp->ack.ato;
1252 unsigned long timeout;
1253
1254 if (ato > TCP_DELACK_MIN) {
1255 int max_ato = HZ/2;
1256
1257 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1258 max_ato = TCP_DELACK_MAX;
1259
1260 /* Slow path, intersegment interval is "high". */
1261
1262 /* If some rtt estimate is known, use it to bound delayed ack.
1263 * Do not use tp->rto here, use results of rtt measurements
1264 * directly.
1265 */
1266 if (tp->srtt) {
1267 int rtt = max_t(unsigned int, tp->srtt>>3, TCP_DELACK_MIN);
1268
1269 if (rtt < max_ato)
1270 max_ato = rtt;
1271 }
1272
1273 ato = min_t(int, ato, max_ato);
1274 }
1275
1276 /* Stay within the limit we were given */
1277 timeout = jiffies + ato;
1278
1279 /* Use new timeout only if there wasn't a older one earlier. */
1280 if (tp->ack.pending&TCP_ACK_TIMER) {
1281 /* If delack timer was blocked or is about to expire,
1282 * send ACK now.
1283 */
1284 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1285 tcp_send_ack(sk);
1286 return;
1287 }
1288
1289 if (!time_before(timeout, tp->ack.timeout))
1290 timeout = tp->ack.timeout;
1291 }
1292 tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1293 tp->ack.timeout = timeout;
1294 if (!mod_timer(&tp->delack_timer, timeout))
1295 sock_hold(sk);
1296 }
1297
1298 /* This routine sends an ack and also updates the window. */
1299 void tcp_send_ack(struct sock *sk)
1300 {
1301 /* If we have been reset, we may not send again. */
1302 if(sk->state != TCP_CLOSE) {
1303 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1304 struct sk_buff *buff;
1305
1306 /* We are not putting this on the write queue, so
1307 * tcp_transmit_skb() will set the ownership to this
1308 * sock.
1309 */
1310 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1311 if (buff == NULL) {
1312 tcp_schedule_ack(tp);
1313 tp->ack.ato = TCP_ATO_MIN;
1314 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1315 return;
1316 }
1317
1318 /* Reserve space for headers and prepare control bits. */
1319 skb_reserve(buff, MAX_TCP_HEADER);
1320 buff->csum = 0;
1321 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1322 TCP_SKB_CB(buff)->sacked = 0;
1323
1324 /* Send it off, this clears delayed acks for us. */
1325 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1326 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1327 tcp_transmit_skb(sk, buff);
1328 }
1329 }
1330
1331 /* This routine sends a packet with an out of date sequence
1332 * number. It assumes the other end will try to ack it.
1333 *
1334 * Question: what should we make while urgent mode?
1335 * 4.4BSD forces sending single byte of data. We cannot send
1336 * out of window data, because we have SND.NXT==SND.MAX...
1337 *
1338 * Current solution: to send TWO zero-length segments in urgent mode:
1339 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
1340 * out-of-date with SND.UNA-1 to probe window.
1341 */
1342 static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1343 {
1344 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1345 struct sk_buff *skb;
1346
1347 /* We don't queue it, tcp_transmit_skb() sets ownership. */
1348 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1349 if (skb == NULL)
1350 return -1;
1351
1352 /* Reserve space for headers and set control bits. */
1353 skb_reserve(skb, MAX_TCP_HEADER);
1354 skb->csum = 0;
1355 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1356 TCP_SKB_CB(skb)->sacked = urgent;
1357
1358 /* Use a previous sequence. This should cause the other
1359 * end to send an ack. Don't queue or clone SKB, just
1360 * send it.
1361 */
1362 TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1363 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1364 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1365 return tcp_transmit_skb(sk, skb);
1366 }
1367
1368 int tcp_write_wakeup(struct sock *sk)
1369 {
1370 if (sk->state != TCP_CLOSE) {
1371 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1372 struct sk_buff *skb;
1373
1374 if ((skb = tp->send_head) != NULL &&
1375 before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1376 int err;
1377 int mss = tcp_current_mss(sk);
1378 int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1379
1380 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1381 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1382
1383 /* We are probing the opening of a window
1384 * but the window size is != 0
1385 * must have been a result SWS avoidance ( sender )
1386 */
1387 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1388 skb->len > mss) {
1389 seg_size = min_t(int, seg_size, mss);
1390 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1391 if (tcp_fragment(sk, skb, seg_size))
1392 return -1;
1393 }
1394 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1395 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1396 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1397 if (!err) {
1398 update_send_head(sk, tp, skb);
1399 }
1400 return err;
1401 } else {
1402 if (tp->urg_mode &&
1403 between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1404 tcp_xmit_probe_skb(sk, TCPCB_URG);
1405 return tcp_xmit_probe_skb(sk, 0);
1406 }
1407 }
1408 return -1;
1409 }
1410
1411 /* A window probe timeout has occurred. If window is not closed send
1412 * a partial packet else a zero probe.
1413 */
1414 void tcp_send_probe0(struct sock *sk)
1415 {
1416 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1417 int err;
1418
1419 err = tcp_write_wakeup(sk);
1420
1421 if (tp->packets_out || !tp->send_head) {
1422 /* Cancel probe timer, if it is not required. */
1423 tp->probes_out = 0;
1424 tp->backoff = 0;
1425 return;
1426 }
1427
1428 if (err <= 0) {
1429 tp->backoff++;
1430 tp->probes_out++;
1431 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1432 min_t(u32, tp->rto << tp->backoff, TCP_RTO_MAX));
1433 } else {
1434 /* If packet was not sent due to local congestion,
1435 * do not backoff and do not remember probes_out.
1436 * Let local senders to fight for local resources.
1437 *
1438 * Use accumulated backoff yet.
1439 */
1440 if (!tp->probes_out)
1441 tp->probes_out=1;
1442 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
1443 min_t(unsigned int, tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
1444 }
1445 }
1446