File: /usr/src/linux/net/core/datagram.c
1 /*
2 * SUCS NET3:
3 *
4 * Generic datagram handling routines. These are generic for all protocols. Possibly a generic IP version on top
5 * of these would make sense. Not tonight however 8-).
6 * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and NetROM layer all have identical poll code and mostly
7 * identical recvmsg() code. So we share it here. The poll was shared before but buried in udp.c so I moved it.
8 *
9 * Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old udp.c code)
10 *
11 * Fixes:
12 * Alan Cox : NULL return from skb_peek_copy() understood
13 * Alan Cox : Rewrote skb_read_datagram to avoid the skb_peek_copy stuff.
14 * Alan Cox : Added support for SOCK_SEQPACKET. IPX can no longer use the SO_TYPE hack but
15 * AX.25 now works right, and SPX is feasible.
16 * Alan Cox : Fixed write poll of non IP protocol crash.
17 * Florian La Roche: Changed for my new skbuff handling.
18 * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
19 * Linus Torvalds : BSD semantic fixes.
20 * Alan Cox : Datagram iovec handling
21 * Darryl Miles : Fixed non-blocking SOCK_STREAM.
22 * Alan Cox : POSIXisms
23 * Pete Wyckoff : Unconnected accept() fix.
24 *
25 */
26
27 #include <linux/types.h>
28 #include <linux/kernel.h>
29 #include <asm/uaccess.h>
30 #include <asm/system.h>
31 #include <linux/mm.h>
32 #include <linux/interrupt.h>
33 #include <linux/in.h>
34 #include <linux/errno.h>
35 #include <linux/sched.h>
36 #include <linux/inet.h>
37 #include <linux/netdevice.h>
38 #include <linux/poll.h>
39 #include <linux/highmem.h>
40
41 #include <net/ip.h>
42 #include <net/protocol.h>
43 #include <net/route.h>
44 #include <net/tcp.h>
45 #include <net/udp.h>
46 #include <linux/skbuff.h>
47 #include <net/sock.h>
48
49
50 /*
51 * Is a socket 'connection oriented' ?
52 */
53
54 static inline int connection_based(struct sock *sk)
55 {
56 return (sk->type==SOCK_SEQPACKET || sk->type==SOCK_STREAM);
57 }
58
59
60 /*
61 * Wait for a packet..
62 */
63
64 static int wait_for_packet(struct sock * sk, int *err, long *timeo_p)
65 {
66 int error;
67
68 DECLARE_WAITQUEUE(wait, current);
69
70 __set_current_state(TASK_INTERRUPTIBLE);
71 add_wait_queue_exclusive(sk->sleep, &wait);
72
73 /* Socket errors? */
74 error = sock_error(sk);
75 if (error)
76 goto out_err;
77
78 if (!skb_queue_empty(&sk->receive_queue))
79 goto ready;
80
81 /* Socket shut down? */
82 if (sk->shutdown & RCV_SHUTDOWN)
83 goto out_noerr;
84
85 /* Sequenced packets can come disconnected. If so we report the problem */
86 error = -ENOTCONN;
87 if(connection_based(sk) && !(sk->state==TCP_ESTABLISHED || sk->state==TCP_LISTEN))
88 goto out_err;
89
90 /* handle signals */
91 if (signal_pending(current))
92 goto interrupted;
93
94 *timeo_p = schedule_timeout(*timeo_p);
95
96 ready:
97 current->state = TASK_RUNNING;
98 remove_wait_queue(sk->sleep, &wait);
99 return 0;
100
101 interrupted:
102 error = sock_intr_errno(*timeo_p);
103 out_err:
104 *err = error;
105 out:
106 current->state = TASK_RUNNING;
107 remove_wait_queue(sk->sleep, &wait);
108 return error;
109 out_noerr:
110 *err = 0;
111 error = 1;
112 goto out;
113 }
114
115 /*
116 * Get a datagram skbuff, understands the peeking, nonblocking wakeups and possible
117 * races. This replaces identical code in packet,raw and udp, as well as the IPX
118 * AX.25 and Appletalk. It also finally fixes the long standing peek and read
119 * race for datagram sockets. If you alter this routine remember it must be
120 * re-entrant.
121 *
 *	This function used to lock the socket whenever a skb was returned, so the
 *	caller had to unlock the socket in that case (usually by calling
 *	skb_free_datagram). That is no longer true; see the note that follows.
124 *
125 * * It does not lock socket since today. This function is
126 * * free of race conditions. This measure should/can improve
127 * * significantly datagram socket latencies at high loads,
128 * * when data copying to user space takes lots of time.
129 * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
130 * * 8) Great win.)
131 * * --ANK (980729)
132 *
133 * The order of the tests when we find no data waiting are specified
134 * quite explicitly by POSIX 1003.1g, don't change them without having
135 * the standard around please.
136 */
137
138 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err)
139 {
140 int error;
141 struct sk_buff *skb;
142 long timeo;
143
144 /* Caller is allowed not to check sk->err before skb_recv_datagram() */
145 error = sock_error(sk);
146 if (error)
147 goto no_packet;
148
149 timeo = sock_rcvtimeo(sk, noblock);
150
151 do {
152 /* Again only user level code calls this function, so nothing interrupt level
153 will suddenly eat the receive_queue.
154
155 Look at current nfs client by the way...
156 However, this function was corrent in any case. 8)
157 */
158 if (flags & MSG_PEEK)
159 {
160 unsigned long cpu_flags;
161
162 spin_lock_irqsave(&sk->receive_queue.lock, cpu_flags);
163 skb = skb_peek(&sk->receive_queue);
164 if(skb!=NULL)
165 atomic_inc(&skb->users);
166 spin_unlock_irqrestore(&sk->receive_queue.lock, cpu_flags);
167 } else
168 skb = skb_dequeue(&sk->receive_queue);
169
170 if (skb)
171 return skb;
172
173 /* User doesn't want to wait */
174 error = -EAGAIN;
175 if (!timeo)
176 goto no_packet;
177
178 } while (wait_for_packet(sk, err, &timeo) == 0);
179
180 return NULL;
181
182 no_packet:
183 *err = error;
184 return NULL;
185 }
186
/*
 *	Release a skb obtained from skb_recv_datagram().
 *	The sk argument is not used here; the skb is simply handed
 *	to kfree_skb().
 */
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
}
191
192 /*
193 * Copy a datagram to a linear buffer.
194 */
195
196 int skb_copy_datagram(const struct sk_buff *skb, int offset, char *to, int size)
197 {
198 struct iovec iov = { to, size };
199
200 return skb_copy_datagram_iovec(skb, offset, &iov, size);
201 }
202
203 /*
204 * Copy a datagram to an iovec.
205 * Note: the iovec is modified during the copy.
206 */
207 int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, struct iovec *to,
208 int len)
209 {
210 int i, copy;
211 int start = skb->len - skb->data_len;
212
213 /* Copy header. */
214 if ((copy = start-offset) > 0) {
215 if (copy > len)
216 copy = len;
217 if (memcpy_toiovec(to, skb->data + offset, copy))
218 goto fault;
219 if ((len -= copy) == 0)
220 return 0;
221 offset += copy;
222 }
223
224 /* Copy paged appendix. Hmm... why does this look so complicated? */
225 for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
226 int end;
227
228 BUG_TRAP(start <= offset+len);
229
230 end = start + skb_shinfo(skb)->frags[i].size;
231 if ((copy = end-offset) > 0) {
232 int err;
233 u8 *vaddr;
234 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
235 struct page *page = frag->page;
236
237 if (copy > len)
238 copy = len;
239 vaddr = kmap(page);
240 err = memcpy_toiovec(to, vaddr + frag->page_offset +
241 offset-start, copy);
242 kunmap(page);
243 if (err)
244 goto fault;
245 if (!(len -= copy))
246 return 0;
247 offset += copy;
248 }
249 start = end;
250 }
251
252 if (skb_shinfo(skb)->frag_list) {
253 struct sk_buff *list;
254
255 for (list = skb_shinfo(skb)->frag_list; list; list=list->next) {
256 int end;
257
258 BUG_TRAP(start <= offset+len);
259
260 end = start + list->len;
261 if ((copy = end-offset) > 0) {
262 if (copy > len)
263 copy = len;
264 if (skb_copy_datagram_iovec(list, offset-start, to, copy))
265 goto fault;
266 if ((len -= copy) == 0)
267 return 0;
268 offset += copy;
269 }
270 start = end;
271 }
272 }
273 if (len == 0)
274 return 0;
275
276 fault:
277 return -EFAULT;
278 }
279
280 int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int *csump)
281 {
282 int i, copy;
283 int start = skb->len - skb->data_len;
284 int pos = 0;
285
286 /* Copy header. */
287 if ((copy = start-offset) > 0) {
288 int err = 0;
289 if (copy > len)
290 copy = len;
291 *csump = csum_and_copy_to_user(skb->data+offset, to, copy, *csump, &err);
292 if (err)
293 goto fault;
294 if ((len -= copy) == 0)
295 return 0;
296 offset += copy;
297 to += copy;
298 pos = copy;
299 }
300
301 for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
302 int end;
303
304 BUG_TRAP(start <= offset+len);
305
306 end = start + skb_shinfo(skb)->frags[i].size;
307 if ((copy = end-offset) > 0) {
308 unsigned int csum2;
309 int err = 0;
310 u8 *vaddr;
311 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
312 struct page *page = frag->page;
313
314 if (copy > len)
315 copy = len;
316 vaddr = kmap(page);
317 csum2 = csum_and_copy_to_user(vaddr + frag->page_offset +
318 offset-start, to, copy, 0, &err);
319 kunmap(page);
320 if (err)
321 goto fault;
322 *csump = csum_block_add(*csump, csum2, pos);
323 if (!(len -= copy))
324 return 0;
325 offset += copy;
326 to += copy;
327 pos += copy;
328 }
329 start = end;
330 }
331
332 if (skb_shinfo(skb)->frag_list) {
333 struct sk_buff *list;
334
335 for (list = skb_shinfo(skb)->frag_list; list; list=list->next) {
336 int end;
337
338 BUG_TRAP(start <= offset+len);
339
340 end = start + list->len;
341 if ((copy = end-offset) > 0) {
342 unsigned int csum2 = 0;
343 if (copy > len)
344 copy = len;
345 if (skb_copy_and_csum_datagram(list, offset-start, to, copy, &csum2))
346 goto fault;
347 *csump = csum_block_add(*csump, csum2, pos);
348 if ((len -= copy) == 0)
349 return 0;
350 offset += copy;
351 to += copy;
352 pos += copy;
353 }
354 start = end;
355 }
356 }
357 if (len == 0)
358 return 0;
359
360 fault:
361 return -EFAULT;
362 }
363
364 /* Copy and checkum skb to user iovec. Caller _must_ check that
365 skb will fit to this iovec.
366
367 Returns: 0 - success.
368 -EINVAL - checksum failure.
369 -EFAULT - fault during copy. Beware, in this case iovec can be
370 modified!
371 */
372
373 int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, int hlen, struct iovec *iov)
374 {
375 unsigned int csum;
376 int chunk = skb->len - hlen;
377
378 /* Skip filled elements. Pretty silly, look at memcpy_toiovec, though 8) */
379 while (iov->iov_len == 0)
380 iov++;
381
382 if (iov->iov_len < chunk) {
383 if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk+hlen, skb->csum)))
384 goto csum_error;
385 if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
386 goto fault;
387 } else {
388 csum = csum_partial(skb->data, hlen, skb->csum);
389 if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, chunk, &csum))
390 goto fault;
391 if ((unsigned short)csum_fold(csum))
392 goto csum_error;
393 iov->iov_len -= chunk;
394 iov->iov_base += chunk;
395 }
396 return 0;
397
398 csum_error:
399 return -EINVAL;
400
401 fault:
402 return -EFAULT;
403 }
404
405
406
407 /*
408 * Datagram poll: Again totally generic. This also handles
409 * sequenced packet sockets providing the socket receive queue
410 * is only ever holding data ready to receive.
411 *
412 * Note: when you _don't_ use this routine for this protocol,
413 * and you use a different write policy from sock_writeable()
414 * then please supply your own write_space callback.
415 */
416
417 unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait)
418 {
419 struct sock *sk = sock->sk;
420 unsigned int mask;
421
422 poll_wait(file, sk->sleep, wait);
423 mask = 0;
424
425 /* exceptional events? */
426 if (sk->err || !skb_queue_empty(&sk->error_queue))
427 mask |= POLLERR;
428 if (sk->shutdown == SHUTDOWN_MASK)
429 mask |= POLLHUP;
430
431 /* readable? */
432 if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown&RCV_SHUTDOWN))
433 mask |= POLLIN | POLLRDNORM;
434
435 /* Connection-based need to check for termination and startup */
436 if (connection_based(sk)) {
437 if (sk->state==TCP_CLOSE)
438 mask |= POLLHUP;
439 /* connection hasn't started yet? */
440 if (sk->state == TCP_SYN_SENT)
441 return mask;
442 }
443
444 /* writable? */
445 if (sock_writeable(sk))
446 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
447 else
448 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
449
450 return mask;
451 }
452