File: /usr/src/linux/net/core/datagram.c

1     /*
2      *	SUCS NET3:
3      *
4      *	Generic datagram handling routines. These are generic for all protocols. Possibly a generic IP version on top
5      *	of these would make sense. Not tonight however 8-).
6      *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and NetROM layer all have identical poll code and mostly
7      *	identical recvmsg() code. So we share it here. The poll was shared before but buried in udp.c so I moved it.
8      *
9      *	Authors:	Alan Cox <alan@redhat.com>. (datagram_poll() from old udp.c code)
10      *
11      *	Fixes:
12      *		Alan Cox	:	NULL return from skb_peek_copy() understood
13      *		Alan Cox	:	Rewrote skb_read_datagram to avoid the skb_peek_copy stuff.
14      *		Alan Cox	:	Added support for SOCK_SEQPACKET. IPX can no longer use the SO_TYPE hack but
15      *					AX.25 now works right, and SPX is feasible.
16      *		Alan Cox	:	Fixed write poll of non IP protocol crash.
17      *		Florian  La Roche:	Changed for my new skbuff handling.
18      *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
19      *		Linus Torvalds	:	BSD semantic fixes.
20      *		Alan Cox	:	Datagram iovec handling
21      *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
22      *		Alan Cox	:	POSIXisms
23      *		Pete Wyckoff    :       Unconnected accept() fix.
24      *
25      */
26     
27     #include <linux/types.h>
28     #include <linux/kernel.h>
29     #include <asm/uaccess.h>
30     #include <asm/system.h>
31     #include <linux/mm.h>
32     #include <linux/interrupt.h>
33     #include <linux/in.h>
34     #include <linux/errno.h>
35     #include <linux/sched.h>
36     #include <linux/inet.h>
37     #include <linux/netdevice.h>
38     #include <linux/poll.h>
39     #include <linux/highmem.h>
40     
41     #include <net/ip.h>
42     #include <net/protocol.h>
43     #include <net/route.h>
44     #include <net/tcp.h>
45     #include <net/udp.h>
46     #include <linux/skbuff.h>
47     #include <net/sock.h>
48     
49     
50     /*
51      *	Is a socket 'connection oriented' ?
52      */
53      
54     static inline int connection_based(struct sock *sk)
55     {
56     	return (sk->type==SOCK_SEQPACKET || sk->type==SOCK_STREAM);
57     }
58     
59     
60     /*
61      * Wait for a packet..
62      */
63     
64     static int wait_for_packet(struct sock * sk, int *err, long *timeo_p)
65     {
66     	int error;
67     
68     	DECLARE_WAITQUEUE(wait, current);
69     
70     	__set_current_state(TASK_INTERRUPTIBLE);
71     	add_wait_queue_exclusive(sk->sleep, &wait);
72     
73     	/* Socket errors? */
74     	error = sock_error(sk);
75     	if (error)
76     		goto out_err;
77     
78     	if (!skb_queue_empty(&sk->receive_queue))
79     		goto ready;
80     
81     	/* Socket shut down? */
82     	if (sk->shutdown & RCV_SHUTDOWN)
83     		goto out_noerr;
84     
85     	/* Sequenced packets can come disconnected. If so we report the problem */
86     	error = -ENOTCONN;
87     	if(connection_based(sk) && !(sk->state==TCP_ESTABLISHED || sk->state==TCP_LISTEN))
88     		goto out_err;
89     
90     	/* handle signals */
91     	if (signal_pending(current))
92     		goto interrupted;
93     
94     	*timeo_p = schedule_timeout(*timeo_p);
95     
96     ready:
97     	current->state = TASK_RUNNING;
98     	remove_wait_queue(sk->sleep, &wait);
99     	return 0;
100     
101     interrupted:
102     	error = sock_intr_errno(*timeo_p);
103     out_err:
104     	*err = error;
105     out:
106     	current->state = TASK_RUNNING;
107     	remove_wait_queue(sk->sleep, &wait);
108     	return error;
109     out_noerr:
110     	*err = 0;
111     	error = 1;
112     	goto out;
113     }
114     
115     /*
116      *	Get a datagram skbuff, understands the peeking, nonblocking wakeups and possible
117      *	races. This replaces identical code in packet,raw and udp, as well as the IPX
118      *	AX.25 and Appletalk. It also finally fixes the long standing peek and read
119      *	race for datagram sockets. If you alter this routine remember it must be
120      *	re-entrant.
121      *
122      *	This function will lock the socket if a skb is returned, so the caller
123      *	needs to unlock the socket in that case (usually by calling skb_free_datagram)
124      *
125      *	* It does not lock socket since today. This function is
126      *	* free of race conditions. This measure should/can improve
127      *	* significantly datagram socket latencies at high loads,
128      *	* when data copying to user space takes lots of time.
129      *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
130      *	*  8) Great win.)
131      *	*			                    --ANK (980729)
132      *
133      *	The order of the tests when we find no data waiting are specified
134      *	quite explicitly by POSIX 1003.1g, don't change them without having
135      *	the standard around please.
136      */
137     
138     struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err)
139     {
140     	int error;
141     	struct sk_buff *skb;
142     	long timeo;
143     
144     	/* Caller is allowed not to check sk->err before skb_recv_datagram() */
145     	error = sock_error(sk);
146     	if (error)
147     		goto no_packet;
148     
149     	timeo = sock_rcvtimeo(sk, noblock);
150     
151     	do {
152     		/* Again only user level code calls this function, so nothing interrupt level
153     		   will suddenly eat the receive_queue.
154     
155     		   Look at current nfs client by the way...
156     		   However, this function was corrent in any case. 8)
157     		 */
158     		if (flags & MSG_PEEK)
159     		{
160     			unsigned long cpu_flags;
161     
162     			spin_lock_irqsave(&sk->receive_queue.lock, cpu_flags);
163     			skb = skb_peek(&sk->receive_queue);
164     			if(skb!=NULL)
165     				atomic_inc(&skb->users);
166     			spin_unlock_irqrestore(&sk->receive_queue.lock, cpu_flags);
167     		} else
168     			skb = skb_dequeue(&sk->receive_queue);
169     
170     		if (skb)
171     			return skb;
172     
173     		/* User doesn't want to wait */
174     		error = -EAGAIN;
175     		if (!timeo)
176     			goto no_packet;
177     
178     	} while (wait_for_packet(sk, err, &timeo) == 0);
179     
180     	return NULL;
181     
182     no_packet:
183     	*err = error;
184     	return NULL;
185     }
186     
/*
 *	Release a datagram obtained from skb_recv_datagram().
 *	The socket argument is not used; the buffer is simply handed
 *	to kfree_skb().
 */
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
}
191     
192     /*
193      *	Copy a datagram to a linear buffer.
194      */
195     
196     int skb_copy_datagram(const struct sk_buff *skb, int offset, char *to, int size)
197     {
198     	struct iovec iov = { to, size };
199     
200     	return skb_copy_datagram_iovec(skb, offset, &iov, size);
201     }
202     
203     /*
204      *	Copy a datagram to an iovec.
205      *	Note: the iovec is modified during the copy.
206      */
207     int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, struct iovec *to,
208     			    int len)
209     {
210     	int i, copy;
211     	int start = skb->len - skb->data_len;
212     
213     	/* Copy header. */
214     	if ((copy = start-offset) > 0) {
215     		if (copy > len)
216     			copy = len;
217     		if (memcpy_toiovec(to, skb->data + offset, copy))
218     			goto fault;
219     		if ((len -= copy) == 0)
220     			return 0;
221     		offset += copy;
222     	}
223     
224     	/* Copy paged appendix. Hmm... why does this look so complicated? */
225     	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
226     		int end;
227     
228     		BUG_TRAP(start <= offset+len);
229     
230     		end = start + skb_shinfo(skb)->frags[i].size;
231     		if ((copy = end-offset) > 0) {
232     			int err;
233     			u8  *vaddr;
234     			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
235     			struct page *page = frag->page;
236     
237     			if (copy > len)
238     				copy = len;
239     			vaddr = kmap(page);
240     			err = memcpy_toiovec(to, vaddr + frag->page_offset +
241     					     offset-start, copy);
242     			kunmap(page);
243     			if (err)
244     				goto fault;
245     			if (!(len -= copy))
246     				return 0;
247     			offset += copy;
248     		}
249     		start = end;
250     	}
251     
252     	if (skb_shinfo(skb)->frag_list) {
253     		struct sk_buff *list;
254     
255     		for (list = skb_shinfo(skb)->frag_list; list; list=list->next) {
256     			int end;
257     
258     			BUG_TRAP(start <= offset+len);
259     
260     			end = start + list->len;
261     			if ((copy = end-offset) > 0) {
262     				if (copy > len)
263     					copy = len;
264     				if (skb_copy_datagram_iovec(list, offset-start, to, copy))
265     					goto fault;
266     				if ((len -= copy) == 0)
267     					return 0;
268     				offset += copy;
269     			}
270     			start = end;
271     		}
272     	}
273     	if (len == 0)
274     		return 0;
275     
276     fault:
277     	return -EFAULT;
278     }
279     
280     int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int *csump)
281     {
282     	int i, copy;
283     	int start = skb->len - skb->data_len;
284     	int pos = 0;
285     
286     	/* Copy header. */
287     	if ((copy = start-offset) > 0) {
288     		int err = 0;
289     		if (copy > len)
290     			copy = len;
291     		*csump = csum_and_copy_to_user(skb->data+offset, to, copy, *csump, &err);
292     		if (err)
293     			goto fault;
294     		if ((len -= copy) == 0)
295     			return 0;
296     		offset += copy;
297     		to += copy;
298     		pos = copy;
299     	}
300     
301     	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
302     		int end;
303     
304     		BUG_TRAP(start <= offset+len);
305     
306     		end = start + skb_shinfo(skb)->frags[i].size;
307     		if ((copy = end-offset) > 0) {
308     			unsigned int csum2;
309     			int err = 0;
310     			u8  *vaddr;
311     			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
312     			struct page *page = frag->page;
313     
314     			if (copy > len)
315     				copy = len;
316     			vaddr = kmap(page);
317     			csum2 = csum_and_copy_to_user(vaddr + frag->page_offset +
318     						      offset-start, to, copy, 0, &err);
319     			kunmap(page);
320     			if (err)
321     				goto fault;
322     			*csump = csum_block_add(*csump, csum2, pos);
323     			if (!(len -= copy))
324     				return 0;
325     			offset += copy;
326     			to += copy;
327     			pos += copy;
328     		}
329     		start = end;
330     	}
331     
332     	if (skb_shinfo(skb)->frag_list) {
333     		struct sk_buff *list;
334     
335     		for (list = skb_shinfo(skb)->frag_list; list; list=list->next) {
336     			int end;
337     
338     			BUG_TRAP(start <= offset+len);
339     
340     			end = start + list->len;
341     			if ((copy = end-offset) > 0) {
342     				unsigned int csum2 = 0;
343     				if (copy > len)
344     					copy = len;
345     				if (skb_copy_and_csum_datagram(list, offset-start, to, copy, &csum2))
346     					goto fault;
347     				*csump = csum_block_add(*csump, csum2, pos);
348     				if ((len -= copy) == 0)
349     					return 0;
350     				offset += copy;
351     				to += copy;
352     				pos += copy;
353     			}
354     			start = end;
355     		}
356     	}
357     	if (len == 0)
358     		return 0;
359     
360     fault:
361     	return -EFAULT;
362     }
363     
364     /* Copy and checkum skb to user iovec. Caller _must_ check that
365        skb will fit to this iovec.
366     
367        Returns: 0       - success.
368                 -EINVAL - checksum failure.
369     	    -EFAULT - fault during copy. Beware, in this case iovec can be
370     	              modified!
371      */
372     
373     int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, int hlen, struct iovec *iov)
374     {
375     	unsigned int csum;
376     	int chunk = skb->len - hlen;
377     
378     	/* Skip filled elements. Pretty silly, look at memcpy_toiovec, though 8) */
379     	while (iov->iov_len == 0)
380     		iov++;
381     
382     	if (iov->iov_len < chunk) {
383     		if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk+hlen, skb->csum)))
384     			goto csum_error;
385     		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
386     			goto fault;
387     	} else {
388     		csum = csum_partial(skb->data, hlen, skb->csum);
389     		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, chunk, &csum))
390     			goto fault;
391     		if ((unsigned short)csum_fold(csum))
392     			goto csum_error;
393     		iov->iov_len -= chunk;
394     		iov->iov_base += chunk;
395     	}
396     	return 0;
397     
398     csum_error:
399     	return -EINVAL;
400     
401     fault:
402     	return -EFAULT;
403     }
404     
405     
406     
407     /*
408      *	Datagram poll: Again totally generic. This also handles
409      *	sequenced packet sockets providing the socket receive queue
410      *	is only ever holding data ready to receive.
411      *
412      *	Note: when you _don't_ use this routine for this protocol,
413      *	and you use a different write policy from sock_writeable()
414      *	then please supply your own write_space callback.
415      */
416     
417     unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait)
418     {
419     	struct sock *sk = sock->sk;
420     	unsigned int mask;
421     
422     	poll_wait(file, sk->sleep, wait);
423     	mask = 0;
424     
425     	/* exceptional events? */
426     	if (sk->err || !skb_queue_empty(&sk->error_queue))
427     		mask |= POLLERR;
428     	if (sk->shutdown == SHUTDOWN_MASK)
429     		mask |= POLLHUP;
430     
431     	/* readable? */
432     	if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown&RCV_SHUTDOWN))
433     		mask |= POLLIN | POLLRDNORM;
434     
435     	/* Connection-based need to check for termination and startup */
436     	if (connection_based(sk)) {
437     		if (sk->state==TCP_CLOSE)
438     			mask |= POLLHUP;
439     		/* connection hasn't started yet? */
440     		if (sk->state == TCP_SYN_SENT)
441     			return mask;
442     	}
443     
444     	/* writable? */
445     	if (sock_writeable(sk))
446     		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
447     	else
448     		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
449     
450     	return mask;
451     }
452