File: /usr/src/linux/net/core/sock.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Version: $Id: sock.c,v 1.112 2001/07/27 09:54:48 davem Exp $
11 *
12 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
13 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 * Alan Cox : Numerous verify_area() problems
19 * Alan Cox : Connecting on a connecting socket
20 * now returns an error for tcp.
21 * Alan Cox : sock->protocol is set correctly.
22 * and is not sometimes left as 0.
23 * Alan Cox : connect handles icmp errors on a
24 * connect properly. Unfortunately there
25 * is a restart syscall nasty there. I
26 * can't match BSD without hacking the C
27 * library. Ideas urgently sought!
28 * Alan Cox : Disallow bind() to addresses that are
29 * not ours - especially broadcast ones!!
30 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
31 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
32 * instead they leave that for the DESTROY timer.
33 * Alan Cox : Clean up error flag in accept
34 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
35 * was buggy. Put a remove_sock() in the handler
36 * for memory when we hit 0. Also altered the timer
37 * code. The ACK stuff can wait and needs major
38 * TCP layer surgery.
39 * Alan Cox : Fixed TCP ack bug, removed remove sock
40 * and fixed timer/inet_bh race.
41 * Alan Cox : Added zapped flag for TCP
42 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
43 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
45 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 * Rick Sladkey : Relaxed UDP rules for matching packets.
48 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
49 * Pauline Middelink : identd support
50 * Alan Cox : Fixed connect() taking signals I think.
51 * Alan Cox : SO_LINGER supported
52 * Alan Cox : Error reporting fixes
53 * Anonymous : inet_create tidied up (sk->reuse setting)
54 * Alan Cox : inet sockets don't set sk->type!
55 * Alan Cox : Split socket option code
56 * Alan Cox : Callbacks
57 * Alan Cox : Nagle flag for Charles & Johannes stuff
58 * Alex : Removed restriction on inet fioctl
59 * Alan Cox : Splitting INET from NET core
60 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
61 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
62 * Alan Cox : Split IP from generic code
63 * Alan Cox : New kfree_skbmem()
64 * Alan Cox : Make SO_DEBUG superuser only.
65 * Alan Cox : Allow anyone to clear SO_DEBUG
66 * (compatibility fix)
67 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
68 * Alan Cox : Allocator for a socket is settable.
69 * Alan Cox : SO_ERROR includes soft errors.
70 * Alan Cox : Allow NULL arguments on some SO_ opts
71 * Alan Cox : Generic socket allocation to make hooks
72 * easier (suggested by Craig Metz).
73 * Michael Pall : SO_ERROR returns positive errno again
74 * Steve Whitehouse: Added default destructor to free
75 * protocol private data.
76 * Steve Whitehouse: Added various other default routines
77 * common to several socket families.
78 * Chris Evans : Call suser() check last on F_SETOWN
79 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
81 * Andi Kleen : Fix write_space callback
82 * Chris Evans : Security fixes - signedness again
83 * Arnaldo C. Melo : cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 * This program is free software; you can redistribute it and/or
89 * modify it under the terms of the GNU General Public License
90 * as published by the Free Software Foundation; either version
91 * 2 of the License, or (at your option) any later version.
92 */
93
94 #include <linux/config.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/major.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/fcntl.h>
107 #include <linux/mm.h>
108 #include <linux/slab.h>
109 #include <linux/interrupt.h>
110 #include <linux/poll.h>
111 #include <linux/init.h>
112
113 #include <asm/uaccess.h>
114 #include <asm/system.h>
115
116 #include <linux/inet.h>
117 #include <linux/netdevice.h>
118 #include <net/ip.h>
119 #include <net/protocol.h>
120 #include <net/arp.h>
121 #include <net/route.h>
122 #include <net/tcp.h>
123 #include <net/udp.h>
124 #include <linux/skbuff.h>
125 #include <net/sock.h>
126 #include <net/raw.h>
127 #include <net/icmp.h>
128 #include <linux/ipsec.h>
129
130 #ifdef CONFIG_FILTER
131 #include <linux/filter.h>
132 #endif
133
134 /* Run time adjustable parameters. */
135 __u32 sysctl_wmem_max = SK_WMEM_MAX;
136 __u32 sysctl_rmem_max = SK_RMEM_MAX;
137 __u32 sysctl_wmem_default = SK_WMEM_MAX;
138 __u32 sysctl_rmem_default = SK_RMEM_MAX;
139
140 /* Maximal space eaten by iovec or ancillary data plus some space */
141 int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
142
143 static int sock_set_timeout(long *timeo_p, char *optval, int optlen)
144 {
145 struct timeval tv;
146
147 if (optlen < sizeof(tv))
148 return -EINVAL;
149 if (copy_from_user(&tv, optval, sizeof(tv)))
150 return -EFAULT;
151
152 *timeo_p = MAX_SCHEDULE_TIMEOUT;
153 if (tv.tv_sec == 0 && tv.tv_usec == 0)
154 return 0;
155 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
156 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
157 return 0;
158 }
159
160 /*
161 * This is meant for all protocols to use and covers goings on
162 * at the socket level. Everything here is generic.
163 */
164
165 int sock_setsockopt(struct socket *sock, int level, int optname,
166 char *optval, int optlen)
167 {
168 struct sock *sk=sock->sk;
169 #ifdef CONFIG_FILTER
170 struct sk_filter *filter;
171 #endif
172 int val;
173 int valbool;
174 struct linger ling;
175 int ret = 0;
176
177 /*
178 * Options without arguments
179 */
180
181 #ifdef SO_DONTLINGER /* Compatibility item... */
182 switch(optname)
183 {
184 case SO_DONTLINGER:
185 sk->linger=0;
186 return 0;
187 }
188 #endif
189
190 if(optlen<sizeof(int))
191 return(-EINVAL);
192
193 if (get_user(val, (int *)optval))
194 return -EFAULT;
195
196 valbool = val?1:0;
197
198 lock_sock(sk);
199
200 switch(optname)
201 {
202 case SO_DEBUG:
203 if(val && !capable(CAP_NET_ADMIN))
204 {
205 ret = -EACCES;
206 }
207 else
208 sk->debug=valbool;
209 break;
210 case SO_REUSEADDR:
211 sk->reuse = valbool;
212 break;
213 case SO_TYPE:
214 case SO_ERROR:
215 ret = -ENOPROTOOPT;
216 break;
217 case SO_DONTROUTE:
218 sk->localroute=valbool;
219 break;
220 case SO_BROADCAST:
221 sk->broadcast=valbool;
222 break;
223 case SO_SNDBUF:
224 /* Don't error on this BSD doesn't and if you think
225 about it this is right. Otherwise apps have to
226 play 'guess the biggest size' games. RCVBUF/SNDBUF
227 are treated in BSD as hints */
228
229 if (val > sysctl_wmem_max)
230 val = sysctl_wmem_max;
231
232 sk->userlocks |= SOCK_SNDBUF_LOCK;
233 if ((val * 2) < SOCK_MIN_SNDBUF)
234 sk->sndbuf = SOCK_MIN_SNDBUF;
235 else
236 sk->sndbuf = (val * 2);
237
238 /*
239 * Wake up sending tasks if we
240 * upped the value.
241 */
242 sk->write_space(sk);
243 break;
244
245 case SO_RCVBUF:
246 /* Don't error on this BSD doesn't and if you think
247 about it this is right. Otherwise apps have to
248 play 'guess the biggest size' games. RCVBUF/SNDBUF
249 are treated in BSD as hints */
250
251 if (val > sysctl_rmem_max)
252 val = sysctl_rmem_max;
253
254 sk->userlocks |= SOCK_RCVBUF_LOCK;
255 /* FIXME: is this lower bound the right one? */
256 if ((val * 2) < SOCK_MIN_RCVBUF)
257 sk->rcvbuf = SOCK_MIN_RCVBUF;
258 else
259 sk->rcvbuf = (val * 2);
260 break;
261
262 case SO_KEEPALIVE:
263 #ifdef CONFIG_INET
264 if (sk->protocol == IPPROTO_TCP)
265 {
266 tcp_set_keepalive(sk, valbool);
267 }
268 #endif
269 sk->keepopen = valbool;
270 break;
271
272 case SO_OOBINLINE:
273 sk->urginline = valbool;
274 break;
275
276 case SO_NO_CHECK:
277 sk->no_check = valbool;
278 break;
279
280 case SO_PRIORITY:
281 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
282 sk->priority = val;
283 else
284 ret = -EPERM;
285 break;
286
287 case SO_LINGER:
288 if(optlen<sizeof(ling)) {
289 ret = -EINVAL; /* 1003.1g */
290 break;
291 }
292 if (copy_from_user(&ling,optval,sizeof(ling))) {
293 ret = -EFAULT;
294 break;
295 }
296 if(ling.l_onoff==0) {
297 sk->linger=0;
298 } else {
299 #if (BITS_PER_LONG == 32)
300 if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
301 sk->lingertime=MAX_SCHEDULE_TIMEOUT;
302 else
303 #endif
304 sk->lingertime=ling.l_linger*HZ;
305 sk->linger=1;
306 }
307 break;
308
309 case SO_BSDCOMPAT:
310 sk->bsdism = valbool;
311 break;
312
313 case SO_PASSCRED:
314 sock->passcred = valbool;
315 break;
316
317 case SO_TIMESTAMP:
318 sk->rcvtstamp = valbool;
319 break;
320
321 case SO_RCVLOWAT:
322 if (val < 0)
323 val = INT_MAX;
324 sk->rcvlowat = val ? : 1;
325 break;
326
327 case SO_RCVTIMEO:
328 ret = sock_set_timeout(&sk->rcvtimeo, optval, optlen);
329 break;
330
331 case SO_SNDTIMEO:
332 ret = sock_set_timeout(&sk->sndtimeo, optval, optlen);
333 break;
334
335 #ifdef CONFIG_NETDEVICES
336 case SO_BINDTODEVICE:
337 {
338 char devname[IFNAMSIZ];
339
340 /* Sorry... */
341 if (!capable(CAP_NET_RAW)) {
342 ret = -EPERM;
343 break;
344 }
345
346 /* Bind this socket to a particular device like "eth0",
347 * as specified in the passed interface name. If the
348 * name is "" or the option length is zero the socket
349 * is not bound.
350 */
351
352 if (!valbool) {
353 sk->bound_dev_if = 0;
354 } else {
355 if (optlen > IFNAMSIZ)
356 optlen = IFNAMSIZ;
357 if (copy_from_user(devname, optval, optlen)) {
358 ret = -EFAULT;
359 break;
360 }
361
362 /* Remove any cached route for this socket. */
363 sk_dst_reset(sk);
364
365 if (devname[0] == '\0') {
366 sk->bound_dev_if = 0;
367 } else {
368 struct net_device *dev = dev_get_by_name(devname);
369 if (!dev) {
370 ret = -ENODEV;
371 break;
372 }
373 sk->bound_dev_if = dev->ifindex;
374 dev_put(dev);
375 }
376 }
377 break;
378 }
379 #endif
380
381
382 #ifdef CONFIG_FILTER
383 case SO_ATTACH_FILTER:
384 ret = -EINVAL;
385 if (optlen == sizeof(struct sock_fprog)) {
386 struct sock_fprog fprog;
387
388 ret = -EFAULT;
389 if (copy_from_user(&fprog, optval, sizeof(fprog)))
390 break;
391
392 ret = sk_attach_filter(&fprog, sk);
393 }
394 break;
395
396 case SO_DETACH_FILTER:
397 spin_lock_bh(&sk->lock.slock);
398 filter = sk->filter;
399 if (filter) {
400 sk->filter = NULL;
401 spin_unlock_bh(&sk->lock.slock);
402 sk_filter_release(sk, filter);
403 break;
404 }
405 spin_unlock_bh(&sk->lock.slock);
406 ret = -ENONET;
407 break;
408 #endif
409 /* We implement the SO_SNDLOWAT etc to
410 not be settable (1003.1g 5.3) */
411 default:
412 ret = -ENOPROTOOPT;
413 break;
414 }
415 release_sock(sk);
416 return ret;
417 }
418
419
420 int sock_getsockopt(struct socket *sock, int level, int optname,
421 char *optval, int *optlen)
422 {
423 struct sock *sk = sock->sk;
424
425 union
426 {
427 int val;
428 struct linger ling;
429 struct timeval tm;
430 } v;
431
432 unsigned int lv=sizeof(int),len;
433
434 if(get_user(len,optlen))
435 return -EFAULT;
436 if(len < 0)
437 return -EINVAL;
438
439 switch(optname)
440 {
441 case SO_DEBUG:
442 v.val = sk->debug;
443 break;
444
445 case SO_DONTROUTE:
446 v.val = sk->localroute;
447 break;
448
449 case SO_BROADCAST:
450 v.val= sk->broadcast;
451 break;
452
453 case SO_SNDBUF:
454 v.val=sk->sndbuf;
455 break;
456
457 case SO_RCVBUF:
458 v.val =sk->rcvbuf;
459 break;
460
461 case SO_REUSEADDR:
462 v.val = sk->reuse;
463 break;
464
465 case SO_KEEPALIVE:
466 v.val = sk->keepopen;
467 break;
468
469 case SO_TYPE:
470 v.val = sk->type;
471 break;
472
473 case SO_ERROR:
474 v.val = -sock_error(sk);
475 if(v.val==0)
476 v.val=xchg(&sk->err_soft,0);
477 break;
478
479 case SO_OOBINLINE:
480 v.val = sk->urginline;
481 break;
482
483 case SO_NO_CHECK:
484 v.val = sk->no_check;
485 break;
486
487 case SO_PRIORITY:
488 v.val = sk->priority;
489 break;
490
491 case SO_LINGER:
492 lv=sizeof(v.ling);
493 v.ling.l_onoff=sk->linger;
494 v.ling.l_linger=sk->lingertime/HZ;
495 break;
496
497 case SO_BSDCOMPAT:
498 v.val = sk->bsdism;
499 break;
500
501 case SO_TIMESTAMP:
502 v.val = sk->rcvtstamp;
503 break;
504
505 case SO_RCVTIMEO:
506 lv=sizeof(struct timeval);
507 if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
508 v.tm.tv_sec = 0;
509 v.tm.tv_usec = 0;
510 } else {
511 v.tm.tv_sec = sk->rcvtimeo/HZ;
512 v.tm.tv_usec = ((sk->rcvtimeo%HZ)*1000)/HZ;
513 }
514 break;
515
516 case SO_SNDTIMEO:
517 lv=sizeof(struct timeval);
518 if (sk->sndtimeo == MAX_SCHEDULE_TIMEOUT) {
519 v.tm.tv_sec = 0;
520 v.tm.tv_usec = 0;
521 } else {
522 v.tm.tv_sec = sk->sndtimeo/HZ;
523 v.tm.tv_usec = ((sk->sndtimeo%HZ)*1000)/HZ;
524 }
525 break;
526
527 case SO_RCVLOWAT:
528 v.val = sk->rcvlowat;
529 break;
530
531 case SO_SNDLOWAT:
532 v.val=1;
533 break;
534
535 case SO_PASSCRED:
536 v.val = sock->passcred;
537 break;
538
539 case SO_PEERCRED:
540 if (len > sizeof(sk->peercred))
541 len = sizeof(sk->peercred);
542 if (copy_to_user(optval, &sk->peercred, len))
543 return -EFAULT;
544 goto lenout;
545
546 case SO_PEERNAME:
547 {
548 char address[128];
549
550 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
551 return -ENOTCONN;
552 if (lv < len)
553 return -EINVAL;
554 if(copy_to_user((void*)optval, address, len))
555 return -EFAULT;
556 goto lenout;
557 }
558
559 /* Dubious BSD thing... Probably nobody even uses it, but
560 * the UNIX standard wants it for whatever reason... -DaveM
561 */
562 case SO_ACCEPTCONN:
563 v.val = (sk->state == TCP_LISTEN);
564 break;
565
566 default:
567 return(-ENOPROTOOPT);
568 }
569 if (len > lv)
570 len = lv;
571 if (copy_to_user(optval, &v, len))
572 return -EFAULT;
573 lenout:
574 if (put_user(len, optlen))
575 return -EFAULT;
576 return 0;
577 }
578
579 static kmem_cache_t *sk_cachep;
580
581 /*
582 * All socket objects are allocated here. This is for future
583 * usage.
584 */
585
586 struct sock *sk_alloc(int family, int priority, int zero_it)
587 {
588 struct sock *sk = kmem_cache_alloc(sk_cachep, priority);
589
590 if(sk && zero_it) {
591 memset(sk, 0, sizeof(struct sock));
592 sk->family = family;
593 sock_lock_init(sk);
594 }
595
596 return sk;
597 }
598
599 void sk_free(struct sock *sk)
600 {
601 #ifdef CONFIG_FILTER
602 struct sk_filter *filter;
603 #endif
604
605 if (sk->destruct)
606 sk->destruct(sk);
607
608 #ifdef CONFIG_FILTER
609 filter = sk->filter;
610 if (filter) {
611 sk_filter_release(sk, filter);
612 sk->filter = NULL;
613 }
614 #endif
615
616 if (atomic_read(&sk->omem_alloc))
617 printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
618
619 kmem_cache_free(sk_cachep, sk);
620 }
621
622 void __init sk_init(void)
623 {
624 sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
625 SLAB_HWCACHE_ALIGN, 0, 0);
626 if (!sk_cachep)
627 printk(KERN_CRIT "sk_init: Cannot create sock SLAB cache!");
628
629 if (num_physpages <= 4096) {
630 sysctl_wmem_max = 32767;
631 sysctl_rmem_max = 32767;
632 sysctl_wmem_default = 32767;
633 sysctl_wmem_default = 32767;
634 } else if (num_physpages >= 131072) {
635 sysctl_wmem_max = 131071;
636 sysctl_rmem_max = 131071;
637 }
638 }
639
640 /*
641 * Simple resource managers for sockets.
642 */
643
644
645 /*
646 * Write buffer destructor automatically called from kfree_skb.
647 */
648 void sock_wfree(struct sk_buff *skb)
649 {
650 struct sock *sk = skb->sk;
651
652 /* In case it might be waiting for more memory. */
653 atomic_sub(skb->truesize, &sk->wmem_alloc);
654 if (!sk->use_write_queue)
655 sk->write_space(sk);
656 sock_put(sk);
657 }
658
659 /*
660 * Read buffer destructor automatically called from kfree_skb.
661 */
662 void sock_rfree(struct sk_buff *skb)
663 {
664 struct sock *sk = skb->sk;
665
666 atomic_sub(skb->truesize, &sk->rmem_alloc);
667 }
668
669 /*
670 * Allocate a skb from the socket's send buffer.
671 */
672 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
673 {
674 if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
675 struct sk_buff * skb = alloc_skb(size, priority);
676 if (skb) {
677 skb_set_owner_w(skb, sk);
678 return skb;
679 }
680 }
681 return NULL;
682 }
683
684 /*
685 * Allocate a skb from the socket's receive buffer.
686 */
687 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
688 {
689 if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) {
690 struct sk_buff *skb = alloc_skb(size, priority);
691 if (skb) {
692 skb_set_owner_r(skb, sk);
693 return skb;
694 }
695 }
696 return NULL;
697 }
698
699 /*
700 * Allocate a memory block from the socket's option memory buffer.
701 */
702 void *sock_kmalloc(struct sock *sk, int size, int priority)
703 {
704 if ((unsigned)size <= sysctl_optmem_max &&
705 atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {
706 void *mem;
707 /* First do the add, to avoid the race if kmalloc
708 * might sleep.
709 */
710 atomic_add(size, &sk->omem_alloc);
711 mem = kmalloc(size, priority);
712 if (mem)
713 return mem;
714 atomic_sub(size, &sk->omem_alloc);
715 }
716 return NULL;
717 }
718
719 /*
720 * Free an option memory block.
721 */
722 void sock_kfree_s(struct sock *sk, void *mem, int size)
723 {
724 kfree(mem);
725 atomic_sub(size, &sk->omem_alloc);
726 }
727
728 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
729 I think, these locks should be removed for datagram sockets.
730 */
731 static long sock_wait_for_wmem(struct sock * sk, long timeo)
732 {
733 DECLARE_WAITQUEUE(wait, current);
734
735 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
736 add_wait_queue(sk->sleep, &wait);
737 for (;;) {
738 if (!timeo)
739 break;
740 if (signal_pending(current))
741 break;
742 set_bit(SOCK_NOSPACE, &sk->socket->flags);
743 set_current_state(TASK_INTERRUPTIBLE);
744 if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)
745 break;
746 if (sk->shutdown & SEND_SHUTDOWN)
747 break;
748 if (sk->err)
749 break;
750 timeo = schedule_timeout(timeo);
751 }
752 __set_current_state(TASK_RUNNING);
753 remove_wait_queue(sk->sleep, &wait);
754 return timeo;
755 }
756
757
758 /*
759 * Generic send/receive buffer handlers
760 */
761
762 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
763 int noblock, int *errcode)
764 {
765 int err;
766 struct sk_buff *skb;
767 long timeo;
768
769 timeo = sock_sndtimeo(sk, noblock);
770
771 while (1) {
772 unsigned long try_size = size;
773
774 err = sock_error(sk);
775 if (err != 0)
776 goto failure;
777
778 /*
779 * We should send SIGPIPE in these cases according to
780 * 1003.1g draft 6.4. If we (the user) did a shutdown()
781 * call however we should not.
782 *
783 * Note: This routine isnt just used for datagrams and
784 * anyway some datagram protocols have a notion of
785 * close down.
786 */
787
788 err = -EPIPE;
789 if (sk->shutdown&SEND_SHUTDOWN)
790 goto failure;
791
792 if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
793 skb = alloc_skb(try_size, sk->allocation);
794 if (skb)
795 break;
796 err = -ENOBUFS;
797 goto failure;
798 }
799
800 /*
801 * This means we have too many buffers for this socket already.
802 */
803
804 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
805 set_bit(SOCK_NOSPACE, &sk->socket->flags);
806 err = -EAGAIN;
807 if (!timeo)
808 goto failure;
809 if (signal_pending(current))
810 goto interrupted;
811 timeo = sock_wait_for_wmem(sk, timeo);
812 }
813
814 skb_set_owner_w(skb, sk);
815 return skb;
816
817 interrupted:
818 err = sock_intr_errno(timeo);
819 failure:
820 *errcode = err;
821 return NULL;
822 }
823
824 void __lock_sock(struct sock *sk)
825 {
826 DECLARE_WAITQUEUE(wait, current);
827
828 add_wait_queue_exclusive(&sk->lock.wq, &wait);
829 for(;;) {
830 current->state = TASK_UNINTERRUPTIBLE;
831 spin_unlock_bh(&sk->lock.slock);
832 schedule();
833 spin_lock_bh(&sk->lock.slock);
834 if(!sk->lock.users)
835 break;
836 }
837 current->state = TASK_RUNNING;
838 remove_wait_queue(&sk->lock.wq, &wait);
839 }
840
841 void __release_sock(struct sock *sk)
842 {
843 struct sk_buff *skb = sk->backlog.head;
844
845 do {
846 sk->backlog.head = sk->backlog.tail = NULL;
847 bh_unlock_sock(sk);
848
849 do {
850 struct sk_buff *next = skb->next;
851
852 skb->next = NULL;
853 sk->backlog_rcv(sk, skb);
854 skb = next;
855 } while (skb != NULL);
856
857 bh_lock_sock(sk);
858 } while((skb = sk->backlog.head) != NULL);
859 }
860
861 /*
862 * Generic socket manager library. Most simpler socket families
863 * use this to manage their socket lists. At some point we should
864 * hash these. By making this generic we get the lot hashed for free.
865 *
866 * It is broken by design. All the protocols using it must be fixed. --ANK
867 */
868
869 rwlock_t net_big_sklist_lock = RW_LOCK_UNLOCKED;
870
871 void sklist_remove_socket(struct sock **list, struct sock *sk)
872 {
873 struct sock *s;
874
875 write_lock_bh(&net_big_sklist_lock);
876
877 while ((s = *list) != NULL) {
878 if (s == sk) {
879 *list = s->next;
880 break;
881 }
882 list = &s->next;
883 }
884
885 write_unlock_bh(&net_big_sklist_lock);
886 if (s)
887 sock_put(s);
888 }
889
890 void sklist_insert_socket(struct sock **list, struct sock *sk)
891 {
892 write_lock_bh(&net_big_sklist_lock);
893 sk->next= *list;
894 *list=sk;
895 sock_hold(sk);
896 write_unlock_bh(&net_big_sklist_lock);
897 }
898
899 /*
900 * This is only called from user mode. Thus it protects itself against
901 * interrupt users but doesn't worry about being called during work.
902 * Once it is removed from the queue no interrupt or bottom half will
903 * touch it and we are (fairly 8-) ) safe.
904 */
905
906 void sklist_destroy_socket(struct sock **list, struct sock *sk);
907
908 /*
909 * Handler for deferred kills.
910 */
911
912 static void sklist_destroy_timer(unsigned long data)
913 {
914 struct sock *sk=(struct sock *)data;
915 sklist_destroy_socket(NULL,sk);
916 }
917
918 /*
919 * Destroy a socket. We pass NULL for a list if we know the
920 * socket is not on a list.
921 */
922
923 void sklist_destroy_socket(struct sock **list,struct sock *sk)
924 {
925 if(list)
926 sklist_remove_socket(list, sk);
927
928 skb_queue_purge(&sk->receive_queue);
929
930 if(atomic_read(&sk->wmem_alloc) == 0 &&
931 atomic_read(&sk->rmem_alloc) == 0 &&
932 sk->dead)
933 {
934 sock_put(sk);
935 }
936 else
937 {
938 /*
939 * Someone is using our buffers still.. defer
940 */
941 init_timer(&sk->timer);
942 sk->timer.expires=jiffies+SOCK_DESTROY_TIME;
943 sk->timer.function=sklist_destroy_timer;
944 sk->timer.data = (unsigned long)sk;
945 add_timer(&sk->timer);
946 }
947 }
948
949 /*
950 * Set of default routines for initialising struct proto_ops when
951 * the protocol does not support a particular function. In certain
952 * cases where it makes no sense for a protocol to have a "do nothing"
953 * function, some default processing is provided.
954 */
955
/* Default release operation: does nothing and reports success. */
int sock_no_release(struct socket *sock)
{
	return 0;
}
960
/* Default bind operation: always fails with -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
965
/* Default connect operation: always fails with -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
971
/* Default socketpair operation: always fails with -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
976
/* Default accept operation: always fails with -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
981
/* Default getname operation: always fails with -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
987
988 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
989 {
990 return 0;
991 }
992
/* Default ioctl operation: always fails with -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
997
/* Default listen operation: always fails with -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1002
/* Default shutdown operation: always fails with -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1007
/* Default setsockopt operation: always fails with -EOPNOTSUPP. */
int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char *optval, int optlen)
{
	return -EOPNOTSUPP;
}
1013
/* Default getsockopt operation: always fails with -EOPNOTSUPP. */
int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char *optval, int *optlen)
{
	return -EOPNOTSUPP;
}
1019
1020 /*
1021 * Note: if you add something that sleeps here then change sock_fcntl()
1022 * to do proper fd locking.
1023 */
1024 int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
1025 {
1026 struct sock *sk = sock->sk;
1027
1028 switch(cmd)
1029 {
1030 case F_SETOWN:
1031 /*
1032 * This is a little restrictive, but it's the only
1033 * way to make sure that you can't send a sigurg to
1034 * another process.
1035 */
1036 if (current->pgrp != -arg &&
1037 current->pid != arg &&
1038 !capable(CAP_KILL)) return(-EPERM);
1039 sk->proc = arg;
1040 return(0);
1041 case F_GETOWN:
1042 return(sk->proc);
1043 default:
1044 return(-EINVAL);
1045 }
1046 }
1047
/* Default sendmsg operation: always fails with -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags,
		    struct scm_cookie *scm)
{
	return -EOPNOTSUPP;
}
1053
/* Default recvmsg operation: always fails with -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int len, int flags,
		    struct scm_cookie *scm)
{
	return -EOPNOTSUPP;
}
1059
/*
 * Default mmap operation: always fails with -ENODEV, mirroring the
 * error code of a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
1065
1066 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1067 {
1068 ssize_t res;
1069 struct msghdr msg;
1070 struct iovec iov;
1071 mm_segment_t old_fs;
1072 char *kaddr;
1073
1074 kaddr = kmap(page);
1075
1076 msg.msg_name = NULL;
1077 msg.msg_namelen = 0;
1078 msg.msg_iov = &iov;
1079 msg.msg_iovlen = 1;
1080 msg.msg_control = NULL;
1081 msg.msg_controllen = 0;
1082 msg.msg_flags = flags;
1083
1084 iov.iov_base = kaddr + offset;
1085 iov.iov_len = size;
1086
1087 old_fs = get_fs();
1088 set_fs(KERNEL_DS);
1089 res = sock_sendmsg(sock, &msg, size);
1090 set_fs(old_fs);
1091
1092 kunmap(page);
1093 return res;
1094 }
1095
1096 /*
1097 * Default Socket Callbacks
1098 */
1099
1100 void sock_def_wakeup(struct sock *sk)
1101 {
1102 read_lock(&sk->callback_lock);
1103 if (sk->sleep && waitqueue_active(sk->sleep))
1104 wake_up_interruptible_all(sk->sleep);
1105 read_unlock(&sk->callback_lock);
1106 }
1107
1108 void sock_def_error_report(struct sock *sk)
1109 {
1110 read_lock(&sk->callback_lock);
1111 if (sk->sleep && waitqueue_active(sk->sleep))
1112 wake_up_interruptible(sk->sleep);
1113 sk_wake_async(sk,0,POLL_ERR);
1114 read_unlock(&sk->callback_lock);
1115 }
1116
1117 void sock_def_readable(struct sock *sk, int len)
1118 {
1119 read_lock(&sk->callback_lock);
1120 if (sk->sleep && waitqueue_active(sk->sleep))
1121 wake_up_interruptible(sk->sleep);
1122 sk_wake_async(sk,1,POLL_IN);
1123 read_unlock(&sk->callback_lock);
1124 }
1125
1126 void sock_def_write_space(struct sock *sk)
1127 {
1128 read_lock(&sk->callback_lock);
1129
1130 /* Do not wake up a writer until he can make "significant"
1131 * progress. --DaveM
1132 */
1133 if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) {
1134 if (sk->sleep && waitqueue_active(sk->sleep))
1135 wake_up_interruptible(sk->sleep);
1136
1137 /* Should agree with poll, otherwise some programs break */
1138 if (sock_writeable(sk))
1139 sk_wake_async(sk, 2, POLL_OUT);
1140 }
1141
1142 read_unlock(&sk->callback_lock);
1143 }
1144
1145 void sock_def_destruct(struct sock *sk)
1146 {
1147 if (sk->protinfo.destruct_hook)
1148 kfree(sk->protinfo.destruct_hook);
1149 }
1150
1151 void sock_init_data(struct socket *sock, struct sock *sk)
1152 {
1153 skb_queue_head_init(&sk->receive_queue);
1154 skb_queue_head_init(&sk->write_queue);
1155 skb_queue_head_init(&sk->error_queue);
1156
1157 init_timer(&sk->timer);
1158
1159 sk->allocation = GFP_KERNEL;
1160 sk->rcvbuf = sysctl_rmem_default;
1161 sk->sndbuf = sysctl_wmem_default;
1162 sk->state = TCP_CLOSE;
1163 sk->zapped = 1;
1164 sk->socket = sock;
1165
1166 if(sock)
1167 {
1168 sk->type = sock->type;
1169 sk->sleep = &sock->wait;
1170 sock->sk = sk;
1171 } else
1172 sk->sleep = NULL;
1173
1174 sk->dst_lock = RW_LOCK_UNLOCKED;
1175 sk->callback_lock = RW_LOCK_UNLOCKED;
1176
1177 sk->state_change = sock_def_wakeup;
1178 sk->data_ready = sock_def_readable;
1179 sk->write_space = sock_def_write_space;
1180 sk->error_report = sock_def_error_report;
1181 sk->destruct = sock_def_destruct;
1182
1183 sk->peercred.pid = 0;
1184 sk->peercred.uid = -1;
1185 sk->peercred.gid = -1;
1186 sk->rcvlowat = 1;
1187 sk->rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1188 sk->sndtimeo = MAX_SCHEDULE_TIMEOUT;
1189
1190 atomic_set(&sk->refcnt, 1);
1191 }
1192