File: /usr/src/linux/net/core/sock.c

1     /*
2      * INET		An implementation of the TCP/IP protocol suite for the LINUX
3      *		operating system.  INET is implemented using the  BSD Socket
4      *		interface as the means of communication with the user level.
5      *
6      *		Generic socket support routines. Memory allocators, socket lock/release
7      *		handler for protocols to use and generic option handler.
8      *
9      *
10      * Version:	$Id: sock.c,v 1.112 2001/07/27 09:54:48 davem Exp $
11      *
12      * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
13      *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14      *		Florian La Roche, <flla@stud.uni-sb.de>
15      *		Alan Cox, <A.Cox@swansea.ac.uk>
16      *
17      * Fixes:
18      *		Alan Cox	: 	Numerous verify_area() problems
19      *		Alan Cox	:	Connecting on a connecting socket
20      *					now returns an error for tcp.
21      *		Alan Cox	:	sock->protocol is set correctly.
22      *					and is not sometimes left as 0.
23      *		Alan Cox	:	connect handles icmp errors on a
24      *					connect properly. Unfortunately there
25      *					is a restart syscall nasty there. I
26      *					can't match BSD without hacking the C
27      *					library. Ideas urgently sought!
28      *		Alan Cox	:	Disallow bind() to addresses that are
29      *					not ours - especially broadcast ones!!
30      *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31      *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32      *					instead they leave that for the DESTROY timer.
33      *		Alan Cox	:	Clean up error flag in accept
34      *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35      *					was buggy. Put a remove_sock() in the handler
36      *					for memory when we hit 0. Also altered the timer
37      *					code. The ACK stuff can wait and needs major 
38      *					TCP layer surgery.
39      *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40      *					and fixed timer/inet_bh race.
41      *		Alan Cox	:	Added zapped flag for TCP
42      *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43      *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44      *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45      *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46      *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47      *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48      *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49      *	Pauline Middelink	:	identd support
50      *		Alan Cox	:	Fixed connect() taking signals I think.
51      *		Alan Cox	:	SO_LINGER supported
52      *		Alan Cox	:	Error reporting fixes
53      *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54      *		Alan Cox	:	inet sockets don't set sk->type!
55      *		Alan Cox	:	Split socket option code
56      *		Alan Cox	:	Callbacks
57      *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58      *		Alex		:	Removed restriction on inet fioctl
59      *		Alan Cox	:	Splitting INET from NET core
60      *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61      *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62      *		Alan Cox	:	Split IP from generic code
63      *		Alan Cox	:	New kfree_skbmem()
64      *		Alan Cox	:	Make SO_DEBUG superuser only.
65      *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66      *					(compatibility fix)
67      *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68      *		Alan Cox	:	Allocator for a socket is settable.
69      *		Alan Cox	:	SO_ERROR includes soft errors.
70      *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71      *		Alan Cox	: 	Generic socket allocation to make hooks
72      *					easier (suggested by Craig Metz).
73      *		Michael Pall	:	SO_ERROR returns positive errno again
74      *              Steve Whitehouse:       Added default destructor to free
75      *                                      protocol private data.
76      *              Steve Whitehouse:       Added various other default routines
77      *                                      common to several socket families.
78      *              Chris Evans     :       Call suser() check last on F_SETOWN
79      *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80      *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81      *		Andi Kleen	:	Fix write_space callback
82      *		Chris Evans	:	Security fixes - signedness again
83      *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84      *
85      * To Fix:
86      *
87      *
88      *		This program is free software; you can redistribute it and/or
89      *		modify it under the terms of the GNU General Public License
90      *		as published by the Free Software Foundation; either version
91      *		2 of the License, or (at your option) any later version.
92      */
93     
94     #include <linux/config.h>
95     #include <linux/errno.h>
96     #include <linux/types.h>
97     #include <linux/socket.h>
98     #include <linux/in.h>
99     #include <linux/kernel.h>
100     #include <linux/major.h>
101     #include <linux/sched.h>
102     #include <linux/timer.h>
103     #include <linux/string.h>
104     #include <linux/sockios.h>
105     #include <linux/net.h>
106     #include <linux/fcntl.h>
107     #include <linux/mm.h>
108     #include <linux/slab.h>
109     #include <linux/interrupt.h>
110     #include <linux/poll.h>
111     #include <linux/init.h>
112     
113     #include <asm/uaccess.h>
114     #include <asm/system.h>
115     
116     #include <linux/inet.h>
117     #include <linux/netdevice.h>
118     #include <net/ip.h>
119     #include <net/protocol.h>
120     #include <net/arp.h>
121     #include <net/route.h>
122     #include <net/tcp.h>
123     #include <net/udp.h>
124     #include <linux/skbuff.h>
125     #include <net/sock.h>
126     #include <net/raw.h>
127     #include <net/icmp.h>
128     #include <linux/ipsec.h>
129     
130     #ifdef CONFIG_FILTER
131     #include <linux/filter.h>
132     #endif
133     
/*
 * Run time adjustable parameters.
 *
 * sysctl_wmem_max / sysctl_rmem_max are the ceilings applied to the
 * values passed to SO_SNDBUF / SO_RCVBUF in sock_setsockopt().
 * The *_default values are only initialised and tuned in this file
 * (see sk_init()); their consumers live elsewhere.
 */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/*
 * Maximal space eaten by iovec or ancillary data plus some space.
 * sock_kmalloc() refuses requests that would push a socket's
 * omem_alloc counter up to this limit.
 */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
142     
143     static int sock_set_timeout(long *timeo_p, char *optval, int optlen)
144     {
145     	struct timeval tv;
146     
147     	if (optlen < sizeof(tv))
148     		return -EINVAL;
149     	if (copy_from_user(&tv, optval, sizeof(tv)))
150     		return -EFAULT;
151     
152     	*timeo_p = MAX_SCHEDULE_TIMEOUT;
153     	if (tv.tv_sec == 0 && tv.tv_usec == 0)
154     		return 0;
155     	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
156     		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
157     	return 0;
158     }
159     
160     /*
161      *	This is meant for all protocols to use and covers goings on
162      *	at the socket level. Everything here is generic.
163      */
164     
165     int sock_setsockopt(struct socket *sock, int level, int optname,
166     		    char *optval, int optlen)
167     {
168     	struct sock *sk=sock->sk;
169     #ifdef CONFIG_FILTER
170     	struct sk_filter *filter;
171     #endif
172     	int val;
173     	int valbool;
174     	struct linger ling;
175     	int ret = 0;
176     	
177     	/*
178     	 *	Options without arguments
179     	 */
180     
181     #ifdef SO_DONTLINGER		/* Compatibility item... */
182     	switch(optname)
183     	{
184     		case SO_DONTLINGER:
185     			sk->linger=0;
186     			return 0;
187     	}
188     #endif	
189     		
190       	if(optlen<sizeof(int))
191       		return(-EINVAL);
192       	
193     	if (get_user(val, (int *)optval))
194     		return -EFAULT;
195     	
196       	valbool = val?1:0;
197     
198     	lock_sock(sk);
199     
200       	switch(optname) 
201       	{
202     		case SO_DEBUG:	
203     			if(val && !capable(CAP_NET_ADMIN))
204     			{
205     				ret = -EACCES;
206     			}
207     			else
208     				sk->debug=valbool;
209     			break;
210     		case SO_REUSEADDR:
211     			sk->reuse = valbool;
212     			break;
213     		case SO_TYPE:
214     		case SO_ERROR:
215     			ret = -ENOPROTOOPT;
216     		  	break;
217     		case SO_DONTROUTE:
218     			sk->localroute=valbool;
219     			break;
220     		case SO_BROADCAST:
221     			sk->broadcast=valbool;
222     			break;
223     		case SO_SNDBUF:
224     			/* Don't error on this BSD doesn't and if you think
225     			   about it this is right. Otherwise apps have to
226     			   play 'guess the biggest size' games. RCVBUF/SNDBUF
227     			   are treated in BSD as hints */
228     			   
229     			if (val > sysctl_wmem_max)
230     				val = sysctl_wmem_max;
231     
232     			sk->userlocks |= SOCK_SNDBUF_LOCK;
233     			if ((val * 2) < SOCK_MIN_SNDBUF)
234     				sk->sndbuf = SOCK_MIN_SNDBUF;
235     			else
236     				sk->sndbuf = (val * 2);
237     
238     			/*
239     			 *	Wake up sending tasks if we
240     			 *	upped the value.
241     			 */
242     			sk->write_space(sk);
243     			break;
244     
245     		case SO_RCVBUF:
246     			/* Don't error on this BSD doesn't and if you think
247     			   about it this is right. Otherwise apps have to
248     			   play 'guess the biggest size' games. RCVBUF/SNDBUF
249     			   are treated in BSD as hints */
250     			  
251     			if (val > sysctl_rmem_max)
252     				val = sysctl_rmem_max;
253     
254     			sk->userlocks |= SOCK_RCVBUF_LOCK;
255     			/* FIXME: is this lower bound the right one? */
256     			if ((val * 2) < SOCK_MIN_RCVBUF)
257     				sk->rcvbuf = SOCK_MIN_RCVBUF;
258     			else
259     				sk->rcvbuf = (val * 2);
260     			break;
261     
262     		case SO_KEEPALIVE:
263     #ifdef CONFIG_INET
264     			if (sk->protocol == IPPROTO_TCP)
265     			{
266     				tcp_set_keepalive(sk, valbool);
267     			}
268     #endif
269     			sk->keepopen = valbool;
270     			break;
271     
272     	 	case SO_OOBINLINE:
273     			sk->urginline = valbool;
274     			break;
275     
276     	 	case SO_NO_CHECK:
277     			sk->no_check = valbool;
278     			break;
279     
280     		case SO_PRIORITY:
281     			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
282     				sk->priority = val;
283     			else
284     				ret = -EPERM;
285     			break;
286     
287     		case SO_LINGER:
288     			if(optlen<sizeof(ling)) {
289     				ret = -EINVAL;	/* 1003.1g */
290     				break;
291     			}
292     			if (copy_from_user(&ling,optval,sizeof(ling))) {
293     				ret = -EFAULT;
294     				break;
295     			}
296     			if(ling.l_onoff==0) {
297     				sk->linger=0;
298     			} else {
299     #if (BITS_PER_LONG == 32)
300     				if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
301     					sk->lingertime=MAX_SCHEDULE_TIMEOUT;
302     				else
303     #endif
304     					sk->lingertime=ling.l_linger*HZ;
305     				sk->linger=1;
306     			}
307     			break;
308     
309     		case SO_BSDCOMPAT:
310     			sk->bsdism = valbool;
311     			break;
312     
313     		case SO_PASSCRED:
314     			sock->passcred = valbool;
315     			break;
316     
317     		case SO_TIMESTAMP:
318     			sk->rcvtstamp = valbool;
319     			break;
320     
321     		case SO_RCVLOWAT:
322     			if (val < 0)
323     				val = INT_MAX;
324     			sk->rcvlowat = val ? : 1;
325     			break;
326     
327     		case SO_RCVTIMEO:
328     			ret = sock_set_timeout(&sk->rcvtimeo, optval, optlen);
329     			break;
330     
331     		case SO_SNDTIMEO:
332     			ret = sock_set_timeout(&sk->sndtimeo, optval, optlen);
333     			break;
334     
335     #ifdef CONFIG_NETDEVICES
336     		case SO_BINDTODEVICE:
337     		{
338     			char devname[IFNAMSIZ]; 
339     
340     			/* Sorry... */ 
341     			if (!capable(CAP_NET_RAW)) {
342     				ret = -EPERM;
343     				break;
344     			}
345     
346     			/* Bind this socket to a particular device like "eth0",
347     			 * as specified in the passed interface name. If the
348     			 * name is "" or the option length is zero the socket 
349     			 * is not bound. 
350     			 */ 
351     
352     			if (!valbool) {
353     				sk->bound_dev_if = 0;
354     			} else {
355     				if (optlen > IFNAMSIZ) 
356     					optlen = IFNAMSIZ; 
357     				if (copy_from_user(devname, optval, optlen)) {
358     					ret = -EFAULT;
359     					break;
360     				}
361     
362     				/* Remove any cached route for this socket. */
363     				sk_dst_reset(sk);
364     
365     				if (devname[0] == '\0') {
366     					sk->bound_dev_if = 0;
367     				} else {
368     					struct net_device *dev = dev_get_by_name(devname);
369     					if (!dev) {
370     						ret = -ENODEV;
371     						break;
372     					}
373     					sk->bound_dev_if = dev->ifindex;
374     					dev_put(dev);
375     				}
376     			}
377     			break;
378     		}
379     #endif
380     
381     
382     #ifdef CONFIG_FILTER
383     		case SO_ATTACH_FILTER:
384     			ret = -EINVAL;
385     			if (optlen == sizeof(struct sock_fprog)) {
386     				struct sock_fprog fprog;
387     
388     				ret = -EFAULT;
389     				if (copy_from_user(&fprog, optval, sizeof(fprog)))
390     					break;
391     
392     				ret = sk_attach_filter(&fprog, sk);
393     			}
394     			break;
395     
396     		case SO_DETACH_FILTER:
397     			spin_lock_bh(&sk->lock.slock);
398     			filter = sk->filter;
399                             if (filter) {
400     				sk->filter = NULL;
401     				spin_unlock_bh(&sk->lock.slock);
402     				sk_filter_release(sk, filter);
403     				break;
404     			}
405     			spin_unlock_bh(&sk->lock.slock);
406     			ret = -ENONET;
407     			break;
408     #endif
409     		/* We implement the SO_SNDLOWAT etc to
410     		   not be settable (1003.1g 5.3) */
411     		default:
412     		  	ret = -ENOPROTOOPT;
413     			break;
414       	}
415     	release_sock(sk);
416     	return ret;
417     }
418     
419     
420     int sock_getsockopt(struct socket *sock, int level, int optname,
421     		    char *optval, int *optlen)
422     {
423     	struct sock *sk = sock->sk;
424     	
425     	union
426     	{
427       		int val;
428       		struct linger ling;
429     		struct timeval tm;
430     	} v;
431     	
432     	unsigned int lv=sizeof(int),len;
433       	
434       	if(get_user(len,optlen))
435       		return -EFAULT;
436     	if(len < 0)
437     		return -EINVAL;
438     		
439       	switch(optname) 
440       	{
441     		case SO_DEBUG:		
442     			v.val = sk->debug;
443     			break;
444     		
445     		case SO_DONTROUTE:
446     			v.val = sk->localroute;
447     			break;
448     		
449     		case SO_BROADCAST:
450     			v.val= sk->broadcast;
451     			break;
452     
453     		case SO_SNDBUF:
454     			v.val=sk->sndbuf;
455     			break;
456     		
457     		case SO_RCVBUF:
458     			v.val =sk->rcvbuf;
459     			break;
460     
461     		case SO_REUSEADDR:
462     			v.val = sk->reuse;
463     			break;
464     
465     		case SO_KEEPALIVE:
466     			v.val = sk->keepopen;
467     			break;
468     
469     		case SO_TYPE:
470     			v.val = sk->type;		  		
471     			break;
472     
473     		case SO_ERROR:
474     			v.val = -sock_error(sk);
475     			if(v.val==0)
476     				v.val=xchg(&sk->err_soft,0);
477     			break;
478     
479     		case SO_OOBINLINE:
480     			v.val = sk->urginline;
481     			break;
482     	
483     		case SO_NO_CHECK:
484     			v.val = sk->no_check;
485     			break;
486     
487     		case SO_PRIORITY:
488     			v.val = sk->priority;
489     			break;
490     		
491     		case SO_LINGER:	
492     			lv=sizeof(v.ling);
493     			v.ling.l_onoff=sk->linger;
494      			v.ling.l_linger=sk->lingertime/HZ;
495     			break;
496     					
497     		case SO_BSDCOMPAT:
498     			v.val = sk->bsdism;
499     			break;
500     
501     		case SO_TIMESTAMP:
502     			v.val = sk->rcvtstamp;
503     			break;
504     
505     		case SO_RCVTIMEO:
506     			lv=sizeof(struct timeval);
507     			if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
508     				v.tm.tv_sec = 0;
509     				v.tm.tv_usec = 0;
510     			} else {
511     				v.tm.tv_sec = sk->rcvtimeo/HZ;
512     				v.tm.tv_usec = ((sk->rcvtimeo%HZ)*1000)/HZ;
513     			}
514     			break;
515     
516     		case SO_SNDTIMEO:
517     			lv=sizeof(struct timeval);
518     			if (sk->sndtimeo == MAX_SCHEDULE_TIMEOUT) {
519     				v.tm.tv_sec = 0;
520     				v.tm.tv_usec = 0;
521     			} else {
522     				v.tm.tv_sec = sk->sndtimeo/HZ;
523     				v.tm.tv_usec = ((sk->sndtimeo%HZ)*1000)/HZ;
524     			}
525     			break;
526     
527     		case SO_RCVLOWAT:
528     			v.val = sk->rcvlowat;
529     			break;
530     
531     		case SO_SNDLOWAT:
532     			v.val=1;
533     			break; 
534     
535     		case SO_PASSCRED:
536     			v.val = sock->passcred;
537     			break;
538     
539     		case SO_PEERCRED:
540     			if (len > sizeof(sk->peercred))
541     				len = sizeof(sk->peercred);
542     			if (copy_to_user(optval, &sk->peercred, len))
543     				return -EFAULT;
544     			goto lenout;
545     
546     		case SO_PEERNAME:
547     		{
548     			char address[128];
549     
550     			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
551     				return -ENOTCONN;
552     			if (lv < len)
553     				return -EINVAL;
554     			if(copy_to_user((void*)optval, address, len))
555     				return -EFAULT;
556     			goto lenout;
557     		}
558     
559     		/* Dubious BSD thing... Probably nobody even uses it, but
560     		 * the UNIX standard wants it for whatever reason... -DaveM
561     		 */
562     		case SO_ACCEPTCONN:
563     			v.val = (sk->state == TCP_LISTEN);
564     			break;
565     
566     		default:
567     			return(-ENOPROTOOPT);
568     	}
569     	if (len > lv)
570     		len = lv;
571     	if (copy_to_user(optval, &v, len))
572     		return -EFAULT;
573     lenout:
574       	if (put_user(len, optlen))
575       		return -EFAULT;
576       	return 0;
577     }
578     
579     static kmem_cache_t *sk_cachep;
580     
581     /*
582      *	All socket objects are allocated here. This is for future
583      *	usage.
584      */
585      
586     struct sock *sk_alloc(int family, int priority, int zero_it)
587     {
588     	struct sock *sk = kmem_cache_alloc(sk_cachep, priority);
589     
590     	if(sk && zero_it) {
591     		memset(sk, 0, sizeof(struct sock));
592     		sk->family = family;
593     		sock_lock_init(sk);
594     	}
595     
596     	return sk;
597     }
598     
599     void sk_free(struct sock *sk)
600     {
601     #ifdef CONFIG_FILTER
602     	struct sk_filter *filter;
603     #endif
604     
605     	if (sk->destruct)
606     		sk->destruct(sk);
607     
608     #ifdef CONFIG_FILTER
609     	filter = sk->filter;
610     	if (filter) {
611     		sk_filter_release(sk, filter);
612     		sk->filter = NULL;
613     	}
614     #endif
615     
616     	if (atomic_read(&sk->omem_alloc))
617     		printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
618     
619     	kmem_cache_free(sk_cachep, sk);
620     }
621     
622     void __init sk_init(void)
623     {
624     	sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
625     				      SLAB_HWCACHE_ALIGN, 0, 0);
626     	if (!sk_cachep)
627     		printk(KERN_CRIT "sk_init: Cannot create sock SLAB cache!");
628     
629     	if (num_physpages <= 4096) {
630     		sysctl_wmem_max = 32767;
631     		sysctl_rmem_max = 32767;
632     		sysctl_wmem_default = 32767;
633     		sysctl_wmem_default = 32767;
634     	} else if (num_physpages >= 131072) {
635     		sysctl_wmem_max = 131071;
636     		sysctl_rmem_max = 131071;
637     	}
638     }
639     
640     /*
641      *	Simple resource managers for sockets.
642      */
643     
644     
645     /* 
646      * Write buffer destructor automatically called from kfree_skb. 
647      */
648     void sock_wfree(struct sk_buff *skb)
649     {
650     	struct sock *sk = skb->sk;
651     
652     	/* In case it might be waiting for more memory. */
653     	atomic_sub(skb->truesize, &sk->wmem_alloc);
654     	if (!sk->use_write_queue)
655     		sk->write_space(sk);
656     	sock_put(sk);
657     }
658     
659     /* 
660      * Read buffer destructor automatically called from kfree_skb. 
661      */
662     void sock_rfree(struct sk_buff *skb)
663     {
664     	struct sock *sk = skb->sk;
665     
666     	atomic_sub(skb->truesize, &sk->rmem_alloc);
667     }
668     
669     /*
670      * Allocate a skb from the socket's send buffer.
671      */
672     struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
673     {
674     	if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
675     		struct sk_buff * skb = alloc_skb(size, priority);
676     		if (skb) {
677     			skb_set_owner_w(skb, sk);
678     			return skb;
679     		}
680     	}
681     	return NULL;
682     }
683     
684     /*
685      * Allocate a skb from the socket's receive buffer.
686      */ 
687     struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
688     {
689     	if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) {
690     		struct sk_buff *skb = alloc_skb(size, priority);
691     		if (skb) {
692     			skb_set_owner_r(skb, sk);
693     			return skb;
694     		}
695     	}
696     	return NULL;
697     }
698     
699     /* 
700      * Allocate a memory block from the socket's option memory buffer.
701      */ 
702     void *sock_kmalloc(struct sock *sk, int size, int priority)
703     {
704     	if ((unsigned)size <= sysctl_optmem_max &&
705     	    atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {
706     		void *mem;
707     		/* First do the add, to avoid the race if kmalloc
708      		 * might sleep.
709     		 */
710     		atomic_add(size, &sk->omem_alloc);
711     		mem = kmalloc(size, priority);
712     		if (mem)
713     			return mem;
714     		atomic_sub(size, &sk->omem_alloc);
715     	}
716     	return NULL;
717     }
718     
719     /*
720      * Free an option memory block.
721      */
722     void sock_kfree_s(struct sock *sk, void *mem, int size)
723     {
724     	kfree(mem);
725     	atomic_sub(size, &sk->omem_alloc);
726     }
727     
728     /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
729        I think, these locks should be removed for datagram sockets.
730      */
731     static long sock_wait_for_wmem(struct sock * sk, long timeo)
732     {
733     	DECLARE_WAITQUEUE(wait, current);
734     
735     	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
736     	add_wait_queue(sk->sleep, &wait);
737     	for (;;) {
738     		if (!timeo)
739     			break;
740     		if (signal_pending(current))
741     			break;
742     		set_bit(SOCK_NOSPACE, &sk->socket->flags);
743     		set_current_state(TASK_INTERRUPTIBLE);
744     		if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)
745     			break;
746     		if (sk->shutdown & SEND_SHUTDOWN)
747     			break;
748     		if (sk->err)
749     			break;
750     		timeo = schedule_timeout(timeo);
751     	}
752     	__set_current_state(TASK_RUNNING);
753     	remove_wait_queue(sk->sleep, &wait);
754     	return timeo;
755     }
756     
757     
758     /*
759      *	Generic send/receive buffer handlers
760      */
761     
762     struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 
763     			int noblock, int *errcode)
764     {
765     	int err;
766     	struct sk_buff *skb;
767     	long timeo;
768     
769     	timeo = sock_sndtimeo(sk, noblock);
770     
771     	while (1) {
772     		unsigned long try_size = size;
773     
774     		err = sock_error(sk);
775     		if (err != 0)
776     			goto failure;
777     
778     		/*
779     		 *	We should send SIGPIPE in these cases according to
780     		 *	1003.1g draft 6.4. If we (the user) did a shutdown()
781     		 *	call however we should not. 
782     		 *
783     		 *	Note: This routine isnt just used for datagrams and
784     		 *	anyway some datagram protocols have a notion of
785     		 *	close down.
786     		 */
787     
788     		err = -EPIPE;
789     		if (sk->shutdown&SEND_SHUTDOWN)
790     			goto failure;
791     
792     		if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
793     			skb = alloc_skb(try_size, sk->allocation);
794     			if (skb)
795     				break;
796     			err = -ENOBUFS;
797     			goto failure;
798     		}
799     
800     		/*
801     		 *	This means we have too many buffers for this socket already.
802     		 */
803     
804     		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
805     		set_bit(SOCK_NOSPACE, &sk->socket->flags);
806     		err = -EAGAIN;
807     		if (!timeo)
808     			goto failure;
809     		if (signal_pending(current))
810     			goto interrupted;
811     		timeo = sock_wait_for_wmem(sk, timeo);
812     	}
813     
814     	skb_set_owner_w(skb, sk);
815     	return skb;
816     
817     interrupted:
818     	err = sock_intr_errno(timeo);
819     failure:
820     	*errcode = err;
821     	return NULL;
822     }
823     
/*
 *	Sleeping path of socket locking: wait until sk->lock.users
 *	reads zero.
 *
 *	The current task is queued exclusively on sk->lock.wq, then the
 *	loop drops sk->lock.slock, schedules away and retakes the
 *	spinlock before re-checking the owner count.
 *
 *	NOTE(review): the loop unlocks sk->lock.slock before it ever
 *	locks it, so the caller presumably enters with that spinlock
 *	held and gets it back held on return - confirm against the
 *	lock_sock() definition.
 */
void __lock_sock(struct sock *sk)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&sk->lock.wq, &wait);
	for(;;) {
		/* The task state is set before the spinlock is released. */
		current->state = TASK_UNINTERRUPTIBLE;
		spin_unlock_bh(&sk->lock.slock);
		schedule();
		spin_lock_bh(&sk->lock.slock);
		/* Checked with the spinlock held again. */
		if(!sk->lock.users)
			break;
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(&sk->lock.wq, &wait);
}
840     
/*
 *	Drain the socket backlog.
 *
 *	The whole backlog list is detached, the socket is unlocked with
 *	bh_unlock_sock(), every detached skb is handed to
 *	sk->backlog_rcv(), and the socket is locked again. This repeats
 *	for as long as a new backlog has appeared in the meantime.
 *
 *	The first skb is dereferenced without a NULL check, so the
 *	function relies on sk->backlog.head being non-NULL on entry.
 *	NOTE(review): presumably entered and left with the bh socket
 *	lock held - confirm against the caller.
 */
void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->backlog.head;

	do {
		/* Detach the current list so new packets start a fresh one. */
		sk->backlog.head = sk->backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			/* Save the link first: skb is unlinked before delivery. */
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->backlog_rcv(sk, skb);
			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while((skb = sk->backlog.head) != NULL);
}
860     
861     /*
862      *	Generic socket manager library. Most simpler socket families
863      *	use this to manage their socket lists. At some point we should
864      *	hash these. By making this generic we get the lot hashed for free.
865      *
866      *	It is broken by design. All the protocols using it must be fixed. --ANK
867      */
868     
/* Write-locked around the list updates in sklist_insert_socket() and sklist_remove_socket(). */
rwlock_t net_big_sklist_lock = RW_LOCK_UNLOCKED;
870      
871     void sklist_remove_socket(struct sock **list, struct sock *sk)
872     {
873     	struct sock *s;
874     
875     	write_lock_bh(&net_big_sklist_lock);
876     
877     	while ((s = *list) != NULL) {
878     		if (s == sk) {
879     			*list = s->next;
880     			break;
881     		}
882     		list = &s->next;
883     	}
884     
885     	write_unlock_bh(&net_big_sklist_lock);
886     	if (s)
887     		sock_put(s);
888     }
889     
890     void sklist_insert_socket(struct sock **list, struct sock *sk)
891     {
892     	write_lock_bh(&net_big_sklist_lock);
893     	sk->next= *list;
894     	*list=sk;
895     	sock_hold(sk);
896     	write_unlock_bh(&net_big_sklist_lock);
897     }
898     
899     /*
900      *	This is only called from user mode. Thus it protects itself against
901      *	interrupt users but doesn't worry about being called during work.
902      *	Once it is removed from the queue no interrupt or bottom half will
903      *	touch it and we are (fairly 8-) ) safe.
904      */
905     
906     void sklist_destroy_socket(struct sock **list, struct sock *sk);
907     
908     /*
909      *	Handler for deferred kills.
910      */
911     
912     static void sklist_destroy_timer(unsigned long data)
913     {
914     	struct sock *sk=(struct sock *)data;
915     	sklist_destroy_socket(NULL,sk);
916     }
917     
918     /*
919      *	Destroy a socket. We pass NULL for a list if we know the
920      *	socket is not on a list.
921      */
922      
923     void sklist_destroy_socket(struct sock **list,struct sock *sk)
924     {
925     	if(list)
926     		sklist_remove_socket(list, sk);
927     
928     	skb_queue_purge(&sk->receive_queue);
929     
930     	if(atomic_read(&sk->wmem_alloc) == 0 &&
931     	   atomic_read(&sk->rmem_alloc) == 0 &&
932     	   sk->dead)
933     	{
934     		sock_put(sk);
935     	}
936     	else
937     	{
938     		/*
939     		 *	Someone is using our buffers still.. defer
940     		 */
941     		init_timer(&sk->timer);
942     		sk->timer.expires=jiffies+SOCK_DESTROY_TIME;
943     		sk->timer.function=sklist_destroy_timer;
944     		sk->timer.data = (unsigned long)sk;
945     		add_timer(&sk->timer);
946     	}
947     }
948     
949     /*
950      * Set of default routines for initialising struct proto_ops when
951      * the protocol does not support a particular function. In certain
952      * cases where it makes no sense for a protocol to have a "do nothing"
953      * function, some default processing is provided.
954      */
955     
/* Default release method: nothing to tear down, always succeeds (0). */
int sock_no_release(struct socket *sock)
{
	return 0;
}
960     
/* Default bind method: always fails with -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
965     
/* Default connect method: always fails with -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
971     
/* Default socketpair method: always fails with -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
976     
/* Default accept method: always fails with -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
981     
/* Default getname method: always fails with -EOPNOTSUPP; *len is not touched. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
987     
988     unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
989     {
990     	return 0;
991     }
992     
/* Default ioctl method: always fails with -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
997     
/* Default listen method: always fails with -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1002     
/* Default shutdown method: always fails with -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1007     
/* Default setsockopt method: always fails with -EOPNOTSUPP. */
int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char *optval, int optlen)
{
	return -EOPNOTSUPP;
}
1013     
/* Default getsockopt method: always fails with -EOPNOTSUPP; *optlen is not touched. */
int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char *optval, int *optlen)
{
	return -EOPNOTSUPP;
}
1019     
1020     /* 
1021      * Note: if you add something that sleeps here then change sock_fcntl()
1022      *       to do proper fd locking.
1023      */
1024     int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
1025     {
1026     	struct sock *sk = sock->sk;
1027     
1028     	switch(cmd)
1029     	{
1030     		case F_SETOWN:
1031     			/*
1032     			 * This is a little restrictive, but it's the only
1033     			 * way to make sure that you can't send a sigurg to
1034     			 * another process.
1035     			 */
1036     			if (current->pgrp != -arg &&
1037     				current->pid != arg &&
1038     				!capable(CAP_KILL)) return(-EPERM);
1039     			sk->proc = arg;
1040     			return(0);
1041     		case F_GETOWN:
1042     			return(sk->proc);
1043     		default:
1044     			return(-EINVAL);
1045     	}
1046     }
1047     
/*
 *	Placeholder sendmsg handler: the message header and scm cookie are
 *	not examined and the call always fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m,
		    int flags, struct scm_cookie *scm)
{
	return -EOPNOTSUPP;
}
1053     
/*
 *	Placeholder recvmsg handler: no data is delivered into the message
 *	header and the call always fails with -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
		    int len, int flags, struct scm_cookie *scm)
{
	return -EOPNOTSUPP;
}
1059     
/*
 *	Placeholder mmap handler: always fails with -ENODEV.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	/* Same error code as is reported when the mmap method is missing */
	return -ENODEV;
}
1065     
1066     ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1067     {
1068     	ssize_t res;
1069     	struct msghdr msg;
1070     	struct iovec iov;
1071     	mm_segment_t old_fs;
1072     	char *kaddr;
1073     
1074     	kaddr = kmap(page);
1075     
1076     	msg.msg_name = NULL;
1077     	msg.msg_namelen = 0;
1078     	msg.msg_iov = &iov;
1079     	msg.msg_iovlen = 1;
1080     	msg.msg_control = NULL;
1081     	msg.msg_controllen = 0;
1082     	msg.msg_flags = flags;
1083     
1084     	iov.iov_base = kaddr + offset;
1085     	iov.iov_len = size;
1086     
1087     	old_fs = get_fs();
1088     	set_fs(KERNEL_DS);
1089     	res = sock_sendmsg(sock, &msg, size);
1090     	set_fs(old_fs);
1091     
1092     	kunmap(page);
1093     	return res;
1094     }
1095     
1096     /*
1097      *	Default Socket Callbacks
1098      */
1099     
1100     void sock_def_wakeup(struct sock *sk)
1101     {
1102     	read_lock(&sk->callback_lock);
1103     	if (sk->sleep && waitqueue_active(sk->sleep))
1104     		wake_up_interruptible_all(sk->sleep);
1105     	read_unlock(&sk->callback_lock);
1106     }
1107     
1108     void sock_def_error_report(struct sock *sk)
1109     {
1110     	read_lock(&sk->callback_lock);
1111     	if (sk->sleep && waitqueue_active(sk->sleep))
1112     		wake_up_interruptible(sk->sleep);
1113     	sk_wake_async(sk,0,POLL_ERR); 
1114     	read_unlock(&sk->callback_lock);
1115     }
1116     
1117     void sock_def_readable(struct sock *sk, int len)
1118     {
1119     	read_lock(&sk->callback_lock);
1120     	if (sk->sleep && waitqueue_active(sk->sleep))
1121     		wake_up_interruptible(sk->sleep);
1122     	sk_wake_async(sk,1,POLL_IN);
1123     	read_unlock(&sk->callback_lock);
1124     }
1125     
1126     void sock_def_write_space(struct sock *sk)
1127     {
1128     	read_lock(&sk->callback_lock);
1129     
1130     	/* Do not wake up a writer until he can make "significant"
1131     	 * progress.  --DaveM
1132     	 */
1133     	if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) {
1134     		if (sk->sleep && waitqueue_active(sk->sleep))
1135     			wake_up_interruptible(sk->sleep);
1136     
1137     		/* Should agree with poll, otherwise some programs break */
1138     		if (sock_writeable(sk))
1139     			sk_wake_async(sk, 2, POLL_OUT);
1140     	}
1141     
1142     	read_unlock(&sk->callback_lock);
1143     }
1144     
1145     void sock_def_destruct(struct sock *sk)
1146     {
1147     	if (sk->protinfo.destruct_hook)
1148     		kfree(sk->protinfo.destruct_hook);
1149     }
1150     
1151     void sock_init_data(struct socket *sock, struct sock *sk)
1152     {
1153     	skb_queue_head_init(&sk->receive_queue);
1154     	skb_queue_head_init(&sk->write_queue);
1155     	skb_queue_head_init(&sk->error_queue);
1156     
1157     	init_timer(&sk->timer);
1158     	
1159     	sk->allocation	=	GFP_KERNEL;
1160     	sk->rcvbuf	=	sysctl_rmem_default;
1161     	sk->sndbuf	=	sysctl_wmem_default;
1162     	sk->state 	= 	TCP_CLOSE;
1163     	sk->zapped	=	1;
1164     	sk->socket	=	sock;
1165     
1166     	if(sock)
1167     	{
1168     		sk->type	=	sock->type;
1169     		sk->sleep	=	&sock->wait;
1170     		sock->sk	=	sk;
1171     	} else
1172     		sk->sleep	=	NULL;
1173     
1174     	sk->dst_lock		=	RW_LOCK_UNLOCKED;
1175     	sk->callback_lock	=	RW_LOCK_UNLOCKED;
1176     
1177     	sk->state_change	=	sock_def_wakeup;
1178     	sk->data_ready		=	sock_def_readable;
1179     	sk->write_space		=	sock_def_write_space;
1180     	sk->error_report	=	sock_def_error_report;
1181     	sk->destruct            =       sock_def_destruct;
1182     
1183     	sk->peercred.pid 	=	0;
1184     	sk->peercred.uid	=	-1;
1185     	sk->peercred.gid	=	-1;
1186     	sk->rcvlowat		=	1;
1187     	sk->rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1188     	sk->sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1189     
1190     	atomic_set(&sk->refcnt, 1);
1191     }
1192