File: /usr/src/linux/fs/buffer.c

1     /*
2      *  linux/fs/buffer.c
3      *
4      *  Copyright (C) 1991, 1992  Linus Torvalds
5      */
6     
7     /*
8      *  'buffer.c' implements the buffer-cache functions. Race-conditions have
9      * been avoided by NEVER letting an interrupt change a buffer (except for the
10      * data, of course), but instead letting the caller do it.
11      */
12     
13     /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14     
15     /* Removed a lot of unnecessary code and simplified things now that
16      * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17      */
18     
19     /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
20      * hash table, use SLAB cache for buffer heads. -DaveM
21      */
22     
23     /* Added 32k buffer block sizes - these are required for older ARM systems.
24      * - RMK
25      */
26     
27     /* Thread it... -DaveM */
28     
29     /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30     
31     #include <linux/config.h>
32     #include <linux/sched.h>
33     #include <linux/fs.h>
34     #include <linux/slab.h>
35     #include <linux/locks.h>
36     #include <linux/errno.h>
37     #include <linux/swap.h>
38     #include <linux/swapctl.h>
39     #include <linux/smp_lock.h>
40     #include <linux/vmalloc.h>
41     #include <linux/blkdev.h>
42     #include <linux/sysrq.h>
43     #include <linux/file.h>
44     #include <linux/init.h>
45     #include <linux/quotaops.h>
46     #include <linux/iobuf.h>
47     #include <linux/highmem.h>
48     #include <linux/completion.h>
49     
50     #include <asm/uaccess.h>
51     #include <asm/io.h>
52     #include <asm/bitops.h>
53     #include <asm/mmu_context.h>
54     
/*
 * Supported buffer sizes are the powers of two from 512 bytes to 32k,
 * i.e. NR_SIZES distinct sizes.  buffersize_index[] is indexed by
 * (size >> 9) and yields the free-list index for that size; every slot
 * that does not correspond to a supported size holds -1.
 *
 * The table is explicitly 'signed char': plain 'char' is unsigned on some
 * architectures, which would silently turn the -1 markers into 255.
 */
#define NR_SIZES 7
static const signed char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  6};

/* Free-list index for a buffer of X bytes, or -1 for an unsupported size. */
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
/* A page holds the most buffers when they are 512 bytes each. */
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
/*
 * Don't ever have more than this number of unused buffer heads.
 * Parenthesized so the macro is safe inside larger expressions.
 */
#define MAX_UNUSED_BUFFERS (NR_RESERVED+20)
68     
69     /* Anti-deadlock ordering:
70      *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
71      */
72     
73     #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
74     
75     /*
76      * Hash table gook..
77      */
78     static unsigned int bh_hash_mask;
79     static unsigned int bh_hash_shift;
80     static struct buffer_head **hash_table;
81     static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
82     
83     static struct buffer_head *lru_list[NR_LIST];
84     static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
85     static int nr_buffers_type[NR_LIST];
86     static unsigned long size_buffers_type[NR_LIST];
87     
88     static struct buffer_head * unused_list;
89     static int nr_unused_buffer_heads;
90     static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
91     static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
92     
93     struct bh_free_head {
94     	struct buffer_head *list;
95     	spinlock_t lock;
96     };
97     static struct bh_free_head free_list[NR_SIZES];
98     
99     static int grow_buffers(int size);
100     static void __refile_buffer(struct buffer_head *);
101     
102     /* This is used by some architectures to estimate available memory. */
103     atomic_t buffermem_pages = ATOMIC_INIT(0);
104     
105     /* Here is the parameter block for the bdflush process. If you add or
106      * remove any of the parameters, make sure to update kernel/sysctl.c
107      * and the documentation at linux/Documentation/sysctl/vm.txt.
108      */
109     
#define N_PARAM 9

/*
 * Tunables for the bdflush process.  The same storage can be viewed
 * either by field name (b_un) or as a flat array of N_PARAM words (data).
 *
 * The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
union bdflush_param {
	struct {
		int nfract;		/* Percentage of buffer cache dirty to
					   activate bdflush */
		int dummy1;		/* old "ndirty" */
		int dummy2;		/* old "nrefill" */
		int dummy3;		/* unused */
		int interval;		/* jiffies delay between kupdate flushes */
		int age_buffer;		/* Time for normal buffer to age before
					   we flush it */
		int nfract_sync;	/* Percentage of buffer cache dirty to
					   activate bdflush synchronously */
		int dummy4;		/* unused */
		int dummy5;		/* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{
	30,		/* nfract */
	64,		/* dummy1 */
	64,		/* dummy2 */
	256,		/* dummy3 */
	5*HZ,		/* interval */
	30*HZ,		/* age_buffer */
	60,		/* nfract_sync */
	0,		/* dummy4 */
	0		/* dummy5 */
}};

/*
 * These are the min and max parameter values that we will allow to be
 * assigned; entries are in the same order as the fields above.
 */
int bdflush_min[N_PARAM] = {
	0,		/* nfract */
	10,		/* dummy1 */
	5,		/* dummy2 */
	25,		/* dummy3 */
	0,		/* interval */
	1*HZ,		/* age_buffer */
	0,		/* nfract_sync */
	0,		/* dummy4 */
	0		/* dummy5 */
};
int bdflush_max[N_PARAM] = {
	100,		/* nfract */
	50000,		/* dummy1 */
	20000,		/* dummy2 */
	20000,		/* dummy3 */
	10000*HZ,	/* interval */
	6000*HZ,	/* age_buffer */
	100,		/* nfract_sync */
	0,		/* dummy4 */
	0		/* dummy5 */
};
135     
136     inline void unlock_buffer(struct buffer_head *bh)
137     {
138     	clear_bit(BH_Wait_IO, &bh->b_state);
139     	clear_bit(BH_Lock, &bh->b_state);
140     	smp_mb__after_clear_bit();
141     	if (waitqueue_active(&bh->b_wait))
142     		wake_up(&bh->b_wait);
143     }
144     
145     /*
146      * Rewrote the wait-routines to use the "new" wait-queue functionality,
147      * and getting rid of the cli-sti pairs. The wait-queue routines still
148      * need cli-sti, but now it's just a couple of 386 instructions or so.
149      *
150      * Note that the real wait_on_buffer() is an inline function that checks
151      * if 'b_wait' is set before calling this, so that the queues aren't set
152      * up unnecessarily.
153      */
154     void __wait_on_buffer(struct buffer_head * bh)
155     {
156     	struct task_struct *tsk = current;
157     	DECLARE_WAITQUEUE(wait, tsk);
158     
159     	get_bh(bh);
160     	add_wait_queue(&bh->b_wait, &wait);
161     	do {
162     		run_task_queue(&tq_disk);
163     		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
164     		if (!buffer_locked(bh))
165     			break;
166     		schedule();
167     	} while (buffer_locked(bh));
168     	tsk->state = TASK_RUNNING;
169     	remove_wait_queue(&bh->b_wait, &wait);
170     	put_bh(bh);
171     }
172     
173     /*
174      * Default synchronous end-of-IO handler..  Just mark it up-to-date and
175      * unlock the buffer. This is what ll_rw_block uses too.
176      */
/*
 * Synchronous I/O completion: record @uptodate on @bh, unlock the buffer
 * and drop one reference to it.
 */
void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);

	/* Unlock first (this wakes waiters), then release the reference. */
	unlock_buffer(bh);
	put_bh(bh);
}
183     
184     /*
185      * The buffers have been marked clean and locked.  Just submit the dang
186      * things.. 
187      */
188     static void write_locked_buffers(struct buffer_head **array, unsigned int count)
189     {
190     	do {
191     		struct buffer_head * bh = *array++;
192     		bh->b_end_io = end_buffer_io_sync;
193     		submit_bh(WRITE, bh);
194     	} while (--count);
195     }
196     
197     /*
198      * Write some buffers from the head of the dirty queue.
199      *
200      * This must be called with the LRU lock held, and will
201      * return without it!
202      */
203     #define NRSYNC (32)
204     static int write_some_buffers(kdev_t dev)
205     {
206     	struct buffer_head *next;
207     	struct buffer_head *array[NRSYNC];
208     	unsigned int count;
209     	int nr;
210     
211     	next = lru_list[BUF_DIRTY];
212     	nr = nr_buffers_type[BUF_DIRTY];
213     	count = 0;
214     	while (next && --nr >= 0) {
215     		struct buffer_head * bh = next;
216     		next = bh->b_next_free;
217     
218     		if (dev && bh->b_dev != dev)
219     			continue;
220     		if (test_and_set_bit(BH_Lock, &bh->b_state))
221     			continue;
222     		if (atomic_set_buffer_clean(bh)) {
223     			__refile_buffer(bh);
224     			get_bh(bh);
225     			array[count++] = bh;
226     			if (count < NRSYNC)
227     				continue;
228     
229     			spin_unlock(&lru_list_lock);
230     			write_locked_buffers(array, count);
231     			return -EAGAIN;
232     		}
233     		unlock_buffer(bh);
234     		__refile_buffer(bh);
235     	}
236     	spin_unlock(&lru_list_lock);
237     
238     	if (count)
239     		write_locked_buffers(array, count);
240     	return 0;
241     }
242     
243     /*
244      * Write out all buffers on the dirty list.
245      */
246     static void write_unlocked_buffers(kdev_t dev)
247     {
248     	do {
249     		spin_lock(&lru_list_lock);
250     	} while (write_some_buffers(dev));
251     	run_task_queue(&tq_disk);
252     }
253     
254     /*
255      * Wait for a buffer on the proper list.
256      *
257      * This must be called with the LRU lock held, and
258      * will return with it released.
259      */
260     static int wait_for_buffers(kdev_t dev, int index, int refile)
261     {
262     	struct buffer_head * next;
263     	int nr;
264     
265     	next = lru_list[index];
266     	nr = nr_buffers_type[index];
267     	while (next && --nr >= 0) {
268     		struct buffer_head *bh = next;
269     		next = bh->b_next_free;
270     
271     		if (!buffer_locked(bh)) {
272     			if (refile)
273     				__refile_buffer(bh);
274     			continue;
275     		}
276     		if (dev && bh->b_dev != dev)
277     			continue;
278     
279     		get_bh(bh);
280     		spin_unlock(&lru_list_lock);
281     		wait_on_buffer (bh);
282     		put_bh(bh);
283     		return -EAGAIN;
284     	}
285     	spin_unlock(&lru_list_lock);
286     	return 0;
287     }
288     
289     static inline void wait_for_some_buffers(kdev_t dev)
290     {
291     	spin_lock(&lru_list_lock);
292     	wait_for_buffers(dev, BUF_LOCKED, 1);
293     }
294     
295     static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
296     {
297     	do {
298     		spin_lock(&lru_list_lock);
299     	} while (wait_for_buffers(dev, index, refile));
300     	return 0;
301     }
302     
303     /* Call sync_buffers with wait!=0 to ensure that the call does not
304      * return until all buffer writes have completed.  Sync() may return
305      * before the writes have finished; fsync() may not.
306      */
307     
308     /* Godamity-damn.  Some buffers (bitmaps for filesystems)
309      * spontaneously dirty themselves without ever brelse being called.
310      * We will ultimately want to put these in a separate list, but for
311      * now we search all of the lists for dirty buffers.
312      */
313     int sync_buffers(kdev_t dev, int wait)
314     {
315     	int err = 0;
316     
317     	/* One pass for no-wait, three for wait:
318     	 * 0) write out all dirty, unlocked buffers;
319     	 * 1) wait for all dirty locked buffers;
320     	 * 2) write out all dirty, unlocked buffers;
321     	 * 2) wait for completion by waiting for all buffers to unlock.
322     	 */
323     	write_unlocked_buffers(dev);
324     	if (wait) {
325     		err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
326     		write_unlocked_buffers(dev);
327     		err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
328     	}
329     	return err;
330     }
331     
332     int fsync_super(struct super_block *sb)
333     {
334     	kdev_t dev = sb->s_dev;
335     	sync_buffers(dev, 0);
336     
337     	lock_kernel();
338     	sync_inodes_sb(sb);
339     	DQUOT_SYNC(dev);
340     	lock_super(sb);
341     	if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
342     		sb->s_op->write_super(sb);
343     	unlock_super(sb);
344     	unlock_kernel();
345     
346     	return sync_buffers(dev, 1);
347     }
348     
349     int fsync_no_super(kdev_t dev)
350     {
351     	sync_buffers(dev, 0);
352     	return sync_buffers(dev, 1);
353     }
354     
355     int fsync_dev(kdev_t dev)
356     {
357     	sync_buffers(dev, 0);
358     
359     	lock_kernel();
360     	sync_inodes(dev);
361     	DQUOT_SYNC(dev);
362     	sync_supers(dev);
363     	unlock_kernel();
364     
365     	return sync_buffers(dev, 1);
366     }
367     
368     /*
369      * There's no real reason to pretend we should
370      * ever do anything differently
371      */
372     void sync_dev(kdev_t dev)
373     {
374     	fsync_dev(dev);
375     }
376     
377     asmlinkage long sys_sync(void)
378     {
379     	fsync_dev(0);
380     	return 0;
381     }
382     
383     /*
384      *	filp may be NULL if called via the msync of a vma.
385      */
386      
387     int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
388     {
389     	struct inode * inode = dentry->d_inode;
390     	struct super_block * sb;
391     	kdev_t dev;
392     	int ret;
393     
394     	lock_kernel();
395     	/* sync the inode to buffers */
396     	write_inode_now(inode, 0);
397     
398     	/* sync the superblock to buffers */
399     	sb = inode->i_sb;
400     	lock_super(sb);
401     	if (sb->s_op && sb->s_op->write_super)
402     		sb->s_op->write_super(sb);
403     	unlock_super(sb);
404     
405     	/* .. finally sync the buffers to disk */
406     	dev = inode->i_dev;
407     	ret = sync_buffers(dev, 1);
408     	unlock_kernel();
409     	return ret;
410     }
411     
412     asmlinkage long sys_fsync(unsigned int fd)
413     {
414     	struct file * file;
415     	struct dentry * dentry;
416     	struct inode * inode;
417     	int err;
418     
419     	err = -EBADF;
420     	file = fget(fd);
421     	if (!file)
422     		goto out;
423     
424     	dentry = file->f_dentry;
425     	inode = dentry->d_inode;
426     
427     	err = -EINVAL;
428     	if (!file->f_op || !file->f_op->fsync)
429     		goto out_putf;
430     
431     	/* We need to protect against concurrent writers.. */
432     	down(&inode->i_sem);
433     	filemap_fdatasync(inode->i_mapping);
434     	err = file->f_op->fsync(file, dentry, 0);
435     	filemap_fdatawait(inode->i_mapping);
436     	up(&inode->i_sem);
437     
438     out_putf:
439     	fput(file);
440     out:
441     	return err;
442     }
443     
444     asmlinkage long sys_fdatasync(unsigned int fd)
445     {
446     	struct file * file;
447     	struct dentry * dentry;
448     	struct inode * inode;
449     	int err;
450     
451     	err = -EBADF;
452     	file = fget(fd);
453     	if (!file)
454     		goto out;
455     
456     	dentry = file->f_dentry;
457     	inode = dentry->d_inode;
458     
459     	err = -EINVAL;
460     	if (!file->f_op || !file->f_op->fsync)
461     		goto out_putf;
462     
463     	down(&inode->i_sem);
464     	filemap_fdatasync(inode->i_mapping);
465     	err = file->f_op->fsync(file, dentry, 1);
466     	filemap_fdatawait(inode->i_mapping);
467     	up(&inode->i_sem);
468     
469     out_putf:
470     	fput(file);
471     out:
472     	return err;
473     }
474     
475     /* After several hours of tedious analysis, the following hash
476      * function won.  Do not mess with it... -DaveM
477      */
/*
 * XOR of two shifted copies of the device number and three shifted copies
 * of the block number; the shift amounts are relative to bh_hash_shift.
 */
#define _hashfn(dev,block)			\
	(((dev) << (bh_hash_shift - 6)) ^	\
	 ((dev) << (bh_hash_shift - 9)) ^	\
	 ((block) << (bh_hash_shift - 6)) ^	\
	 ((block) >> 13) ^			\
	 ((block) << (bh_hash_shift - 12)))
/* Head of the hash chain for (dev, block); usable as an lvalue. */
#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
483     
484     static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
485     {
486     	if ((bh->b_next = *head) != NULL)
487     		bh->b_next->b_pprev = &bh->b_next;
488     	*head = bh;
489     	bh->b_pprev = head;
490     }
491     
492     static __inline__ void __hash_unlink(struct buffer_head *bh)
493     {
494     	if (bh->b_pprev) {
495     		if (bh->b_next)
496     			bh->b_next->b_pprev = bh->b_pprev;
497     		*(bh->b_pprev) = bh->b_next;
498     		bh->b_pprev = NULL;
499     	}
500     }
501     
502     static void __insert_into_lru_list(struct buffer_head * bh, int blist)
503     {
504     	struct buffer_head **bhp = &lru_list[blist];
505     
506     	if(!*bhp) {
507     		*bhp = bh;
508     		bh->b_prev_free = bh;
509     	}
510     	bh->b_next_free = *bhp;
511     	bh->b_prev_free = (*bhp)->b_prev_free;
512     	(*bhp)->b_prev_free->b_next_free = bh;
513     	(*bhp)->b_prev_free = bh;
514     	nr_buffers_type[blist]++;
515     	size_buffers_type[blist] += bh->b_size;
516     }
517     
518     static void __remove_from_lru_list(struct buffer_head * bh, int blist)
519     {
520     	if (bh->b_prev_free || bh->b_next_free) {
521     		bh->b_prev_free->b_next_free = bh->b_next_free;
522     		bh->b_next_free->b_prev_free = bh->b_prev_free;
523     		if (lru_list[blist] == bh)
524     			lru_list[blist] = bh->b_next_free;
525     		if (lru_list[blist] == bh)
526     			lru_list[blist] = NULL;
527     		bh->b_next_free = bh->b_prev_free = NULL;
528     		nr_buffers_type[blist]--;
529     		size_buffers_type[blist] -= bh->b_size;
530     	}
531     }
532     
533     static void __remove_from_free_list(struct buffer_head * bh, int index)
534     {
535     	if(bh->b_next_free == bh)
536     		 free_list[index].list = NULL;
537     	else {
538     		bh->b_prev_free->b_next_free = bh->b_next_free;
539     		bh->b_next_free->b_prev_free = bh->b_prev_free;
540     		if (free_list[index].list == bh)
541     			 free_list[index].list = bh->b_next_free;
542     	}
543     	bh->b_next_free = bh->b_prev_free = NULL;
544     }
545     
546     /* must be called with both the hash_table_lock and the lru_list_lock
547        held */
548     static void __remove_from_queues(struct buffer_head *bh)
549     {
550     	__hash_unlink(bh);
551     	__remove_from_lru_list(bh, bh->b_list);
552     }
553     
554     static void __insert_into_queues(struct buffer_head *bh)
555     {
556     	struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
557     
558     	__hash_link(bh, head);
559     	__insert_into_lru_list(bh, bh->b_list);
560     }
561     
562     /* This function must only run if there are no other
563      * references _anywhere_ to this buffer head.
564      */
565     static void put_last_free(struct buffer_head * bh)
566     {
567     	struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
568     	struct buffer_head **bhp = &head->list;
569     
570     	bh->b_state = 0;
571     
572     	spin_lock(&head->lock);
573     	bh->b_dev = B_FREE;
574     	if(!*bhp) {
575     		*bhp = bh;
576     		bh->b_prev_free = bh;
577     	}
578     	bh->b_next_free = *bhp;
579     	bh->b_prev_free = (*bhp)->b_prev_free;
580     	(*bhp)->b_prev_free->b_next_free = bh;
581     	(*bhp)->b_prev_free = bh;
582     	spin_unlock(&head->lock);
583     }
584     
585     /*
586      * Why like this, I hear you say... The reason is race-conditions.
587      * As we don't lock buffers (unless we are reading them, that is),
588      * something might happen to it while we sleep (ie a read-error
589      * will force it bad). This shouldn't really happen currently, but
590      * the code is ready.
591      */
592     static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
593     {
594     	struct buffer_head *bh = hash(dev, block);
595     
596     	for (; bh; bh = bh->b_next)
597     		if (bh->b_blocknr == block	&&
598     		    bh->b_size    == size	&&
599     		    bh->b_dev     == dev)
600     			break;
601     	if (bh)
602     		get_bh(bh);
603     
604     	return bh;
605     }
606     
607     struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
608     {
609     	struct buffer_head *bh;
610     
611     	read_lock(&hash_table_lock);
612     	bh = __get_hash_table(dev, block, size);
613     	read_unlock(&hash_table_lock);
614     
615     	return bh;
616     }
617     
618     void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
619     {
620     	spin_lock(&lru_list_lock);
621     	if (bh->b_inode)
622     		list_del(&bh->b_inode_buffers);
623     	bh->b_inode = inode;
624     	list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
625     	spin_unlock(&lru_list_lock);
626     }
627     
628     void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
629     {
630     	spin_lock(&lru_list_lock);
631     	if (bh->b_inode)
632     		list_del(&bh->b_inode_buffers);
633     	bh->b_inode = inode;
634     	list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
635     	spin_unlock(&lru_list_lock);
636     }
637     
638     /* The caller must have the lru_list lock before calling the 
639        remove_inode_queue functions.  */
640     static void __remove_inode_queue(struct buffer_head *bh)
641     {
642     	bh->b_inode = NULL;
643     	list_del(&bh->b_inode_buffers);
644     }
645     
646     static inline void remove_inode_queue(struct buffer_head *bh)
647     {
648     	if (bh->b_inode)
649     		__remove_inode_queue(bh);
650     }
651     
652     int inode_has_buffers(struct inode *inode)
653     {
654     	int ret;
655     	
656     	spin_lock(&lru_list_lock);
657     	ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
658     	spin_unlock(&lru_list_lock);
659     	
660     	return ret;
661     }
662     
663     /* If invalidate_buffers() will trash dirty buffers, it means some kind
664        of fs corruption is going on. Trashing dirty data always implies losing
665        information that was supposed to be just stored on the physical layer
666        by the user.
667     
668        Thus invalidate_buffers in general usage is not allowed to trash dirty
669        buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
670     
671        NOTE: In the case where the user removed a removable-media-disk even if
672        there's still dirty data not synced on disk (due a bug in the device driver
673        or due an error of the user), by not destroying the dirty buffers we could
674        generate corruption also on the next media inserted, thus a parameter is
675        necessary to handle this case in the most safe way possible (trying
676        to not corrupt also the new disk inserted with the data belonging to
677        the old now corrupted disk). Also for the ramdisk the natural thing
678        to do in order to release the ramdisk memory is to destroy dirty buffers.
679     
680        These are two special cases. Normal usage implies that the device driver
681        issues a sync on the device (without waiting for I/O completion) and
682        then an invalidate_buffers call that doesn't trash dirty buffers.
683     
684        For handling cache coherency with the blkdev pagecache the 'update' case
685        has been introduced. It is needed to re-read from disk any pinned
686        buffer. NOTE: re-reading from disk is destructive so we can do it only
687        when we assume nobody is changing the buffercache under our I/O and when
688        we think the disk contains more recent information than the buffercache.
689        The update == 1 pass marks the buffers we need to update, the update == 2
690        pass does the actual I/O. */
/*
 * __invalidate_buffers - drop or refresh the cached buffers of a device.
 * @dev:                   device whose buffers are examined
 * @destroy_dirty_buffers: non-zero to free unreferenced buffers even if
 *                         they are dirty
 * @update:                0 for plain invalidation; 1 or 2 select the two
 *                         passes applied to buffers that are still
 *                         referenced (see below)
 *
 * Every LRU list is walked under lru_list_lock.  Buffers of other devices
 * and unhashed buffers are skipped.  For a locked buffer the function
 * pins it, drops the lock, waits for it, and sets 'slept'; once the
 * current buffer has been handled the walk is abandoned and restarted
 * from 'retry', since the saved bh_next may be stale after sleeping.
 *
 * With hash_table_lock held for writing:
 *  - a buffer with b_count == 0 is removed from its inode queue, the hash
 *    and the LRU list and put on the free list, provided it is clean or
 *    @destroy_dirty_buffers is set;
 *  - a referenced buffer is only touched when @update is non-zero.
 *    Assuming buffer_uptodate()/buffer_req() yield 0 or 1, update == 1
 *    selects buffers with both flags set and clears BH_Uptodate and
 *    BH_Req under the buffer lock, while update == 2 selects buffers with
 *    both flags clear and re-reads them with ll_rw_block().  Both locks
 *    are dropped for this and the whole scan restarts afterwards.
 *
 * A buffer on an LRU list that is not mapped triggers BUG().
 */
691     void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
692     {
693     	int i, nlist, slept;
694     	struct buffer_head * bh, * bh_next;
695     
696      retry:
697     	slept = 0;
698     	spin_lock(&lru_list_lock);
699     	for(nlist = 0; nlist < NR_LIST; nlist++) {
700     		bh = lru_list[nlist];
701     		if (!bh)
702     			continue;
703     		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
704     			bh_next = bh->b_next_free;
705     
706     			/* Another device? */
707     			if (bh->b_dev != dev)
708     				continue;
709     			/* Not hashed? */
710     			if (!bh->b_pprev)
711     				continue;
712     			if (buffer_locked(bh)) {
713     				get_bh(bh);
714     				spin_unlock(&lru_list_lock);
715     				wait_on_buffer(bh);
716     				slept = 1;
717     				spin_lock(&lru_list_lock);
718     				put_bh(bh);
719     			}
720     
721     			write_lock(&hash_table_lock);
722     			/* All buffers in the lru lists are mapped */
723     			if (!buffer_mapped(bh))
724     				BUG();
725     			if (!atomic_read(&bh->b_count)) {
726     				if (destroy_dirty_buffers || !buffer_dirty(bh)) {
727     					remove_inode_queue(bh);
728     					__remove_from_queues(bh);
729     					put_last_free(bh);
730     				}
731     			} else if (update) {
732     				if ((update == 2) ^ buffer_uptodate(bh)  &&
733     				    (update == 2) ^ buffer_req(bh)) {
734     					write_unlock(&hash_table_lock);
735     					atomic_inc(&bh->b_count);
736     					spin_unlock(&lru_list_lock);
737     
738     					if (update == 2) {
739     						ll_rw_block(READ, 1, &bh);
740     						wait_on_buffer(bh);
741     					} else {
742     						lock_buffer(bh);
743     						clear_bit(BH_Uptodate, &bh->b_state);
744     						clear_bit(BH_Req, &bh->b_state);
745     						unlock_buffer(bh);
746     					}						
747     
748     					atomic_dec(&bh->b_count);
749     					goto retry;
750     				}
751     			}
752     
753     			write_unlock(&hash_table_lock);
754     			if (slept)
755     				goto out;
756     		}
757     	}
758     out:
759     	spin_unlock(&lru_list_lock);
760     	if (slept)
761     		goto retry;
762     }
763     
/*
 * set_blocksize - change the block size recorded for a device.
 * @dev:  device to change
 * @size: new block size in bytes; must be a power of two between 512 and
 *        PAGE_SIZE, anything else panics
 *
 * Does nothing if the device's major has no blksize_size[] table or the
 * size is unchanged.  If no size was recorded yet (0) and @size equals
 * BLOCK_SIZE, the new size is simply stored.  Otherwise the device is
 * synced with waiting, the new size is stored, and every LRU list is
 * scanned under lru_list_lock for hashed buffers of @dev whose b_size
 * differs from @size:
 *  - locked buffers are waited for with the lock dropped; as the saved
 *    bh_next may then be stale, the scan restarts after that buffer;
 *  - buffers with b_count == 0 are removed from all queues and put on the
 *    free list (a warning is printed if such a buffer is dirty);
 *  - buffers still referenced are marked clean (and refiled if they were
 *    dirty) and not-uptodate, and a warning is printed.
 */
764     void set_blocksize(kdev_t dev, int size)
765     {
766     	extern int *blksize_size[];
767     	int i, nlist, slept;
768     	struct buffer_head * bh, * bh_next;
769     
770     	if (!blksize_size[MAJOR(dev)])
771     		return;
772     
773     	/* Size must be a power of two, and between 512 and PAGE_SIZE */
774     	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
775     		panic("Invalid blocksize passed to set_blocksize");
776     
777     	if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
778     		blksize_size[MAJOR(dev)][MINOR(dev)] = size;
779     		return;
780     	}
781     	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
782     		return;
783     	sync_buffers(dev, 2);
784     	blksize_size[MAJOR(dev)][MINOR(dev)] = size;
785     
786      retry:
787     	slept = 0;
788     	spin_lock(&lru_list_lock);
789     	for(nlist = 0; nlist < NR_LIST; nlist++) {
790     		bh = lru_list[nlist];
791     		if (!bh)
792     			continue;
793     		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
794     			bh_next = bh->b_next_free;
795     			if (bh->b_dev != dev || bh->b_size == size)
796     				continue;
797     			/* Unhashed? */
798     			if (!bh->b_pprev)
799     				continue;
800     			if (buffer_locked(bh)) {
801     				get_bh(bh);
802     				spin_unlock(&lru_list_lock);
803     				wait_on_buffer(bh);
804     				slept = 1;
805     				spin_lock(&lru_list_lock);
806     				put_bh(bh);
807     			}
808     
809     			write_lock(&hash_table_lock);
810     			if (!atomic_read(&bh->b_count)) {
811     				if (buffer_dirty(bh))
812     					printk(KERN_WARNING
813     					       "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
814     					       kdevname(dev), bh->b_blocknr, bh->b_size);
815     				remove_inode_queue(bh);
816     				__remove_from_queues(bh);
817     				put_last_free(bh);
818     			} else {
819     				if (atomic_set_buffer_clean(bh))
820     					__refile_buffer(bh);
821     				clear_bit(BH_Uptodate, &bh->b_state);
822     				printk(KERN_WARNING
823     				       "set_blocksize: "
824     				       "b_count %d, dev %s, block %lu, from %p\n",
825     				       atomic_read(&bh->b_count), bdevname(bh->b_dev),
826     				       bh->b_blocknr, __builtin_return_address(0));
827     			}
828     			write_unlock(&hash_table_lock);
829     			if (slept)
830     				goto out;
831     		}
832     	}
833      out:
834     	spin_unlock(&lru_list_lock);
835     	if (slept)
836     		goto retry;
837     }
838     
839     static void free_more_memory(void)
840     {
841     	balance_dirty();
842     	wakeup_bdflush();
843     	current->policy |= SCHED_YIELD;
844     	__set_current_state(TASK_RUNNING);
845     	schedule();
846     }
847     
848     void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
849     {
850     	bh->b_list = BUF_CLEAN;
851     	bh->b_end_io = handler;
852     	bh->b_private = private;
853     }
854     
/*
 * end_buffer_io_async - completion handler for page I/O done via buffers.
 * @bh:       buffer whose I/O has finished
 * @uptodate: non-zero if the I/O succeeded
 *
 * Records @uptodate on the buffer and flags the page with an error when
 * the I/O failed.  Then, with page_uptodate_lock held and interrupts
 * saved/disabled, the buffer's async mark is cleared, the buffer is
 * unlocked, and the other buffers on the same page (the b_this_page ring)
 * are checked: if any of them is still async and locked, this was not the
 * last completion and the function returns.  Otherwise the page is marked
 * uptodate (unless an error was flagged), nr_async_pages is decremented
 * if the page carried the DecrAfter flag, and the page is unlocked.
 */
855     static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
856     {
857     	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
858     	unsigned long flags;
859     	struct buffer_head *tmp;
860     	struct page *page;
861     
862     	mark_buffer_uptodate(bh, uptodate);
863     
864     	/* This is a temporary buffer used for page I/O. */
865     	page = bh->b_page;
866     
867     	if (!uptodate)
868     		SetPageError(page);
869     
870     	/*
871     	 * Be _very_ careful from here on. Bad things can happen if
872     	 * two buffer heads end IO at almost the same time and both
873     	 * decide that the page is now completely done.
874     	 *
875     	 * Async buffer_heads are here only as labels for IO, and get
876     	 * thrown away once the IO for this page is complete.  IO is
877     	 * deemed complete once all buffers have been visited
878     	 * (b_count==0) and are now unlocked. We must make sure that
879     	 * only the _last_ buffer that decrements its count is the one
880     	 * that unlock the page..
881     	 */
882     	spin_lock_irqsave(&page_uptodate_lock, flags);
883     	mark_buffer_async(bh, 0);
884     	unlock_buffer(bh);
885     	tmp = bh->b_this_page;
886     	while (tmp != bh) {
887     		if (buffer_async(tmp) && buffer_locked(tmp))
888     			goto still_busy;
889     		tmp = tmp->b_this_page;
890     	}
891     
892     	/* OK, the async IO on this page is complete. */
893     	spin_unlock_irqrestore(&page_uptodate_lock, flags);
894     
895     	/*
896     	 * if none of the buffers had errors then we can set the
897     	 * page uptodate:
898     	 */
899     	if (!PageError(page))
900     		SetPageUptodate(page);
901     
902     	/*
903     	 * Run the hooks that have to be done when a page I/O has completed.
904     	 */
905     	if (PageTestandClearDecrAfter(page))
906     		atomic_dec(&nr_async_pages);
907     
908     	UnlockPage(page);
909     
910     	return;
911     
912     still_busy:
913     	spin_unlock_irqrestore(&page_uptodate_lock, flags);
914     	return;
915     }
916     
917     inline void set_buffer_async_io(struct buffer_head *bh) {
918         bh->b_end_io = end_buffer_io_async ;
919         mark_buffer_async(bh, 1);
920     }
921     
922     /*
923      * Synchronise all the inode's dirty buffers to the disk.
924      *
925      * We have conflicting pressures: we want to make sure that all
926      * initially dirty buffers get waited on, but that any subsequently
927      * dirtied buffers don't.  After all, we don't want fsync to last
928      * forever if somebody is actively writing to the file.
929      *
930      * Do this in two main stages: first we copy dirty buffers to a
931      * temporary inode list, queueing the writes as we go.  Then we clean
932      * up, waiting for those writes to complete.
933      * 
934      * During this second stage, any subsequent updates to the file may end
935      * up refiling the buffer on the original inode's dirty list again, so
936      * there is a chance we will end up with a buffer queued for write but
937      * not yet completed on that list.  So, as a final cleanup we go through
938      * the osync code to catch these locked, dirty buffers without requeuing
939      * any newly dirty buffers for write.
940      */
941     
/*
 * fsync_inode_buffers - write and wait for an inode's dirty buffers.
 * @inode: inode whose i_dirty_buffers list is to be flushed
 *
 * Stage 1 empties inode->i_dirty_buffers under lru_list_lock: buffers
 * that are neither dirty nor locked are simply detached, the rest are
 * moved to the list of the on-stack inode 'tmp', and dirty ones are
 * submitted for writing with the lock temporarily dropped (a reference
 * is held across the unlocked window).
 *
 * Stage 2 takes the buffers off tmp's list starting from the tail and
 * waits for each with the lock dropped; a buffer that is not uptodate
 * afterwards records -EIO.
 *
 * Finally osync_inode_buffers() is called on the real inode.  Returns
 * -EIO if stage 2 saw an error, otherwise the result of
 * osync_inode_buffers().
 */
942     int fsync_inode_buffers(struct inode *inode)
943     {
944     	struct buffer_head *bh;
945     	struct inode tmp;
946     	int err = 0, err2;
947     	
948     	INIT_LIST_HEAD(&tmp.i_dirty_buffers);
949     	
950     	spin_lock(&lru_list_lock);
951     
952     	while (!list_empty(&inode->i_dirty_buffers)) {
953     		bh = BH_ENTRY(inode->i_dirty_buffers.next);
954     		list_del(&bh->b_inode_buffers);
955     		if (!buffer_dirty(bh) && !buffer_locked(bh))
956     			bh->b_inode = NULL;
957     		else {
958     			bh->b_inode = &tmp;
959     			list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
960     			if (buffer_dirty(bh)) {
961     				get_bh(bh);
962     				spin_unlock(&lru_list_lock);
963     				ll_rw_block(WRITE, 1, &bh);
964     				brelse(bh);
965     				spin_lock(&lru_list_lock);
966     			}
967     		}
968     	}
969     
970     	while (!list_empty(&tmp.i_dirty_buffers)) {
971     		bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
972     		remove_inode_queue(bh);
973     		get_bh(bh);
974     		spin_unlock(&lru_list_lock);
975     		wait_on_buffer(bh);
976     		if (!buffer_uptodate(bh))
977     			err = -EIO;
978     		brelse(bh);
979     		spin_lock(&lru_list_lock);
980     	}
981     	
982     	spin_unlock(&lru_list_lock);
983     	err2 = osync_inode_buffers(inode);
984     
985     	if (err)
986     		return err;
987     	else
988     		return err2;
989     }
990     
991     int fsync_inode_data_buffers(struct inode *inode)
992     {
993     	struct buffer_head *bh;
994     	struct inode tmp;
995     	int err = 0, err2;
996     	
997     	INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
998     	
999     	spin_lock(&lru_list_lock);
1000     
1001     	while (!list_empty(&inode->i_dirty_data_buffers)) {
1002     		bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
1003     		list_del(&bh->b_inode_buffers);
1004     		if (!buffer_dirty(bh) && !buffer_locked(bh))
1005     			bh->b_inode = NULL;
1006     		else {
1007     			bh->b_inode = &tmp;
1008     			list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
1009     			if (buffer_dirty(bh)) {
1010     				get_bh(bh);
1011     				spin_unlock(&lru_list_lock);
1012     				ll_rw_block(WRITE, 1, &bh);
1013     				brelse(bh);
1014     				spin_lock(&lru_list_lock);
1015     			}
1016     		}
1017     	}
1018     
1019     	while (!list_empty(&tmp.i_dirty_data_buffers)) {
1020     		bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
1021     		remove_inode_queue(bh);
1022     		get_bh(bh);
1023     		spin_unlock(&lru_list_lock);
1024     		wait_on_buffer(bh);
1025     		if (!buffer_uptodate(bh))
1026     			err = -EIO;
1027     		brelse(bh);
1028     		spin_lock(&lru_list_lock);
1029     	}
1030     	
1031     	spin_unlock(&lru_list_lock);
1032     	err2 = osync_inode_data_buffers(inode);
1033     
1034     	if (err)
1035     		return err;
1036     	else
1037     		return err2;
1038     }
1039     
1040     /*
1041      * osync is designed to support O_SYNC io.  It waits synchronously for
1042      * all already-submitted IO to complete, but does not queue any new
1043      * writes to the disk.
1044      *
1045      * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
1046      * you dirty the buffers, and then use osync_inode_buffers to wait for
1047      * completion.  Any other dirty buffers which are not yet queued for
1048      * write will not be flushed to disk by the osync.
1049      */
1050     
1051     int osync_inode_buffers(struct inode *inode)
1052     {
1053     	struct buffer_head *bh;
1054     	struct list_head *list;
1055     	int err = 0;
1056     
1057     	spin_lock(&lru_list_lock);
1058     	
1059      repeat:
1060     	
1061     	for (list = inode->i_dirty_buffers.prev; 
1062     	     bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
1063     	     list = bh->b_inode_buffers.prev) {
1064     		if (buffer_locked(bh)) {
1065     			get_bh(bh);
1066     			spin_unlock(&lru_list_lock);
1067     			wait_on_buffer(bh);
1068     			if (!buffer_uptodate(bh))
1069     				err = -EIO;
1070     			brelse(bh);
1071     			spin_lock(&lru_list_lock);
1072     			goto repeat;
1073     		}
1074     	}
1075     
1076     	spin_unlock(&lru_list_lock);
1077     	return err;
1078     }
1079     
1080     int osync_inode_data_buffers(struct inode *inode)
1081     {
1082     	struct buffer_head *bh;
1083     	struct list_head *list;
1084     	int err = 0;
1085     
1086     	spin_lock(&lru_list_lock);
1087     	
1088      repeat:
1089     
1090     	for (list = inode->i_dirty_data_buffers.prev; 
1091     	     bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
1092     	     list = bh->b_inode_buffers.prev) {
1093     		if (buffer_locked(bh)) {
1094     			get_bh(bh);
1095     			spin_unlock(&lru_list_lock);
1096     			wait_on_buffer(bh);
1097     			if (!buffer_uptodate(bh))
1098     				err = -EIO;
1099     			brelse(bh);
1100     			spin_lock(&lru_list_lock);
1101     			goto repeat;
1102     		}
1103     	}
1104     
1105     	spin_unlock(&lru_list_lock);
1106     	return err;
1107     }
1108     
1109     
1110     /*
1111      * Invalidate any and all dirty buffers on a given inode.  We are
1112      * probably unmounting the fs, but that doesn't mean we have already
1113      * done a sync().  Just drop the buffers from the inode list.
1114      */
1115     void invalidate_inode_buffers(struct inode *inode)
1116     {
1117     	struct list_head * entry;
1118     	
1119     	spin_lock(&lru_list_lock);
1120     	while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
1121     		remove_inode_queue(BH_ENTRY(entry));
1122     	while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
1123     		remove_inode_queue(BH_ENTRY(entry));
1124     	spin_unlock(&lru_list_lock);
1125     }
1126     
1127     
1128     /*
1129      * Ok, this is getblk, and it isn't very clear, again to hinder
1130      * race-conditions. Most of the code is seldom used, (ie repeating),
1131      * so it should be much more efficient than it looks.
1132      *
1133      * The algorithm is changed: hopefully better, and an elusive bug removed.
1134      *
1135      * 14.02.92: changed it to sync dirty buffers a bit: better performance
1136      * when the filesystem starts to get full of dirty blocks (I hope).
1137      */
1138     struct buffer_head * getblk(kdev_t dev, int block, int size)
1139     {
1140     	struct buffer_head * bh;
1141     	int isize;
1142     
1143     repeat:
1144     	spin_lock(&lru_list_lock);
1145     	write_lock(&hash_table_lock);
1146     	bh = __get_hash_table(dev, block, size);
1147     	if (bh)
1148     		goto out;
1149     
1150     	isize = BUFSIZE_INDEX(size);
1151     	spin_lock(&free_list[isize].lock);
1152     	bh = free_list[isize].list;
1153     	if (bh) {
1154     		__remove_from_free_list(bh, isize);
1155     		atomic_set(&bh->b_count, 1);
1156     	}
1157     	spin_unlock(&free_list[isize].lock);
1158     
1159     	/*
1160     	 * OK, FINALLY we know that this buffer is the only one of
1161     	 * its kind, we hold a reference (b_count>0), it is unlocked,
1162     	 * and it is clean.
1163     	 */
1164     	if (bh) {
1165     		init_buffer(bh, NULL, NULL);
1166     		bh->b_dev = dev;
1167     		bh->b_blocknr = block;
1168     		bh->b_state = 1 << BH_Mapped;
1169     
1170     		/* Insert the buffer into the regular lists */
1171     		__insert_into_queues(bh);
1172     	out:
1173     		write_unlock(&hash_table_lock);
1174     		spin_unlock(&lru_list_lock);
1175     		touch_buffer(bh);
1176     		return bh;
1177     	}
1178     
1179     	/*
1180     	 * If we block while refilling the free list, somebody may
1181     	 * create the buffer first ... search the hashes again.
1182     	 */
1183     	write_unlock(&hash_table_lock);
1184     	spin_unlock(&lru_list_lock);
1185     
1186     	if (!grow_buffers(size))
1187     		free_more_memory();
1188     
1189     	/* FIXME: getblk should fail if there's no enough memory */
1190     	goto repeat;
1191     }
1192     
1193     /* -1 -> no need to flush
1194         0 -> async flush
1195         1 -> sync flush (wait for I/O completion) */
1196     static int balance_dirty_state(void)
1197     {
1198     	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1199     
1200     	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1201     	tot = nr_free_buffer_pages();
1202     
1203     	dirty *= 100;
1204     	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1205     	hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1206     
1207     	/* First, check for the "real" dirty limit. */
1208     	if (dirty > soft_dirty_limit) {
1209     		if (dirty > hard_dirty_limit)
1210     			return 1;
1211     		return 0;
1212     	}
1213     
1214     	return -1;
1215     }
1216     
1217     /*
1218      * if a new dirty buffer is created we need to balance bdflush.
1219      *
1220      * in the future we might want to make bdflush aware of different
1221      * pressures on different devices - thus the (currently unused)
1222      * 'dev' parameter.
1223      */
1224     void balance_dirty(void)
1225     {
1226     	int state = balance_dirty_state();
1227     
1228     	if (state < 0)
1229     		return;
1230     
1231     	/* If we're getting into imbalance, start write-out */
1232     	spin_lock(&lru_list_lock);
1233     	write_some_buffers(NODEV);
1234     
1235     	/*
1236     	 * And if we're _really_ out of balance, wait for
1237     	 * some of the dirty/locked buffers ourselves and
1238     	 * start bdflush.
1239     	 * This will throttle heavy writers.
1240     	 */
1241     	if (state > 0) {
1242     		wait_for_some_buffers(NODEV);
1243     		wakeup_bdflush();
1244     	}
1245     }
1246     
1247     inline void __mark_dirty(struct buffer_head *bh)
1248     {
1249     	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1250     	refile_buffer(bh);
1251     }
1252     
/*
 * atomic version, the user must call balance_dirty() by hand
 * as soon as it becomes possible to block.
 *
 * __mark_dirty() is only run when atomic_set_buffer_dirty() returns
 * zero.
 */
void __mark_buffer_dirty(struct buffer_head *bh)
{
	if (atomic_set_buffer_dirty(bh))
		return;

	__mark_dirty(bh);
}
1260     
/*
 * Mark a buffer dirty and, when atomic_set_buffer_dirty() returns
 * zero, refile it through __mark_dirty() and run balance_dirty().
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	if (atomic_set_buffer_dirty(bh))
		return;

	__mark_dirty(bh);
	balance_dirty();
}
1268     
1269     /*
1270      * A buffer may need to be moved from one buffer list to another
1271      * (e.g. in case it is not shared any more). Handle this.
1272      */
1273     static void __refile_buffer(struct buffer_head *bh)
1274     {
1275     	int dispose = BUF_CLEAN;
1276     	if (buffer_locked(bh))
1277     		dispose = BUF_LOCKED;
1278     	if (buffer_dirty(bh))
1279     		dispose = BUF_DIRTY;
1280     	if (dispose != bh->b_list) {
1281     		__remove_from_lru_list(bh, bh->b_list);
1282     		bh->b_list = dispose;
1283     		if (dispose == BUF_CLEAN)
1284     			remove_inode_queue(bh);
1285     		__insert_into_lru_list(bh, dispose);
1286     	}
1287     }
1288     
1289     void refile_buffer(struct buffer_head *bh)
1290     {
1291     	spin_lock(&lru_list_lock);
1292     	__refile_buffer(bh);
1293     	spin_unlock(&lru_list_lock);
1294     }
1295     
1296     /*
1297      * Release a buffer head
1298      */
1299     void __brelse(struct buffer_head * buf)
1300     {
1301     	if (atomic_read(&buf->b_count)) {
1302     		put_bh(buf);
1303     		return;
1304     	}
1305     	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1306     }
1307     
1308     /*
1309      * bforget() is like brelse(), except it puts the buffer on the
1310      * free list if it can.. We can NOT free the buffer if:
1311      *  - there are other users of it
1312      *  - it is locked and thus can have active IO
1313      */
1314     void __bforget(struct buffer_head * buf)
1315     {
1316     	/* grab the lru lock here to block bdflush. */
1317     	spin_lock(&lru_list_lock);
1318     	write_lock(&hash_table_lock);
1319     	if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1320     		goto in_use;
1321     	__hash_unlink(buf);
1322     	write_unlock(&hash_table_lock);
1323     	remove_inode_queue(buf);
1324     	__remove_from_lru_list(buf, buf->b_list);
1325     	spin_unlock(&lru_list_lock);
1326     	put_last_free(buf);
1327     	return;
1328     
1329      in_use:
1330     	write_unlock(&hash_table_lock);
1331     	spin_unlock(&lru_list_lock);
1332     }
1333     
1334     /**
1335      *	bread() - reads a specified block and returns the bh
1336      *	@block: number of block
1337      *	@size: size (in bytes) to read
1338      * 
1339      *	Reads a specified block, and returns buffer head that
1340      *	contains it. It returns NULL if the block was unreadable.
1341      */
1342     struct buffer_head * bread(kdev_t dev, int block, int size)
1343     {
1344     	struct buffer_head * bh;
1345     
1346     	bh = getblk(dev, block, size);
1347     	if (buffer_uptodate(bh))
1348     		return bh;
1349     	ll_rw_block(READ, 1, &bh);
1350     	wait_on_buffer(bh);
1351     	if (buffer_uptodate(bh))
1352     		return bh;
1353     	brelse(bh);
1354     	return NULL;
1355     }
1356     
1357     /*
1358      * Note: the caller should wake up the buffer_wait list if needed.
1359      */
1360     static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1361     {
1362     	if (bh->b_inode)
1363     		BUG();
1364     	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1365     		kmem_cache_free(bh_cachep, bh);
1366     	} else {
1367     		bh->b_blocknr = -1;
1368     		bh->b_this_page = NULL;
1369     
1370     		nr_unused_buffer_heads++;
1371     		bh->b_next_free = unused_list;
1372     		unused_list = bh;
1373     	}
1374     }
1375     
1376     /*
1377      * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1378      * no-buffer-head deadlock.  Return NULL on failure; waiting for
1379      * buffer heads is now handled in create_buffers().
1380      */ 
1381     static struct buffer_head * get_unused_buffer_head(int async)
1382     {
1383     	struct buffer_head * bh;
1384     
1385     	spin_lock(&unused_list_lock);
1386     	if (nr_unused_buffer_heads > NR_RESERVED) {
1387     		bh = unused_list;
1388     		unused_list = bh->b_next_free;
1389     		nr_unused_buffer_heads--;
1390     		spin_unlock(&unused_list_lock);
1391     		return bh;
1392     	}
1393     	spin_unlock(&unused_list_lock);
1394     
1395     	/* This is critical.  We can't call out to the FS
1396     	 * to get more buffer heads, because the FS may need
1397     	 * more buffer-heads itself.  Thus SLAB_NOFS.
1398     	 */
1399     	if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1400     		bh->b_blocknr = -1;
1401     		bh->b_this_page = NULL;
1402     		return bh;
1403     	}
1404     
1405     	/*
1406     	 * If we need an async buffer, use the reserved buffer heads.
1407     	 */
1408     	if (async) {
1409     		spin_lock(&unused_list_lock);
1410     		if (unused_list) {
1411     			bh = unused_list;
1412     			unused_list = bh->b_next_free;
1413     			nr_unused_buffer_heads--;
1414     			spin_unlock(&unused_list_lock);
1415     			return bh;
1416     		}
1417     		spin_unlock(&unused_list_lock);
1418     	}
1419     #if 0
1420     	/*
1421     	 * (Pending further analysis ...)
1422     	 * Ordinary (non-async) requests can use a different memory priority
1423     	 * to free up pages. Any swapping thus generated will use async
1424     	 * buffer heads.
1425     	 */
1426     	if(!async &&
1427     	   (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1428     		memset(bh, 0, sizeof(*bh));
1429     		init_waitqueue_head(&bh->b_wait);
1430     		return bh;
1431     	}
1432     #endif
1433     
1434     	return NULL;
1435     }
1436     
1437     void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1438     {
1439     	bh->b_page = page;
1440     	if (offset >= PAGE_SIZE)
1441     		BUG();
1442     	if (PageHighMem(page))
1443     		/*
1444     		 * This catches illegal uses and preserves the offset:
1445     		 */
1446     		bh->b_data = (char *)(0 + offset);
1447     	else
1448     		bh->b_data = page_address(page) + offset;
1449     }
1450     
1451     /*
1452      * Create the appropriate buffers when given a page for data area and
1453      * the size of each buffer.. Use the bh->b_this_page linked list to
1454      * follow the buffers created.  Return NULL if unable to create more
1455      * buffers.
1456      * The async flag is used to differentiate async IO (paging, swapping)
1457      * from ordinary buffer allocations, and only async requests are allowed
1458      * to sleep waiting for buffer heads. 
1459      */
1460     static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1461     {
1462     	struct buffer_head *bh, *head;
1463     	long offset;
1464     
1465     try_again:
1466     	head = NULL;
1467     	offset = PAGE_SIZE;
1468     	while ((offset -= size) >= 0) {
1469     		bh = get_unused_buffer_head(async);
1470     		if (!bh)
1471     			goto no_grow;
1472     
1473     		bh->b_dev = B_FREE;  /* Flag as unused */
1474     		bh->b_this_page = head;
1475     		head = bh;
1476     
1477     		bh->b_state = 0;
1478     		bh->b_next_free = NULL;
1479     		bh->b_pprev = NULL;
1480     		atomic_set(&bh->b_count, 0);
1481     		bh->b_size = size;
1482     
1483     		set_bh_page(bh, page, offset);
1484     
1485     		bh->b_list = BUF_CLEAN;
1486     		bh->b_end_io = NULL;
1487     	}
1488     	return head;
1489     /*
1490      * In case anything failed, we just free everything we got.
1491      */
1492     no_grow:
1493     	if (head) {
1494     		spin_lock(&unused_list_lock);
1495     		do {
1496     			bh = head;
1497     			head = head->b_this_page;
1498     			__put_unused_buffer_head(bh);
1499     		} while (head);
1500     		spin_unlock(&unused_list_lock);
1501     
1502     		/* Wake up any waiters ... */
1503     		wake_up(&buffer_wait);
1504     	}
1505     
1506     	/*
1507     	 * Return failure for non-async IO requests.  Async IO requests
1508     	 * are not allowed to fail, so we have to wait until buffer heads
1509     	 * become available.  But we don't want tasks sleeping with 
1510     	 * partially complete buffers, so all were released above.
1511     	 */
1512     	if (!async)
1513     		return NULL;
1514     
1515     	/* We're _really_ low on memory. Now we just
1516     	 * wait for old buffer heads to become free due to
1517     	 * finishing IO.  Since this is an async request and
1518     	 * the reserve list is empty, we're sure there are 
1519     	 * async buffer heads in use.
1520     	 */
1521     	run_task_queue(&tq_disk);
1522     
1523     	free_more_memory();
1524     	goto try_again;
1525     }
1526     
1527     static void unmap_buffer(struct buffer_head * bh)
1528     {
1529     	if (buffer_mapped(bh)) {
1530     		mark_buffer_clean(bh);
1531     		lock_buffer(bh);
1532     		clear_bit(BH_Uptodate, &bh->b_state);
1533     		clear_bit(BH_Mapped, &bh->b_state);
1534     		clear_bit(BH_Req, &bh->b_state);
1535     		clear_bit(BH_New, &bh->b_state);
1536     		unlock_buffer(bh);
1537     	}
1538     }
1539     
1540     /*
1541      * We don't have to release all buffers here, but
1542      * we have to be sure that no dirty buffer is left
1543      * and no IO is going on (no buffer is locked), because
1544      * we have truncated the file and are going to free the
1545      * blocks on-disk..
1546      */
1547     int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1548     {
1549     	struct buffer_head *head, *bh, *next;
1550     	unsigned int curr_off = 0;
1551     
1552     	if (!PageLocked(page))
1553     		BUG();
1554     	if (!page->buffers)
1555     		return 1;
1556     
1557     	head = page->buffers;
1558     	bh = head;
1559     	do {
1560     		unsigned int next_off = curr_off + bh->b_size;
1561     		next = bh->b_this_page;
1562     
1563     		/*
1564     		 * is this block fully flushed?
1565     		 */
1566     		if (offset <= curr_off)
1567     			unmap_buffer(bh);
1568     		curr_off = next_off;
1569     		bh = next;
1570     	} while (bh != head);
1571     
1572     	/*
1573     	 * subtle. We release buffer-heads only if this is
1574     	 * the 'final' flushpage. We have invalidated the get_block
1575     	 * cached value unconditionally, so real IO is not
1576     	 * possible anymore.
1577     	 *
1578     	 * If the free doesn't work out, the buffers can be
1579     	 * left around - they just turn into anonymous buffers
1580     	 * instead.
1581     	 */
1582     	if (!offset) {
1583     		if (!try_to_free_buffers(page, 0)) {
1584     			if (drop_pagecache)
1585     				atomic_inc(&buffermem_pages);
1586     			return 0;
1587     		}
1588     	}
1589     
1590     	return 1;
1591     }
1592     
1593     void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1594     {
1595     	struct buffer_head *bh, *head, *tail;
1596     
1597     	/* FIXME: create_buffers should fail if there's no enough memory */
1598     	head = create_buffers(page, blocksize, 1);
1599     	if (page->buffers)
1600     		BUG();
1601     
1602     	bh = head;
1603     	do {
1604     		bh->b_dev = dev;
1605     		bh->b_blocknr = 0;
1606     		bh->b_end_io = NULL;
1607     		tail = bh;
1608     		bh = bh->b_this_page;
1609     	} while (bh);
1610     	tail->b_this_page = head;
1611     	page->buffers = head;
1612     	page_cache_get(page);
1613     }
1614     
1615     /*
1616      * We are taking a block for data and we don't want any output from any
1617      * buffer-cache aliases starting from return from that function and
1618      * until the moment when something will explicitly mark the buffer
1619      * dirty (hopefully that will not happen until we will free that block ;-)
1620      * We don't even need to mark it not-uptodate - nobody can expect
1621      * anything from a newly allocated buffer anyway. We used to used
1622      * unmap_buffer() for such invalidation, but that was wrong. We definitely
1623      * don't want to mark the alias unmapped, for example - it would confuse
1624      * anyone who might pick it with bread() afterwards...
1625      */
1626     
1627     static void unmap_underlying_metadata(struct buffer_head * bh)
1628     {
1629     	struct buffer_head *old_bh;
1630     
1631     	old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1632     	if (old_bh) {
1633     		mark_buffer_clean(old_bh);
1634     		wait_on_buffer(old_bh);
1635     		clear_bit(BH_Req, &old_bh->b_state);
1636     		/* Here we could run brelse or bforget. We use
1637     		   bforget because it will try to put the buffer
1638     		   in the freelist. */
1639     		__bforget(old_bh);
1640     	}
1641     }
1642     
1643     /*
1644      * NOTE! All mapped/uptodate combinations are valid:
1645      *
1646      *	Mapped	Uptodate	Meaning
1647      *
1648      *	No	No		"unknown" - must do get_block()
1649      *	No	Yes		"hole" - zero-filled
1650      *	Yes	No		"allocated" - allocated on disk, not read in
1651      *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1652      *
1653      * "Dirty" is valid only with the last case (mapped+uptodate).
1654      */
1655     
1656     /*
1657      * block_write_full_page() is SMP-safe - currently it's still
1658      * being called with the kernel lock held, but the code is ready.
1659      */
1660     static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1661     {
1662     	int err, i;
1663     	unsigned long block;
1664     	struct buffer_head *bh, *head;
1665     
1666     	if (!PageLocked(page))
1667     		BUG();
1668     
1669     	if (!page->buffers)
1670     		create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize);
1671     	head = page->buffers;
1672     
1673     	block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1674     
1675     	bh = head;
1676     	i = 0;
1677     
1678     	/* Stage 1: make sure we have all the buffers mapped! */
1679     	do {
1680     		/*
1681     		 * If the buffer isn't up-to-date, we can't be sure
1682     		 * that the buffer has been initialized with the proper
1683     		 * block number information etc..
1684     		 *
1685     		 * Leave it to the low-level FS to make all those
1686     		 * decisions (block #0 may actually be a valid block)
1687     		 */
1688     		if (!buffer_mapped(bh)) {
1689     			err = get_block(inode, block, bh, 1);
1690     			if (err)
1691     				goto out;
1692     			if (buffer_new(bh))
1693     				unmap_underlying_metadata(bh);
1694     		}
1695     		bh = bh->b_this_page;
1696     		block++;
1697     	} while (bh != head);
1698     
1699     	/* Stage 2: lock the buffers, mark them clean */
1700     	do {
1701     		lock_buffer(bh);
1702     		set_buffer_async_io(bh);
1703     		set_bit(BH_Uptodate, &bh->b_state);
1704     		clear_bit(BH_Dirty, &bh->b_state);
1705     		bh = bh->b_this_page;
1706     	} while (bh != head);
1707     
1708     	/* Stage 3: submit the IO */
1709     	do {
1710     		struct buffer_head *next = bh->b_this_page;
1711     		submit_bh(WRITE, bh);
1712     		bh = next;
1713     	} while (bh != head);
1714     
1715     	/* Done - end_buffer_io_async will unlock */
1716     	SetPageUptodate(page);
1717     	return 0;
1718     
1719     out:
1720     	ClearPageUptodate(page);
1721     	UnlockPage(page);
1722     	return err;
1723     }
1724     
1725     static int __block_prepare_write(struct inode *inode, struct page *page,
1726     		unsigned from, unsigned to, get_block_t *get_block)
1727     {
1728     	unsigned block_start, block_end;
1729     	unsigned long block;
1730     	int err = 0;
1731     	unsigned blocksize, bbits;
1732     	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1733     	char *kaddr = kmap(page);
1734     
1735     	blocksize = inode->i_sb->s_blocksize;
1736     	if (!page->buffers)
1737     		create_empty_buffers(page, inode->i_dev, blocksize);
1738     	head = page->buffers;
1739     
1740     	bbits = inode->i_sb->s_blocksize_bits;
1741     	block = page->index << (PAGE_CACHE_SHIFT - bbits);
1742     
1743     	for(bh = head, block_start = 0; bh != head || !block_start;
1744     	    block++, block_start=block_end, bh = bh->b_this_page) {
1745     		if (!bh)
1746     			BUG();
1747     		block_end = block_start+blocksize;
1748     		if (block_end <= from)
1749     			continue;
1750     		if (block_start >= to)
1751     			break;
1752     		if (!buffer_mapped(bh)) {
1753     			err = get_block(inode, block, bh, 1);
1754     			if (err)
1755     				goto out;
1756     			if (buffer_new(bh)) {
1757     				unmap_underlying_metadata(bh);
1758     				if (Page_Uptodate(page)) {
1759     					set_bit(BH_Uptodate, &bh->b_state);
1760     					continue;
1761     				}
1762     				if (block_end > to)
1763     					memset(kaddr+to, 0, block_end-to);
1764     				if (block_start < from)
1765     					memset(kaddr+block_start, 0, from-block_start);
1766     				if (block_end > to || block_start < from)
1767     					flush_dcache_page(page);
1768     				continue;
1769     			}
1770     		}
1771     		if (Page_Uptodate(page)) {
1772     			set_bit(BH_Uptodate, &bh->b_state);
1773     			continue; 
1774     		}
1775     		if (!buffer_uptodate(bh) &&
1776     		     (block_start < from || block_end > to)) {
1777     			ll_rw_block(READ, 1, &bh);
1778     			*wait_bh++=bh;
1779     		}
1780     	}
1781     	/*
1782     	 * If we issued read requests - let them complete.
1783     	 */
1784     	while(wait_bh > wait) {
1785     		wait_on_buffer(*--wait_bh);
1786     		err = -EIO;
1787     		if (!buffer_uptodate(*wait_bh))
1788     			goto out;
1789     	}
1790     	return 0;
1791     out:
1792     	return err;
1793     }
1794     
1795     static int __block_commit_write(struct inode *inode, struct page *page,
1796     		unsigned from, unsigned to)
1797     {
1798     	unsigned block_start, block_end;
1799     	int partial = 0, need_balance_dirty = 0;
1800     	unsigned blocksize;
1801     	struct buffer_head *bh, *head;
1802     
1803     	blocksize = inode->i_sb->s_blocksize;
1804     
1805     	for(bh = head = page->buffers, block_start = 0;
1806     	    bh != head || !block_start;
1807     	    block_start=block_end, bh = bh->b_this_page) {
1808     		block_end = block_start + blocksize;
1809     		if (block_end <= from || block_start >= to) {
1810     			if (!buffer_uptodate(bh))
1811     				partial = 1;
1812     		} else {
1813     			set_bit(BH_Uptodate, &bh->b_state);
1814     			if (!atomic_set_buffer_dirty(bh)) {
1815     				__mark_dirty(bh);
1816     				buffer_insert_inode_data_queue(bh, inode);
1817     				need_balance_dirty = 1;
1818     			}
1819     		}
1820     	}
1821     
1822     	if (need_balance_dirty)
1823     		balance_dirty();
1824     	/*
1825     	 * is this a partial write that happened to make all buffers
1826     	 * uptodate then we can optimize away a bogus readpage() for
1827     	 * the next read(). Here we 'discover' wether the page went
1828     	 * uptodate as a result of this (potentially partial) write.
1829     	 */
1830     	if (!partial)
1831     		SetPageUptodate(page);
1832     	return 0;
1833     }
1834     
1835     /*
1836      * Generic "read page" function for block devices that have the normal
1837      * get_block functionality. This is most of the block device filesystems.
1838      * Reads the page asynchronously --- the unlock_buffer() and
1839      * mark_buffer_uptodate() functions propagate buffer state into the
1840      * page struct once IO has completed.
1841      */
1842     int block_read_full_page(struct page *page, get_block_t *get_block)
1843     {
1844     	struct inode *inode = page->mapping->host;
1845     	unsigned long iblock, lblock;
1846     	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1847     	unsigned int blocksize, blocks;
1848     	int nr, i;
1849     
1850     	if (!PageLocked(page))
1851     		PAGE_BUG(page);
1852     	blocksize = inode->i_sb->s_blocksize;
1853     	if (!page->buffers)
1854     		create_empty_buffers(page, inode->i_dev, blocksize);
1855     	head = page->buffers;
1856     
1857     	blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1858     	iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1859     	lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1860     	bh = head;
1861     	nr = 0;
1862     	i = 0;
1863     
1864     	do {
1865     		if (buffer_uptodate(bh))
1866     			continue;
1867     
1868     		if (!buffer_mapped(bh)) {
1869     			if (iblock < lblock) {
1870     				if (get_block(inode, iblock, bh, 0))
1871     					continue;
1872     			}
1873     			if (!buffer_mapped(bh)) {
1874     				memset(kmap(page) + i*blocksize, 0, blocksize);
1875     				flush_dcache_page(page);
1876     				kunmap(page);
1877     				set_bit(BH_Uptodate, &bh->b_state);
1878     				continue;
1879     			}
1880     			/* get_block() might have updated the buffer synchronously */
1881     			if (buffer_uptodate(bh))
1882     				continue;
1883     		}
1884     
1885     		arr[nr] = bh;
1886     		nr++;
1887     	} while (i++, iblock++, (bh = bh->b_this_page) != head);
1888     
1889     	if (!nr) {
1890     		/*
1891     		 * all buffers are uptodate - we can set the page
1892     		 * uptodate as well.
1893     		 */
1894     		SetPageUptodate(page);
1895     		UnlockPage(page);
1896     		return 0;
1897     	}
1898     
1899     	/* Stage two: lock the buffers */
1900     	for (i = 0; i < nr; i++) {
1901     		struct buffer_head * bh = arr[i];
1902     		lock_buffer(bh);
1903     		set_buffer_async_io(bh);
1904     	}
1905     
1906     	/* Stage 3: start the IO */
1907     	for (i = 0; i < nr; i++)
1908     		submit_bh(READ, arr[i]);
1909     
1910     	return 0;
1911     }
1912     
1913     /*
1914      * For moronic filesystems that do not allow holes in file.
1915      * We may have to extend the file.
1916      */
1917     
1918     int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1919     {
1920     	struct address_space *mapping = page->mapping;
1921     	struct inode *inode = mapping->host;
1922     	struct page *new_page;
1923     	unsigned long pgpos;
1924     	long status;
1925     	unsigned zerofrom;
1926     	unsigned blocksize = inode->i_sb->s_blocksize;
1927     	char *kaddr;
1928     
1929     	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1930     		status = -ENOMEM;
1931     		new_page = grab_cache_page(mapping, pgpos);
1932     		if (!new_page)
1933     			goto out;
1934     		/* we might sleep */
1935     		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1936     			UnlockPage(new_page);
1937     			page_cache_release(new_page);
1938     			continue;
1939     		}
1940     		zerofrom = *bytes & ~PAGE_CACHE_MASK;
1941     		if (zerofrom & (blocksize-1)) {
1942     			*bytes |= (blocksize-1);
1943     			(*bytes)++;
1944     		}
1945     		status = __block_prepare_write(inode, new_page, zerofrom,
1946     						PAGE_CACHE_SIZE, get_block);
1947     		if (status)
1948     			goto out_unmap;
1949     		kaddr = page_address(new_page);
1950     		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1951     		flush_dcache_page(new_page);
1952     		__block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1953     		kunmap(new_page);
1954     		UnlockPage(new_page);
1955     		page_cache_release(new_page);
1956     	}
1957     
1958     	if (page->index < pgpos) {
1959     		/* completely inside the area */
1960     		zerofrom = offset;
1961     	} else {
1962     		/* page covers the boundary, find the boundary offset */
1963     		zerofrom = *bytes & ~PAGE_CACHE_MASK;
1964     
1965     		/* if we will expand the thing last block will be filled */
1966     		if (to > zerofrom && (zerofrom & (blocksize-1))) {
1967     			*bytes |= (blocksize-1);
1968     			(*bytes)++;
1969     		}
1970     
1971     		/* starting below the boundary? Nothing to zero out */
1972     		if (offset <= zerofrom)
1973     			zerofrom = offset;
1974     	}
1975     	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1976     	if (status)
1977     		goto out1;
1978     	kaddr = page_address(page);
1979     	if (zerofrom < offset) {
1980     		memset(kaddr+zerofrom, 0, offset-zerofrom);
1981     		flush_dcache_page(page);
1982     		__block_commit_write(inode, page, zerofrom, offset);
1983     	}
1984     	return 0;
1985     out1:
1986     	ClearPageUptodate(page);
1987     	kunmap(page);
1988     	return status;
1989     
1990     out_unmap:
1991     	ClearPageUptodate(new_page);
1992     	kunmap(new_page);
1993     	UnlockPage(new_page);
1994     	page_cache_release(new_page);
1995     out:
1996     	return status;
1997     }
1998     
1999     int block_prepare_write(struct page *page, unsigned from, unsigned to,
2000     			get_block_t *get_block)
2001     {
2002     	struct inode *inode = page->mapping->host;
2003     	int err = __block_prepare_write(inode, page, from, to, get_block);
2004     	if (err) {
2005     		ClearPageUptodate(page);
2006     		kunmap(page);
2007     	}
2008     	return err;
2009     }
2010     
2011     int generic_commit_write(struct file *file, struct page *page,
2012     		unsigned from, unsigned to)
2013     {
2014     	struct inode *inode = page->mapping->host;
2015     	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2016     	__block_commit_write(inode,page,from,to);
2017     	kunmap(page);
2018     	if (pos > inode->i_size) {
2019     		inode->i_size = pos;
2020     		mark_inode_dirty(inode);
2021     	}
2022     	return 0;
2023     }
2024     
2025     int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2026     {
2027     	unsigned long index = from >> PAGE_CACHE_SHIFT;
2028     	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2029     	unsigned blocksize, iblock, length, pos;
2030     	struct inode *inode = mapping->host;
2031     	struct page *page;
2032     	struct buffer_head *bh;
2033     	int err;
2034     
2035     	blocksize = inode->i_sb->s_blocksize;
2036     	length = offset & (blocksize - 1);
2037     
2038     	/* Block boundary? Nothing to do */
2039     	if (!length)
2040     		return 0;
2041     
2042     	length = blocksize - length;
2043     	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
2044     	
2045     	page = grab_cache_page(mapping, index);
2046     	err = -ENOMEM;
2047     	if (!page)
2048     		goto out;
2049     
2050     	if (!page->buffers)
2051     		create_empty_buffers(page, inode->i_dev, blocksize);
2052     
2053     	/* Find the buffer that contains "offset" */
2054     	bh = page->buffers;
2055     	pos = blocksize;
2056     	while (offset >= pos) {
2057     		bh = bh->b_this_page;
2058     		iblock++;
2059     		pos += blocksize;
2060     	}
2061     
2062     	err = 0;
2063     	if (!buffer_mapped(bh)) {
2064     		/* Hole? Nothing to do */
2065     		if (buffer_uptodate(bh))
2066     			goto unlock;
2067     		get_block(inode, iblock, bh, 0);
2068     		/* Still unmapped? Nothing to do */
2069     		if (!buffer_mapped(bh))
2070     			goto unlock;
2071     	}
2072     
2073     	/* Ok, it's mapped. Make sure it's up-to-date */
2074     	if (Page_Uptodate(page))
2075     		set_bit(BH_Uptodate, &bh->b_state);
2076     
2077     	if (!buffer_uptodate(bh)) {
2078     		err = -EIO;
2079     		ll_rw_block(READ, 1, &bh);
2080     		wait_on_buffer(bh);
2081     		/* Uhhuh. Read error. Complain and punt. */
2082     		if (!buffer_uptodate(bh))
2083     			goto unlock;
2084     	}
2085     
2086     	memset(kmap(page) + offset, 0, length);
2087     	flush_dcache_page(page);
2088     	kunmap(page);
2089     
2090     	__mark_buffer_dirty(bh);
2091     	err = 0;
2092     
2093     unlock:
2094     	UnlockPage(page);
2095     	page_cache_release(page);
2096     out:
2097     	return err;
2098     }
2099     
2100     int block_write_full_page(struct page *page, get_block_t *get_block)
2101     {
2102     	struct inode *inode = page->mapping->host;
2103     	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2104     	unsigned offset;
2105     	int err;
2106     
2107     	/* easy case */
2108     	if (page->index < end_index)
2109     		return __block_write_full_page(inode, page, get_block);
2110     
2111     	/* things got complicated... */
2112     	offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2113     	/* OK, are we completely out? */
2114     	if (page->index >= end_index+1 || !offset) {
2115     		UnlockPage(page);
2116     		return -EIO;
2117     	}
2118     
2119     	/* Sigh... will have to work, then... */
2120     	err = __block_prepare_write(inode, page, 0, offset, get_block);
2121     	if (!err) {
2122     		memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2123     		flush_dcache_page(page);
2124     		__block_commit_write(inode,page,0,offset);
2125     done:
2126     		kunmap(page);
2127     		UnlockPage(page);
2128     		return err;
2129     	}
2130     	ClearPageUptodate(page);
2131     	goto done;
2132     }
2133     
2134     int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2135     {
2136     	struct buffer_head tmp;
2137     	struct inode *inode = mapping->host;
2138     	tmp.b_state = 0;
2139     	tmp.b_blocknr = 0;
2140     	get_block(inode, block, &tmp, 0);
2141     	return tmp.b_blocknr;
2142     }
2143     
2144     int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2145     {
2146     	int i, nr_blocks, retval;
2147     	unsigned long * blocks = iobuf->blocks;
2148     
2149     	nr_blocks = iobuf->length / blocksize;
2150     	/* build the blocklist */
2151     	for (i = 0; i < nr_blocks; i++, blocknr++) {
2152     		struct buffer_head bh;
2153     
2154     		bh.b_state = 0;
2155     		bh.b_dev = inode->i_dev;
2156     		bh.b_size = blocksize;
2157     
2158     		retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
2159     		if (retval)
2160     			goto out;
2161     
2162     		if (rw == READ) {
2163     			if (buffer_new(&bh))
2164     				BUG();
2165     			if (!buffer_mapped(&bh)) {
2166     				/* there was an hole in the filesystem */
2167     				blocks[i] = -1UL;
2168     				continue;
2169     			}
2170     		} else {
2171     			if (buffer_new(&bh))
2172     				unmap_underlying_metadata(&bh);
2173     			if (!buffer_mapped(&bh))
2174     				BUG();
2175     		}
2176     		blocks[i] = bh.b_blocknr;
2177     	}
2178     
2179     	retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2180     
2181      out:
2182     	return retval;
2183     }
2184     
2185     /*
2186      * IO completion routine for a buffer_head being used for kiobuf IO: we
2187      * can't dispatch the kiobuf callback until io_count reaches 0.  
2188      */
2189     
2190     static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2191     {
2192     	struct kiobuf *kiobuf;
2193     	
2194     	mark_buffer_uptodate(bh, uptodate);
2195     
2196     	kiobuf = bh->b_private;
2197     	unlock_buffer(bh);
2198     	end_kio_request(kiobuf, uptodate);
2199     }
2200     
2201     /*
2202      * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2203      * for them to complete.  Clean up the buffer_heads afterwards.  
2204      */
2205     
2206     static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2207     {
2208     	int iosize, err;
2209     	int i;
2210     	struct buffer_head *tmp;
2211     
2212     	iosize = 0;
2213     	err = 0;
2214     
2215     	for (i = nr; --i >= 0; ) {
2216     		iosize += size;
2217     		tmp = bh[i];
2218     		if (buffer_locked(tmp)) {
2219     			wait_on_buffer(tmp);
2220     		}
2221     		
2222     		if (!buffer_uptodate(tmp)) {
2223     			/* We are traversing bh'es in reverse order so
2224                                clearing iosize on error calculates the
2225                                amount of IO before the first error. */
2226     			iosize = 0;
2227     			err = -EIO;
2228     		}
2229     	}
2230     	
2231     	if (iosize)
2232     		return iosize;
2233     	return err;
2234     }
2235     
2236     /*
2237      * Start I/O on a physical range of kernel memory, defined by a vector
2238      * of kiobuf structs (much like a user-space iovec list).
2239      *
2240      * The kiobuf must already be locked for IO.  IO is submitted
2241      * asynchronously: you need to check page->locked, page->uptodate, and
2242      * maybe wait on page->wait.
2243      *
2244      * It is up to the caller to make sure that there are enough blocks
2245      * passed in to completely map the iobufs to disk.
2246      */
2247     
2248     int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
2249     	       kdev_t dev, unsigned long b[], int size)
2250     {
2251     	int		err;
2252     	int		length;
2253     	int		transferred;
2254     	int		i;
2255     	int		bufind;
2256     	int		pageind;
2257     	int		bhind;
2258     	int		offset;
2259     	unsigned long	blocknr;
2260     	struct kiobuf *	iobuf = NULL;
2261     	struct page *	map;
2262     	struct buffer_head *tmp, **bhs = NULL;
2263     
2264     	if (!nr)
2265     		return 0;
2266     	
2267     	/* 
2268     	 * First, do some alignment and validity checks 
2269     	 */
2270     	for (i = 0; i < nr; i++) {
2271     		iobuf = iovec[i];
2272     		if ((iobuf->offset & (size-1)) ||
2273     		    (iobuf->length & (size-1)))
2274     			return -EINVAL;
2275     		if (!iobuf->nr_pages)
2276     			panic("brw_kiovec: iobuf not initialised");
2277     	}
2278     
2279     	/* 
2280     	 * OK to walk down the iovec doing page IO on each page we find. 
2281     	 */
2282     	bufind = bhind = transferred = err = 0;
2283     	for (i = 0; i < nr; i++) {
2284     		iobuf = iovec[i];
2285     		offset = iobuf->offset;
2286     		length = iobuf->length;
2287     		iobuf->errno = 0;
2288     		if (!bhs)
2289     			bhs = iobuf->bh;
2290     		
2291     		for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2292     			map  = iobuf->maplist[pageind];
2293     			if (!map) {
2294     				err = -EFAULT;
2295     				goto finished;
2296     			}
2297     			
2298     			while (length > 0) {
2299     				blocknr = b[bufind++];
2300     				if (blocknr == -1UL) {
2301     					if (rw == READ) {
2302     						/* there was an hole in the filesystem */
2303     						memset(kmap(map) + offset, 0, size);
2304     						flush_dcache_page(map);
2305     						kunmap(map);
2306     
2307     						transferred += size;
2308     						goto skip_block;
2309     					} else
2310     						BUG();
2311     				}
2312     				tmp = bhs[bhind++];
2313     
2314     				tmp->b_dev = B_FREE;
2315     				tmp->b_size = size;
2316     				set_bh_page(tmp, map, offset);
2317     				tmp->b_this_page = tmp;
2318     
2319     				init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2320     				tmp->b_dev = dev;
2321     				tmp->b_blocknr = blocknr;
2322     				tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2323     
2324     				if (rw == WRITE) {
2325     					set_bit(BH_Uptodate, &tmp->b_state);
2326     					clear_bit(BH_Dirty, &tmp->b_state);
2327     				} else
2328     					set_bit(BH_Uptodate, &tmp->b_state);
2329     
2330     				atomic_inc(&iobuf->io_count);
2331     				submit_bh(rw, tmp);
2332     				/* 
2333     				 * Wait for IO if we have got too much 
2334     				 */
2335     				if (bhind >= KIO_MAX_SECTORS) {
2336     					kiobuf_wait_for_io(iobuf); /* wake-one */
2337     					err = wait_kio(rw, bhind, bhs, size);
2338     					if (err >= 0)
2339     						transferred += err;
2340     					else
2341     						goto finished;
2342     					bhind = 0;
2343     				}
2344     
2345     			skip_block:
2346     				length -= size;
2347     				offset += size;
2348     
2349     				if (offset >= PAGE_SIZE) {
2350     					offset = 0;
2351     					break;
2352     				}
2353     			} /* End of block loop */
2354     		} /* End of page loop */		
2355     	} /* End of iovec loop */
2356     
2357     	/* Is there any IO still left to submit? */
2358     	if (bhind) {
2359     		kiobuf_wait_for_io(iobuf); /* wake-one */
2360     		err = wait_kio(rw, bhind, bhs, size);
2361     		if (err >= 0)
2362     			transferred += err;
2363     		else
2364     			goto finished;
2365     	}
2366     
2367      finished:
2368     	if (transferred)
2369     		return transferred;
2370     	return err;
2371     }
2372     
2373     /*
2374      * Start I/O on a page.
2375      * This function expects the page to be locked and may return
2376      * before I/O is complete. You then have to check page->locked,
2377      * page->uptodate, and maybe wait on page->wait.
2378      *
2379      * brw_page() is SMP-safe, although it's being called with the
2380      * kernel lock held - but the code is ready.
2381      *
2382      * FIXME: we need a swapper_inode->get_block function to remove
2383      *        some of the bmap kludges and interface ugliness here.
2384      */
2385     int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2386     {
2387     	struct buffer_head *head, *bh;
2388     
2389     	if (!PageLocked(page))
2390     		panic("brw_page: page not locked for I/O");
2391     
2392     	if (!page->buffers)
2393     		create_empty_buffers(page, dev, size);
2394     	head = bh = page->buffers;
2395     
2396     	/* Stage 1: lock all the buffers */
2397     	do {
2398     		lock_buffer(bh);
2399     		bh->b_blocknr = *(b++);
2400     		set_bit(BH_Mapped, &bh->b_state);
2401     		set_buffer_async_io(bh);
2402     		bh = bh->b_this_page;
2403     	} while (bh != head);
2404     
2405     	/* Stage 2: start the IO */
2406     	do {
2407     		struct buffer_head *next = bh->b_this_page;
2408     		submit_bh(rw, bh);
2409     		bh = next;
2410     	} while (bh != head);
2411     	return 0;
2412     }
2413     
2414     int block_symlink(struct inode *inode, const char *symname, int len)
2415     {
2416     	struct address_space *mapping = inode->i_mapping;
2417     	struct page *page = grab_cache_page(mapping, 0);
2418     	int err = -ENOMEM;
2419     	char *kaddr;
2420     
2421     	if (!page)
2422     		goto fail;
2423     	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2424     	if (err)
2425     		goto fail_map;
2426     	kaddr = page_address(page);
2427     	memcpy(kaddr, symname, len-1);
2428     	mapping->a_ops->commit_write(NULL, page, 0, len-1);
2429     	/*
2430     	 * Notice that we are _not_ going to block here - end of page is
2431     	 * unmapped, so this will only try to map the rest of page, see
2432     	 * that it is unmapped (typically even will not look into inode -
2433     	 * ->i_size will be enough for everything) and zero it out.
2434     	 * OTOH it's obviously correct and should make the page up-to-date.
2435     	 */
2436     	err = mapping->a_ops->readpage(NULL, page);
2437     	wait_on_page(page);
2438     	page_cache_release(page);
2439     	if (err < 0)
2440     		goto fail;
2441     	mark_inode_dirty(inode);
2442     	return 0;
2443     fail_map:
2444     	UnlockPage(page);
2445     	page_cache_release(page);
2446     fail:
2447     	return err;
2448     }
2449     
2450     /*
2451      * Try to increase the number of buffers available: the size argument
2452      * is used to determine what kind of buffers we want.
2453      */
2454     static int grow_buffers(int size)
2455     {
2456     	struct page * page;
2457     	struct buffer_head *bh, *tmp;
2458     	struct buffer_head * insert_point;
2459     	int isize;
2460     
2461     	if ((size & 511) || (size > PAGE_SIZE)) {
2462     		printk(KERN_ERR "VFS: grow_buffers: size = %d\n",size);
2463     		return 0;
2464     	}
2465     
2466     	page = alloc_page(GFP_NOFS);
2467     	if (!page)
2468     		goto out;
2469     	LockPage(page);
2470     	bh = create_buffers(page, size, 0);
2471     	if (!bh)
2472     		goto no_buffer_head;
2473     
2474     	isize = BUFSIZE_INDEX(size);
2475     
2476     	spin_lock(&free_list[isize].lock);
2477     	insert_point = free_list[isize].list;
2478     	tmp = bh;
2479     	while (1) {
2480     		if (insert_point) {
2481     			tmp->b_next_free = insert_point->b_next_free;
2482     			tmp->b_prev_free = insert_point;
2483     			insert_point->b_next_free->b_prev_free = tmp;
2484     			insert_point->b_next_free = tmp;
2485     		} else {
2486     			tmp->b_prev_free = tmp;
2487     			tmp->b_next_free = tmp;
2488     		}
2489     		insert_point = tmp;
2490     		if (tmp->b_this_page)
2491     			tmp = tmp->b_this_page;
2492     		else
2493     			break;
2494     	}
2495     	tmp->b_this_page = bh;
2496     	free_list[isize].list = bh;
2497     	spin_unlock(&free_list[isize].lock);
2498     
2499     	page->buffers = bh;
2500     	page->flags &= ~(1 << PG_referenced);
2501     	lru_cache_add(page);
2502     	UnlockPage(page);
2503     	atomic_inc(&buffermem_pages);
2504     	return 1;
2505     
2506     no_buffer_head:
2507     	UnlockPage(page);
2508     	page_cache_release(page);
2509     out:
2510     	return 0;
2511     }
2512     
2513     static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
2514     {
2515     	struct buffer_head * p = bh;
2516     	int tryagain = 1;
2517     
2518     	do {
2519     		if (buffer_dirty(p) || buffer_locked(p)) {
2520     			if (test_and_set_bit(BH_Wait_IO, &p->b_state)) {
2521     				if (buffer_dirty(p)) {
2522     					ll_rw_block(WRITE, 1, &p);
2523     					tryagain = 0;
2524     				} else if (buffer_locked(p)) {
2525     					if (gfp_mask & __GFP_WAIT) {
2526     						wait_on_buffer(p);
2527     						tryagain = 1;
2528     					} else
2529     						tryagain = 0;
2530     				}
2531     			} else
2532     				tryagain = 0;
2533     		}
2534     		p = p->b_this_page;
2535     	} while (p != bh);
2536     
2537     	return tryagain;
2538     }
2539     
2540     /*
2541      * Can the buffer be thrown out?
2542      */
2543     #define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock))
2544     #define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2545     
2546     /*
2547      * try_to_free_buffers() checks if all the buffers on this particular page
2548      * are unused, and free's the page if so.
2549      *
2550      * Wake up bdflush() if this fails - if we're running low on memory due
2551      * to dirty buffers, we need to flush them out as quickly as possible.
2552      *
2553      * NOTE: There are quite a number of ways that threads of control can
2554      *       obtain a reference to a buffer head within a page.  So we must
2555      *	 lock out all of these paths to cleanly toss the page.
2556      */
2557     int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2558     {
2559     	struct buffer_head * tmp, * bh = page->buffers;
2560     	int index = BUFSIZE_INDEX(bh->b_size);
2561     
2562     cleaned_buffers_try_again:
2563     	spin_lock(&lru_list_lock);
2564     	write_lock(&hash_table_lock);
2565     	spin_lock(&free_list[index].lock);
2566     	tmp = bh;
2567     	do {
2568     		if (buffer_busy(tmp))
2569     			goto busy_buffer_page;
2570     		tmp = tmp->b_this_page;
2571     	} while (tmp != bh);
2572     
2573     	spin_lock(&unused_list_lock);
2574     	tmp = bh;
2575     	do {
2576     		struct buffer_head * p = tmp;
2577     		tmp = tmp->b_this_page;
2578     
2579     		/* The buffer can be either on the regular
2580     		 * queues or on the free list..
2581     		 */
2582     		if (p->b_dev != B_FREE) {
2583     			remove_inode_queue(p);
2584     			__remove_from_queues(p);
2585     		} else
2586     			__remove_from_free_list(p, index);
2587     		__put_unused_buffer_head(p);
2588     	} while (tmp != bh);
2589     	spin_unlock(&unused_list_lock);
2590     
2591     	/* Wake up anyone waiting for buffer heads */
2592     	wake_up(&buffer_wait);
2593     
2594     	/* And free the page */
2595     	page->buffers = NULL;
2596     	page_cache_release(page);
2597     	spin_unlock(&free_list[index].lock);
2598     	write_unlock(&hash_table_lock);
2599     	spin_unlock(&lru_list_lock);
2600     	return 1;
2601     
2602     busy_buffer_page:
2603     	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
2604     	spin_unlock(&free_list[index].lock);
2605     	write_unlock(&hash_table_lock);
2606     	spin_unlock(&lru_list_lock);
2607     	if (gfp_mask & __GFP_IO) {
2608     		if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2609     			if (sync_page_buffers(bh, gfp_mask)) {
2610     				/* no IO or waiting next time */
2611     				gfp_mask = 0;
2612     				goto cleaned_buffers_try_again;
2613     			}
2614     		}
2615     	}
2616     	if (balance_dirty_state() >= 0)
2617     		wakeup_bdflush();
2618     	return 0;
2619     }
2620     
2621     /* ================== Debugging =================== */
2622     
2623     void show_buffers(void)
2624     {
2625     #ifdef CONFIG_SMP
2626     	struct buffer_head * bh;
2627     	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2628     	int nlist;
2629     	static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2630     #endif
2631     
2632     	printk("Buffer memory:   %6dkB\n",
2633     			atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2634     
2635     #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2636     	if (!spin_trylock(&lru_list_lock))
2637     		return;
2638     	for(nlist = 0; nlist < NR_LIST; nlist++) {
2639     		found = locked = dirty = used = lastused = 0;
2640     		bh = lru_list[nlist];
2641     		if(!bh) continue;
2642     
2643     		do {
2644     			found++;
2645     			if (buffer_locked(bh))
2646     				locked++;
2647     			if (buffer_dirty(bh))
2648     				dirty++;
2649     			if (atomic_read(&bh->b_count))
2650     				used++, lastused = found;
2651     			bh = bh->b_next_free;
2652     		} while (bh != lru_list[nlist]);
2653     		{
2654     			int tmp = nr_buffers_type[nlist];
2655     			if (found != tmp)
2656     				printk("%9s: BUG -> found %d, reported %d\n",
2657     				       buf_types[nlist], found, tmp);
2658     		}
2659     		printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2660     		       "%d locked, %d dirty\n",
2661     		       buf_types[nlist], found, size_buffers_type[nlist]>>10,
2662     		       used, lastused, locked, dirty);
2663     	}
2664     	spin_unlock(&lru_list_lock);
2665     #endif
2666     }
2667     
2668     /* ===================== Init ======================= */
2669     
2670     /*
2671      * allocate the hash table and init the free list
2672      * Use gfp() for the hash table to decrease TLB misses, use
2673      * SLAB cache for buffer heads.
2674      */
2675     void __init buffer_init(unsigned long mempages)
2676     {
2677     	int order, i;
2678     	unsigned int nr_hash;
2679     
2680     	/* The buffer cache hash table is less important these days,
2681     	 * trim it a bit.
2682     	 */
2683     	mempages >>= 14;
2684     
2685     	mempages *= sizeof(struct buffer_head *);
2686     
2687     	for (order = 0; (1 << order) < mempages; order++)
2688     		;
2689     
2690     	/* try to allocate something until we get it or we're asking
2691     	   for something that is really too small */
2692     
2693     	do {
2694     		unsigned long tmp;
2695     
2696     		nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2697     		bh_hash_mask = (nr_hash - 1);
2698     
2699     		tmp = nr_hash;
2700     		bh_hash_shift = 0;
2701     		while((tmp >>= 1UL) != 0UL)
2702     			bh_hash_shift++;
2703     
2704     		hash_table = (struct buffer_head **)
2705     		    __get_free_pages(GFP_ATOMIC, order);
2706     	} while (hash_table == NULL && --order > 0);
2707     	printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2708     	       nr_hash, order, (PAGE_SIZE << order));
2709     
2710     	if (!hash_table)
2711     		panic("Failed to allocate buffer hash table\n");
2712     
2713     	/* Setup hash chains. */
2714     	for(i = 0; i < nr_hash; i++)
2715     		hash_table[i] = NULL;
2716     
2717     	/* Setup free lists. */
2718     	for(i = 0; i < NR_SIZES; i++) {
2719     		free_list[i].list = NULL;
2720     		free_list[i].lock = SPIN_LOCK_UNLOCKED;
2721     	}
2722     
2723     	/* Setup lru lists. */
2724     	for(i = 0; i < NR_LIST; i++)
2725     		lru_list[i] = NULL;
2726     
2727     }
2728     
2729     
2730     /* ====================== bdflush support =================== */
2731     
2732     /* This is a simple kernel daemon, whose job it is to provide a dynamic
2733      * response to dirty buffers.  Once this process is activated, we write back
2734      * a limited number of buffers to the disks and then go back to sleep again.
2735      */
2736     
2737     DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2738     
2739     void wakeup_bdflush(void)
2740     {
2741     	wake_up_interruptible(&bdflush_wait);
2742     }
2743     
2744     /* 
2745      * Here we attempt to write back old buffers.  We also try to flush inodes 
2746      * and supers as well, since this function is essentially "update", and 
2747      * otherwise there would be no way of ensuring that these quantities ever 
2748      * get written back.  Ideally, we would have a timestamp on the inodes
2749      * and superblocks so that we could write back only the old ones as well
2750      */
2751     
2752     static int sync_old_buffers(void)
2753     {
2754     	lock_kernel();
2755     	sync_unlocked_inodes();
2756     	sync_supers(0);
2757     	unlock_kernel();
2758     
2759     	for (;;) {
2760     		struct buffer_head *bh;
2761     
2762     		spin_lock(&lru_list_lock);
2763     		bh = lru_list[BUF_DIRTY];
2764     		if (!bh || time_before(jiffies, bh->b_flushtime))
2765     			break;
2766     		if (write_some_buffers(NODEV))
2767     			continue;
2768     		return 0;
2769     	}
2770     	spin_unlock(&lru_list_lock);
2771     	return 0;
2772     }
2773     
2774     int block_sync_page(struct page *page)
2775     {
2776     	run_task_queue(&tq_disk);
2777     	return 0;
2778     }
2779     
2780     /* This is the interface to bdflush.  As we get more sophisticated, we can
2781      * pass tuning parameters to this "process", to adjust how it behaves. 
2782      * We would want to verify each parameter, however, to make sure that it 
2783      * is reasonable. */
2784     
2785     asmlinkage long sys_bdflush(int func, long data)
2786     {
2787     	if (!capable(CAP_SYS_ADMIN))
2788     		return -EPERM;
2789     
2790     	if (func == 1) {
2791     		/* do_exit directly and let kupdate to do its work alone. */
2792     		do_exit(0);
2793     #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2794     	 a syscall that doesn't care about the current mm context. */
2795     		int error;
2796     		struct mm_struct *user_mm;
2797     
2798     		/*
2799     		 * bdflush will spend all of it's time in kernel-space,
2800     		 * without touching user-space, so we can switch it into
2801     		 * 'lazy TLB mode' to reduce the cost of context-switches
2802     		 * to and from bdflush.
2803     		 */
2804     		user_mm = start_lazy_tlb();
2805     		error = sync_old_buffers();
2806     		end_lazy_tlb(user_mm);
2807     		return error;
2808     #endif
2809     	}
2810     
2811     	/* Basically func 1 means read param 1, 2 means write param 1, etc */
2812     	if (func >= 2) {
2813     		int i = (func-2) >> 1;
2814     		if (i >= 0 && i < N_PARAM) {
2815     			if ((func & 1) == 0)
2816     				return put_user(bdf_prm.data[i], (int*)data);
2817     
2818     			if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2819     				bdf_prm.data[i] = data;
2820     				return 0;
2821     			}
2822     		}
2823     		return -EINVAL;
2824     	}
2825     
2826     	/* Having func 0 used to launch the actual bdflush and then never
2827     	 * return (unless explicitly killed). We return zero here to 
2828     	 * remain semi-compatible with present update(8) programs.
2829     	 */
2830     	return 0;
2831     }
2832     
2833     /*
2834      * This is the actual bdflush daemon itself. It used to be started from
2835      * the syscall above, but now we launch it ourselves internally with
2836      * kernel_thread(...)  directly after the first thread in init/main.c
2837      */
2838     int bdflush(void *startup)
2839     {
2840     	struct task_struct *tsk = current;
2841     
2842     	/*
2843     	 *	We have a bare-bones task_struct, and really should fill
2844     	 *	in a few more things so "top" and /proc/2/{exe,root,cwd}
2845     	 *	display semi-sane things. Not real crucial though...  
2846     	 */
2847     
2848     	tsk->session = 1;
2849     	tsk->pgrp = 1;
2850     	strcpy(tsk->comm, "bdflush");
2851     
2852     	/* avoid getting signals */
2853     	spin_lock_irq(&tsk->sigmask_lock);
2854     	flush_signals(tsk);
2855     	sigfillset(&tsk->blocked);
2856     	recalc_sigpending(tsk);
2857     	spin_unlock_irq(&tsk->sigmask_lock);
2858     
2859     	complete((struct completion *)startup);
2860     
2861     	for (;;) {
2862     		CHECK_EMERGENCY_SYNC
2863     
2864     		spin_lock(&lru_list_lock);
2865     		if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
2866     			wait_for_some_buffers(NODEV);
2867     			interruptible_sleep_on(&bdflush_wait);
2868     		}
2869     	}
2870     }
2871     
/*
 * This is the kernel update daemon. It used to live in userspace,
 * but since it needs to run reliably we want it to be unkillable by mistake.
 * You don't need to change your userspace configuration, since
 * the userspace `update` will do_exit(0) at the first sys_bdflush().
 */
2878     int kupdate(void *startup)
2879     {
2880     	struct task_struct * tsk = current;
2881     	int interval;
2882     
2883     	tsk->session = 1;
2884     	tsk->pgrp = 1;
2885     	strcpy(tsk->comm, "kupdated");
2886     
2887     	/* sigstop and sigcont will stop and wakeup kupdate */
2888     	spin_lock_irq(&tsk->sigmask_lock);
2889     	sigfillset(&tsk->blocked);
2890     	siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2891     	recalc_sigpending(tsk);
2892     	spin_unlock_irq(&tsk->sigmask_lock);
2893     
2894     	complete((struct completion *)startup);
2895     
2896     	for (;;) {
2897     		wait_for_some_buffers(NODEV);
2898     
2899     		/* update interval */
2900     		interval = bdf_prm.b_un.interval;
2901     		if (interval) {
2902     			tsk->state = TASK_INTERRUPTIBLE;
2903     			schedule_timeout(interval);
2904     		} else {
2905     		stop_kupdate:
2906     			tsk->state = TASK_STOPPED;
2907     			schedule(); /* wait for SIGCONT */
2908     		}
2909     		/* check for sigstop */
2910     		if (signal_pending(tsk)) {
2911     			int stopped = 0;
2912     			spin_lock_irq(&tsk->sigmask_lock);
2913     			if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2914     				sigdelset(&tsk->pending.signal, SIGSTOP);
2915     				stopped = 1;
2916     			}
2917     			recalc_sigpending(tsk);
2918     			spin_unlock_irq(&tsk->sigmask_lock);
2919     			if (stopped)
2920     				goto stop_kupdate;
2921     		}
2922     #ifdef DEBUG
2923     		printk(KERN_DEBUG "kupdate() activated...\n");
2924     #endif
2925     		sync_old_buffers();
2926     	}
2927     }
2928     
2929     static int __init bdflush_init(void)
2930     {
2931     	static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
2932     
2933     	kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2934     	wait_for_completion(&startup);
2935     	kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2936     	wait_for_completion(&startup);
2937     	return 0;
2938     }
2939     
2940     module_init(bdflush_init)
2941     
2942