File: /usr/src/linux/fs/buffer.c
1 /*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
11 */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17 */
18
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
21 */
22
23 /* Added 32k buffer block sizes - these are required by older ARM systems.
24 * - RMK
25 */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/slab.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48 #include <linux/completion.h>
49
50 #include <asm/uaccess.h>
51 #include <asm/io.h>
52 #include <asm/bitops.h>
53 #include <asm/mmu_context.h>
54
#define NR_SIZES 7

/*
 * Maps (buffer size >> 9) to a slot in free_list[].  Only the power-of-two
 * sizes from 512 to 32768 bytes have a slot (0..6); every other entry is -1.
 *
 * The element type is explicitly signed: plain 'char' is unsigned on some
 * ABIs, where the -1 sentinels would otherwise read back as 255.
 */
static signed char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

/* Free-list slot for a buffer of X bytes (X must not exceed 32768). */
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
/* Number of 512-byte buffers that fit in one page-cache page. */
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
/*
 * Don't ever have more than this number of unused buffer heads.
 * Parenthesised so the macro stays one operand wherever it is expanded.
 */
#define MAX_UNUSED_BUFFERS (NR_RESERVED+20)
68
69 /* Anti-deadlock ordering:
70 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
71 */
72
/* Convert a b_inode_buffers list node back into its enclosing buffer_head. */
#define BH_ENTRY(node) \
	list_entry((node), struct buffer_head, b_inode_buffers)
74
75 /*
76 * Hash table gook..
77 */
78 static unsigned int bh_hash_mask;
79 static unsigned int bh_hash_shift;
80 static struct buffer_head **hash_table;
81 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
82
83 static struct buffer_head *lru_list[NR_LIST];
84 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
85 static int nr_buffers_type[NR_LIST];
86 static unsigned long size_buffers_type[NR_LIST];
87
88 static struct buffer_head * unused_list;
89 static int nr_unused_buffer_heads;
90 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
91 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
92
93 struct bh_free_head {
94 struct buffer_head *list;
95 spinlock_t lock;
96 };
97 static struct bh_free_head free_list[NR_SIZES];
98
99 static int grow_buffers(int size);
100 static void __refile_buffer(struct buffer_head *);
101
102 /* This is used by some architectures to estimate available memory. */
103 atomic_t buffermem_pages = ATOMIC_INIT(0);
104
105 /* Here is the parameter block for the bdflush process. If you add or
106 * remove any of the parameters, make sure to update kernel/sysctl.c
107 * and the documentation at linux/Documentation/sysctl/vm.txt.
108 */
109
#define N_PARAM 9

/*
 * bdflush tunables, reachable either by name (b_un) or by index (data[]).
 * The dummy slots are kept only so that old programs poking at the /proc
 * entries still find every parameter at its historical position.
 */
union bdflush_param {
	struct {
		int nfract;	 /* [0] % of buffer cache dirty to activate bdflush */
		int dummy1;	 /* [1] old "ndirty" */
		int dummy2;	 /* [2] old "nrefill" */
		int dummy3;	 /* [3] unused */
		int interval;	 /* [4] jiffies delay between kupdate flushes */
		int age_buffer;	 /* [5] time for a normal buffer to age before we flush it */
		int nfract_sync; /* [6] % of buffer cache dirty to activate bdflush synchronously */
		int dummy4;	 /* [7] unused */
		int dummy5;	 /* [8] unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}};

/*
 * Smallest and largest value accepted for each parameter, in the same
 * index order as the union above.
 */
int bdflush_min[N_PARAM] = {
	0, 10, 5, 25, 0, 1*HZ, 0, 0, 0
};
int bdflush_max[N_PARAM] = {
	100, 50000, 20000, 20000, 10000*HZ, 6000*HZ, 100, 0, 0
};
135
136 inline void unlock_buffer(struct buffer_head *bh)
137 {
138 clear_bit(BH_Wait_IO, &bh->b_state);
139 clear_bit(BH_Lock, &bh->b_state);
140 smp_mb__after_clear_bit();
141 if (waitqueue_active(&bh->b_wait))
142 wake_up(&bh->b_wait);
143 }
144
145 /*
146 * Rewrote the wait-routines to use the "new" wait-queue functionality,
147 * and getting rid of the cli-sti pairs. The wait-queue routines still
148 * need cli-sti, but now it's just a couple of 386 instructions or so.
149 *
150 * Note that the real wait_on_buffer() is an inline function that checks
151 * if 'b_wait' is set before calling this, so that the queues aren't set
152 * up unnecessarily.
153 */
154 void __wait_on_buffer(struct buffer_head * bh)
155 {
156 struct task_struct *tsk = current;
157 DECLARE_WAITQUEUE(wait, tsk);
158
159 get_bh(bh);
160 add_wait_queue(&bh->b_wait, &wait);
161 do {
162 run_task_queue(&tq_disk);
163 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
164 if (!buffer_locked(bh))
165 break;
166 schedule();
167 } while (buffer_locked(bh));
168 tsk->state = TASK_RUNNING;
169 remove_wait_queue(&bh->b_wait, &wait);
170 put_bh(bh);
171 }
172
173 /*
174 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
175 * unlock the buffer. This is what ll_rw_block uses too.
176 */
/*
 * Completion handler for synchronous buffer I/O.
 *
 * @bh:       buffer whose I/O has finished
 * @uptodate: passed straight to mark_buffer_uptodate()
 *
 * Unlocks the buffer and drops one reference on it.
 */
void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);

	unlock_buffer(bh);

	put_bh(bh);
}
183
184 /*
185 * The buffers have been marked clean and locked. Just submit the dang
186 * things..
187 */
188 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
189 {
190 do {
191 struct buffer_head * bh = *array++;
192 bh->b_end_io = end_buffer_io_sync;
193 submit_bh(WRITE, bh);
194 } while (--count);
195 }
196
197 /*
198 * Write some buffers from the head of the dirty queue.
199 *
200 * This must be called with the LRU lock held, and will
201 * return without it!
202 */
203 #define NRSYNC (32)
204 static int write_some_buffers(kdev_t dev)
205 {
206 struct buffer_head *next;
207 struct buffer_head *array[NRSYNC];
208 unsigned int count;
209 int nr;
210
211 next = lru_list[BUF_DIRTY];
212 nr = nr_buffers_type[BUF_DIRTY];
213 count = 0;
214 while (next && --nr >= 0) {
215 struct buffer_head * bh = next;
216 next = bh->b_next_free;
217
218 if (dev && bh->b_dev != dev)
219 continue;
220 if (test_and_set_bit(BH_Lock, &bh->b_state))
221 continue;
222 if (atomic_set_buffer_clean(bh)) {
223 __refile_buffer(bh);
224 get_bh(bh);
225 array[count++] = bh;
226 if (count < NRSYNC)
227 continue;
228
229 spin_unlock(&lru_list_lock);
230 write_locked_buffers(array, count);
231 return -EAGAIN;
232 }
233 unlock_buffer(bh);
234 __refile_buffer(bh);
235 }
236 spin_unlock(&lru_list_lock);
237
238 if (count)
239 write_locked_buffers(array, count);
240 return 0;
241 }
242
243 /*
244 * Write out all buffers on the dirty list.
245 */
246 static void write_unlocked_buffers(kdev_t dev)
247 {
248 do {
249 spin_lock(&lru_list_lock);
250 } while (write_some_buffers(dev));
251 run_task_queue(&tq_disk);
252 }
253
254 /*
255 * Wait for a buffer on the proper list.
256 *
257 * This must be called with the LRU lock held, and
258 * will return with it released.
259 */
260 static int wait_for_buffers(kdev_t dev, int index, int refile)
261 {
262 struct buffer_head * next;
263 int nr;
264
265 next = lru_list[index];
266 nr = nr_buffers_type[index];
267 while (next && --nr >= 0) {
268 struct buffer_head *bh = next;
269 next = bh->b_next_free;
270
271 if (!buffer_locked(bh)) {
272 if (refile)
273 __refile_buffer(bh);
274 continue;
275 }
276 if (dev && bh->b_dev != dev)
277 continue;
278
279 get_bh(bh);
280 spin_unlock(&lru_list_lock);
281 wait_on_buffer (bh);
282 put_bh(bh);
283 return -EAGAIN;
284 }
285 spin_unlock(&lru_list_lock);
286 return 0;
287 }
288
289 static inline void wait_for_some_buffers(kdev_t dev)
290 {
291 spin_lock(&lru_list_lock);
292 wait_for_buffers(dev, BUF_LOCKED, 1);
293 }
294
295 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
296 {
297 do {
298 spin_lock(&lru_list_lock);
299 } while (wait_for_buffers(dev, index, refile));
300 return 0;
301 }
302
303 /* Call sync_buffers with wait!=0 to ensure that the call does not
304 * return until all buffer writes have completed. Sync() may return
305 * before the writes have finished; fsync() may not.
306 */
307
308 /* Godamity-damn. Some buffers (bitmaps for filesystems)
309 * spontaneously dirty themselves without ever brelse being called.
310 * We will ultimately want to put these in a separate list, but for
311 * now we search all of the lists for dirty buffers.
312 */
313 int sync_buffers(kdev_t dev, int wait)
314 {
315 int err = 0;
316
317 /* One pass for no-wait, three for wait:
318 * 0) write out all dirty, unlocked buffers;
319 * 1) wait for all dirty locked buffers;
320 * 2) write out all dirty, unlocked buffers;
321 * 2) wait for completion by waiting for all buffers to unlock.
322 */
323 write_unlocked_buffers(dev);
324 if (wait) {
325 err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
326 write_unlocked_buffers(dev);
327 err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
328 }
329 return err;
330 }
331
332 int fsync_super(struct super_block *sb)
333 {
334 kdev_t dev = sb->s_dev;
335 sync_buffers(dev, 0);
336
337 lock_kernel();
338 sync_inodes_sb(sb);
339 DQUOT_SYNC(dev);
340 lock_super(sb);
341 if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
342 sb->s_op->write_super(sb);
343 unlock_super(sb);
344 unlock_kernel();
345
346 return sync_buffers(dev, 1);
347 }
348
349 int fsync_no_super(kdev_t dev)
350 {
351 sync_buffers(dev, 0);
352 return sync_buffers(dev, 1);
353 }
354
355 int fsync_dev(kdev_t dev)
356 {
357 sync_buffers(dev, 0);
358
359 lock_kernel();
360 sync_inodes(dev);
361 DQUOT_SYNC(dev);
362 sync_supers(dev);
363 unlock_kernel();
364
365 return sync_buffers(dev, 1);
366 }
367
368 /*
369 * There's no real reason to pretend we should
370 * ever do anything differently
371 */
372 void sync_dev(kdev_t dev)
373 {
374 fsync_dev(dev);
375 }
376
377 asmlinkage long sys_sync(void)
378 {
379 fsync_dev(0);
380 return 0;
381 }
382
383 /*
384 * filp may be NULL if called via the msync of a vma.
385 */
386
387 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
388 {
389 struct inode * inode = dentry->d_inode;
390 struct super_block * sb;
391 kdev_t dev;
392 int ret;
393
394 lock_kernel();
395 /* sync the inode to buffers */
396 write_inode_now(inode, 0);
397
398 /* sync the superblock to buffers */
399 sb = inode->i_sb;
400 lock_super(sb);
401 if (sb->s_op && sb->s_op->write_super)
402 sb->s_op->write_super(sb);
403 unlock_super(sb);
404
405 /* .. finally sync the buffers to disk */
406 dev = inode->i_dev;
407 ret = sync_buffers(dev, 1);
408 unlock_kernel();
409 return ret;
410 }
411
412 asmlinkage long sys_fsync(unsigned int fd)
413 {
414 struct file * file;
415 struct dentry * dentry;
416 struct inode * inode;
417 int err;
418
419 err = -EBADF;
420 file = fget(fd);
421 if (!file)
422 goto out;
423
424 dentry = file->f_dentry;
425 inode = dentry->d_inode;
426
427 err = -EINVAL;
428 if (!file->f_op || !file->f_op->fsync)
429 goto out_putf;
430
431 /* We need to protect against concurrent writers.. */
432 down(&inode->i_sem);
433 filemap_fdatasync(inode->i_mapping);
434 err = file->f_op->fsync(file, dentry, 0);
435 filemap_fdatawait(inode->i_mapping);
436 up(&inode->i_sem);
437
438 out_putf:
439 fput(file);
440 out:
441 return err;
442 }
443
444 asmlinkage long sys_fdatasync(unsigned int fd)
445 {
446 struct file * file;
447 struct dentry * dentry;
448 struct inode * inode;
449 int err;
450
451 err = -EBADF;
452 file = fget(fd);
453 if (!file)
454 goto out;
455
456 dentry = file->f_dentry;
457 inode = dentry->d_inode;
458
459 err = -EINVAL;
460 if (!file->f_op || !file->f_op->fsync)
461 goto out_putf;
462
463 down(&inode->i_sem);
464 filemap_fdatasync(inode->i_mapping);
465 err = file->f_op->fsync(file, dentry, 1);
466 filemap_fdatawait(inode->i_mapping);
467 up(&inode->i_sem);
468
469 out_putf:
470 fput(file);
471 out:
472 return err;
473 }
474
475 /* After several hours of tedious analysis, the following hash
476 * function won. Do not mess with it... -DaveM
477 */
/*
 * Mix a device and a block number: XOR of several shifted copies of each,
 * with shift amounts taken relative to bh_hash_shift.  The expression is
 * deliberately kept exactly as tuned.
 */
#define _hashfn(dev,block)	\
	((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
	 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
	  ((block) << (bh_hash_shift - 12))))

/* The hash chain head (an lvalue) for a given device and block. */
#define hash(dev,block) \
	hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
483
484 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
485 {
486 if ((bh->b_next = *head) != NULL)
487 bh->b_next->b_pprev = &bh->b_next;
488 *head = bh;
489 bh->b_pprev = head;
490 }
491
492 static __inline__ void __hash_unlink(struct buffer_head *bh)
493 {
494 if (bh->b_pprev) {
495 if (bh->b_next)
496 bh->b_next->b_pprev = bh->b_pprev;
497 *(bh->b_pprev) = bh->b_next;
498 bh->b_pprev = NULL;
499 }
500 }
501
502 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
503 {
504 struct buffer_head **bhp = &lru_list[blist];
505
506 if(!*bhp) {
507 *bhp = bh;
508 bh->b_prev_free = bh;
509 }
510 bh->b_next_free = *bhp;
511 bh->b_prev_free = (*bhp)->b_prev_free;
512 (*bhp)->b_prev_free->b_next_free = bh;
513 (*bhp)->b_prev_free = bh;
514 nr_buffers_type[blist]++;
515 size_buffers_type[blist] += bh->b_size;
516 }
517
518 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
519 {
520 if (bh->b_prev_free || bh->b_next_free) {
521 bh->b_prev_free->b_next_free = bh->b_next_free;
522 bh->b_next_free->b_prev_free = bh->b_prev_free;
523 if (lru_list[blist] == bh)
524 lru_list[blist] = bh->b_next_free;
525 if (lru_list[blist] == bh)
526 lru_list[blist] = NULL;
527 bh->b_next_free = bh->b_prev_free = NULL;
528 nr_buffers_type[blist]--;
529 size_buffers_type[blist] -= bh->b_size;
530 }
531 }
532
533 static void __remove_from_free_list(struct buffer_head * bh, int index)
534 {
535 if(bh->b_next_free == bh)
536 free_list[index].list = NULL;
537 else {
538 bh->b_prev_free->b_next_free = bh->b_next_free;
539 bh->b_next_free->b_prev_free = bh->b_prev_free;
540 if (free_list[index].list == bh)
541 free_list[index].list = bh->b_next_free;
542 }
543 bh->b_next_free = bh->b_prev_free = NULL;
544 }
545
546 /* must be called with both the hash_table_lock and the lru_list_lock
547 held */
548 static void __remove_from_queues(struct buffer_head *bh)
549 {
550 __hash_unlink(bh);
551 __remove_from_lru_list(bh, bh->b_list);
552 }
553
554 static void __insert_into_queues(struct buffer_head *bh)
555 {
556 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
557
558 __hash_link(bh, head);
559 __insert_into_lru_list(bh, bh->b_list);
560 }
561
562 /* This function must only run if there are no other
563 * references _anywhere_ to this buffer head.
564 */
565 static void put_last_free(struct buffer_head * bh)
566 {
567 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
568 struct buffer_head **bhp = &head->list;
569
570 bh->b_state = 0;
571
572 spin_lock(&head->lock);
573 bh->b_dev = B_FREE;
574 if(!*bhp) {
575 *bhp = bh;
576 bh->b_prev_free = bh;
577 }
578 bh->b_next_free = *bhp;
579 bh->b_prev_free = (*bhp)->b_prev_free;
580 (*bhp)->b_prev_free->b_next_free = bh;
581 (*bhp)->b_prev_free = bh;
582 spin_unlock(&head->lock);
583 }
584
585 /*
586 * Why like this, I hear you say... The reason is race-conditions.
587 * As we don't lock buffers (unless we are reading them, that is),
588 * something might happen to it while we sleep (ie a read-error
589 * will force it bad). This shouldn't really happen currently, but
590 * the code is ready.
591 */
592 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
593 {
594 struct buffer_head *bh = hash(dev, block);
595
596 for (; bh; bh = bh->b_next)
597 if (bh->b_blocknr == block &&
598 bh->b_size == size &&
599 bh->b_dev == dev)
600 break;
601 if (bh)
602 get_bh(bh);
603
604 return bh;
605 }
606
607 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
608 {
609 struct buffer_head *bh;
610
611 read_lock(&hash_table_lock);
612 bh = __get_hash_table(dev, block, size);
613 read_unlock(&hash_table_lock);
614
615 return bh;
616 }
617
618 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
619 {
620 spin_lock(&lru_list_lock);
621 if (bh->b_inode)
622 list_del(&bh->b_inode_buffers);
623 bh->b_inode = inode;
624 list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
625 spin_unlock(&lru_list_lock);
626 }
627
628 void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
629 {
630 spin_lock(&lru_list_lock);
631 if (bh->b_inode)
632 list_del(&bh->b_inode_buffers);
633 bh->b_inode = inode;
634 list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
635 spin_unlock(&lru_list_lock);
636 }
637
638 /* The caller must have the lru_list lock before calling the
639 remove_inode_queue functions. */
640 static void __remove_inode_queue(struct buffer_head *bh)
641 {
642 bh->b_inode = NULL;
643 list_del(&bh->b_inode_buffers);
644 }
645
646 static inline void remove_inode_queue(struct buffer_head *bh)
647 {
648 if (bh->b_inode)
649 __remove_inode_queue(bh);
650 }
651
652 int inode_has_buffers(struct inode *inode)
653 {
654 int ret;
655
656 spin_lock(&lru_list_lock);
657 ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
658 spin_unlock(&lru_list_lock);
659
660 return ret;
661 }
662
663 /* If invalidate_buffers() will trash dirty buffers, it means some kind
664 of fs corruption is going on. Trashing dirty data always implies losing
665 information that was supposed to be just stored on the physical layer
666 by the user.
667
668 Thus invalidate_buffers in general usage is not allowed to trash dirty
669 buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
670
671 NOTE: In the case where the user removed a removable-media-disk even if
672 there's still dirty data not synced on disk (due a bug in the device driver
673 or due an error of the user), by not destroying the dirty buffers we could
674 generate corruption also on the next media inserted, thus a parameter is
675 necessary to handle this case in the most safe way possible (trying
676 to not corrupt also the new disk inserted with the data belonging to
677 the old now corrupted disk). Also for the ramdisk the natural thing
678 to do in order to release the ramdisk memory is to destroy dirty buffers.
679
680 These are two special cases. Normal usage imply the device driver
681 to issue a sync on the device (without waiting I/O completion) and
682 then an invalidate_buffers call that doesn't trash dirty buffers.
683
684 For handling cache coherency with the blkdev pagecache the 'update' case
685 has been introduced. It is needed to re-read from disk any pinned
686 buffer. NOTE: re-reading from disk is destructive so we can do it only
687 when we assume nobody is changing the buffercache under our I/O and when
688 we think the disk contains more recent information than the buffercache.
689 The update == 1 pass marks the buffers we need to update, the update == 2
690 pass does the actual I/O. */
/*
 * Walk every LRU list and deal with each hashed buffer that belongs to
 * 'dev'.
 *
 *   dev                   - device whose buffers are processed
 *   destroy_dirty_buffers - when non-zero, unreferenced dirty buffers are
 *                           moved to the free list as well
 *   update                - 0, 1 or 2; only looked at for buffers that are
 *                           still referenced (b_count != 0)
 *
 * The function sleeps on locked buffers.  Whenever it has slept, or has
 * dropped lru_list_lock for an 'update' action, the whole scan is started
 * again from 'retry'.
 */
691 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
692 {
693 int i, nlist, slept;
694 struct buffer_head * bh, * bh_next;
695
696 retry:
697 slept = 0;
698 spin_lock(&lru_list_lock);
699 for(nlist = 0; nlist < NR_LIST; nlist++) {
700 bh = lru_list[nlist];
701 if (!bh)
702 continue;
/* Visit at most as many buffers as the list held when we started on it. */
703 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
/* Fetch the successor first: bh may be taken off this list below. */
704 bh_next = bh->b_next_free;
705
706 /* Another device? */
707 if (bh->b_dev != dev)
708 continue;
709 /* Not hashed? */
710 if (!bh->b_pprev)
711 continue;
712 if (buffer_locked(bh)) {
/* Hold a reference while lru_list_lock is dropped for the wait. */
713 get_bh(bh);
714 spin_unlock(&lru_list_lock);
715 wait_on_buffer(bh);
716 slept = 1;
717 spin_lock(&lru_list_lock);
718 put_bh(bh);
719 }
720
721 write_lock(&hash_table_lock);
722 /* All buffers in the lru lists are mapped */
723 if (!buffer_mapped(bh))
724 BUG();
725 if (!atomic_read(&bh->b_count)) {
/* Unreferenced: free it unless it is dirty and must be preserved. */
726 if (destroy_dirty_buffers || !buffer_dirty(bh)) {
727 remove_inode_queue(bh);
728 __remove_from_queues(bh);
729 put_last_free(bh);
730 }
731 } else if (update) {
/*
 * '^' binds tighter than '&&'.  Assuming the buffer_*() tests yield 0 or 1
 * (TODO confirm): update == 1 selects buffers that are both uptodate and
 * req, update == 2 selects buffers that are neither.
 */
732 if ((update == 2) ^ buffer_uptodate(bh) &&
733 (update == 2) ^ buffer_req(bh)) {
734 write_unlock(&hash_table_lock);
/* Extra reference for the time both locks are released. */
735 atomic_inc(&bh->b_count);
736 spin_unlock(&lru_list_lock);
737
738 if (update == 2) {
/* update == 2: re-read the buffer from disk and wait for the read. */
739 ll_rw_block(READ, 1, &bh);
740 wait_on_buffer(bh);
741 } else {
/* update == 1: clear Uptodate and Req under the buffer lock. */
742 lock_buffer(bh);
743 clear_bit(BH_Uptodate, &bh->b_state);
744 clear_bit(BH_Req, &bh->b_state);
745 unlock_buffer(bh);
746 }
747
748 atomic_dec(&bh->b_count);
/* Both locks were dropped, so start the scan again. */
749 goto retry;
750 }
751 }
752
753 write_unlock(&hash_table_lock);
/* We slept with lru_list_lock released: abandon this pass and rescan. */
754 if (slept)
755 goto out;
756 }
757 }
758 out:
759 spin_unlock(&lru_list_lock);
760 if (slept)
761 goto retry;
762 }
763
/*
 * Set the block size recorded in blksize_size[] for 'dev' to 'size' and
 * get rid of cached buffers of that device that have a different size.
 *
 * Returns without doing anything when the major has no blksize_size
 * table or the recorded size already equals 'size'.  Panics when 'size'
 * is not a power of two between 512 and PAGE_SIZE.
 *
 * The function sleeps on locked buffers; whenever it has slept, the scan
 * over the LRU lists is started again from 'retry'.
 */
764 void set_blocksize(kdev_t dev, int size)
765 {
766 extern int *blksize_size[];
767 int i, nlist, slept;
768 struct buffer_head * bh, * bh_next;
769
770 if (!blksize_size[MAJOR(dev)])
771 return;
772
773 /* Size must be a power of two, and between 512 and PAGE_SIZE */
774 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
775 panic("Invalid blocksize passed to set_blocksize");
776
/* No size recorded yet and BLOCK_SIZE requested: record it, nothing to flush. */
777 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
778 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
779 return;
780 }
781 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
782 return;
/* Write out the device's dirty buffers and wait (non-zero 'wait' argument). */
783 sync_buffers(dev, 2);
784 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
785
786 retry:
787 slept = 0;
788 spin_lock(&lru_list_lock);
789 for(nlist = 0; nlist < NR_LIST; nlist++) {
790 bh = lru_list[nlist];
791 if (!bh)
792 continue;
/* Visit at most as many buffers as the list held when we started on it. */
793 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
/* Fetch the successor first: bh may be taken off this list below. */
794 bh_next = bh->b_next_free;
/* Only buffers of this device that still have a different size matter. */
795 if (bh->b_dev != dev || bh->b_size == size)
796 continue;
797 /* Unhashed? */
798 if (!bh->b_pprev)
799 continue;
800 if (buffer_locked(bh)) {
/* Hold a reference while lru_list_lock is dropped for the wait. */
801 get_bh(bh);
802 spin_unlock(&lru_list_lock);
803 wait_on_buffer(bh);
804 slept = 1;
805 spin_lock(&lru_list_lock);
806 put_bh(bh);
807 }
808
809 write_lock(&hash_table_lock);
810 if (!atomic_read(&bh->b_count)) {
/* Unreferenced: warn if it is still dirty, then free it regardless. */
811 if (buffer_dirty(bh))
812 printk(KERN_WARNING
813 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
814 kdevname(dev), bh->b_blocknr, bh->b_size);
815 remove_inode_queue(bh);
816 __remove_from_queues(bh);
817 put_last_free(bh);
818 } else {
/* Still referenced: mark it clean and not uptodate, and report it. */
819 if (atomic_set_buffer_clean(bh))
820 __refile_buffer(bh);
821 clear_bit(BH_Uptodate, &bh->b_state);
822 printk(KERN_WARNING
823 "set_blocksize: "
824 "b_count %d, dev %s, block %lu, from %p\n",
825 atomic_read(&bh->b_count), bdevname(bh->b_dev),
826 bh->b_blocknr, __builtin_return_address(0));
827 }
828 write_unlock(&hash_table_lock);
/* We slept with lru_list_lock released: abandon this pass and rescan. */
829 if (slept)
830 goto out;
831 }
832 }
833 out:
834 spin_unlock(&lru_list_lock);
835 if (slept)
836 goto retry;
837 }
838
839 static void free_more_memory(void)
840 {
841 balance_dirty();
842 wakeup_bdflush();
843 current->policy |= SCHED_YIELD;
844 __set_current_state(TASK_RUNNING);
845 schedule();
846 }
847
848 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
849 {
850 bh->b_list = BUF_CLEAN;
851 bh->b_end_io = handler;
852 bh->b_private = private;
853 }
854
855 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
856 {
857 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
858 unsigned long flags;
859 struct buffer_head *tmp;
860 struct page *page;
861
862 mark_buffer_uptodate(bh, uptodate);
863
864 /* This is a temporary buffer used for page I/O. */
865 page = bh->b_page;
866
867 if (!uptodate)
868 SetPageError(page);
869
870 /*
871 * Be _very_ careful from here on. Bad things can happen if
872 * two buffer heads end IO at almost the same time and both
873 * decide that the page is now completely done.
874 *
875 * Async buffer_heads are here only as labels for IO, and get
876 * thrown away once the IO for this page is complete. IO is
877 * deemed complete once all buffers have been visited
878 * (b_count==0) and are now unlocked. We must make sure that
879 * only the _last_ buffer that decrements its count is the one
880 * that unlock the page..
881 */
882 spin_lock_irqsave(&page_uptodate_lock, flags);
883 mark_buffer_async(bh, 0);
884 unlock_buffer(bh);
885 tmp = bh->b_this_page;
886 while (tmp != bh) {
887 if (buffer_async(tmp) && buffer_locked(tmp))
888 goto still_busy;
889 tmp = tmp->b_this_page;
890 }
891
892 /* OK, the async IO on this page is complete. */
893 spin_unlock_irqrestore(&page_uptodate_lock, flags);
894
895 /*
896 * if none of the buffers had errors then we can set the
897 * page uptodate:
898 */
899 if (!PageError(page))
900 SetPageUptodate(page);
901
902 /*
903 * Run the hooks that have to be done when a page I/O has completed.
904 */
905 if (PageTestandClearDecrAfter(page))
906 atomic_dec(&nr_async_pages);
907
908 UnlockPage(page);
909
910 return;
911
912 still_busy:
913 spin_unlock_irqrestore(&page_uptodate_lock, flags);
914 return;
915 }
916
917 inline void set_buffer_async_io(struct buffer_head *bh) {
918 bh->b_end_io = end_buffer_io_async ;
919 mark_buffer_async(bh, 1);
920 }
921
922 /*
923 * Synchronise all the inode's dirty buffers to the disk.
924 *
925 * We have conflicting pressures: we want to make sure that all
926 * initially dirty buffers get waited on, but that any subsequently
927 * dirtied buffers don't. After all, we don't want fsync to last
928 * forever if somebody is actively writing to the file.
929 *
930 * Do this in two main stages: first we copy dirty buffers to a
931 * temporary inode list, queueing the writes as we go. Then we clean
932 * up, waiting for those writes to complete.
933 *
934 * During this second stage, any subsequent updates to the file may end
935 * up refiling the buffer on the original inode's dirty list again, so
936 * there is a chance we will end up with a buffer queued for write but
937 * not yet completed on that list. So, as a final cleanup we go through
938 * the osync code to catch these locked, dirty buffers without requeuing
939 * any newly dirty buffers for write.
940 */
941
942 int fsync_inode_buffers(struct inode *inode)
943 {
944 struct buffer_head *bh;
945 struct inode tmp;
946 int err = 0, err2;
947
948 INIT_LIST_HEAD(&tmp.i_dirty_buffers);
949
950 spin_lock(&lru_list_lock);
951
952 while (!list_empty(&inode->i_dirty_buffers)) {
953 bh = BH_ENTRY(inode->i_dirty_buffers.next);
954 list_del(&bh->b_inode_buffers);
955 if (!buffer_dirty(bh) && !buffer_locked(bh))
956 bh->b_inode = NULL;
957 else {
958 bh->b_inode = &tmp;
959 list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
960 if (buffer_dirty(bh)) {
961 get_bh(bh);
962 spin_unlock(&lru_list_lock);
963 ll_rw_block(WRITE, 1, &bh);
964 brelse(bh);
965 spin_lock(&lru_list_lock);
966 }
967 }
968 }
969
970 while (!list_empty(&tmp.i_dirty_buffers)) {
971 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
972 remove_inode_queue(bh);
973 get_bh(bh);
974 spin_unlock(&lru_list_lock);
975 wait_on_buffer(bh);
976 if (!buffer_uptodate(bh))
977 err = -EIO;
978 brelse(bh);
979 spin_lock(&lru_list_lock);
980 }
981
982 spin_unlock(&lru_list_lock);
983 err2 = osync_inode_buffers(inode);
984
985 if (err)
986 return err;
987 else
988 return err2;
989 }
990
991 int fsync_inode_data_buffers(struct inode *inode)
992 {
993 struct buffer_head *bh;
994 struct inode tmp;
995 int err = 0, err2;
996
997 INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
998
999 spin_lock(&lru_list_lock);
1000
1001 while (!list_empty(&inode->i_dirty_data_buffers)) {
1002 bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
1003 list_del(&bh->b_inode_buffers);
1004 if (!buffer_dirty(bh) && !buffer_locked(bh))
1005 bh->b_inode = NULL;
1006 else {
1007 bh->b_inode = &tmp;
1008 list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
1009 if (buffer_dirty(bh)) {
1010 get_bh(bh);
1011 spin_unlock(&lru_list_lock);
1012 ll_rw_block(WRITE, 1, &bh);
1013 brelse(bh);
1014 spin_lock(&lru_list_lock);
1015 }
1016 }
1017 }
1018
1019 while (!list_empty(&tmp.i_dirty_data_buffers)) {
1020 bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
1021 remove_inode_queue(bh);
1022 get_bh(bh);
1023 spin_unlock(&lru_list_lock);
1024 wait_on_buffer(bh);
1025 if (!buffer_uptodate(bh))
1026 err = -EIO;
1027 brelse(bh);
1028 spin_lock(&lru_list_lock);
1029 }
1030
1031 spin_unlock(&lru_list_lock);
1032 err2 = osync_inode_data_buffers(inode);
1033
1034 if (err)
1035 return err;
1036 else
1037 return err2;
1038 }
1039
1040 /*
1041 * osync is designed to support O_SYNC io. It waits synchronously for
1042 * all already-submitted IO to complete, but does not queue any new
1043 * writes to the disk.
1044 *
1045 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
1046 * you dirty the buffers, and then use osync_inode_buffers to wait for
1047 * completion. Any other dirty buffers which are not yet queued for
1048 * write will not be flushed to disk by the osync.
1049 */
1050
1051 int osync_inode_buffers(struct inode *inode)
1052 {
1053 struct buffer_head *bh;
1054 struct list_head *list;
1055 int err = 0;
1056
1057 spin_lock(&lru_list_lock);
1058
1059 repeat:
1060
1061 for (list = inode->i_dirty_buffers.prev;
1062 bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
1063 list = bh->b_inode_buffers.prev) {
1064 if (buffer_locked(bh)) {
1065 get_bh(bh);
1066 spin_unlock(&lru_list_lock);
1067 wait_on_buffer(bh);
1068 if (!buffer_uptodate(bh))
1069 err = -EIO;
1070 brelse(bh);
1071 spin_lock(&lru_list_lock);
1072 goto repeat;
1073 }
1074 }
1075
1076 spin_unlock(&lru_list_lock);
1077 return err;
1078 }
1079
1080 int osync_inode_data_buffers(struct inode *inode)
1081 {
1082 struct buffer_head *bh;
1083 struct list_head *list;
1084 int err = 0;
1085
1086 spin_lock(&lru_list_lock);
1087
1088 repeat:
1089
1090 for (list = inode->i_dirty_data_buffers.prev;
1091 bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
1092 list = bh->b_inode_buffers.prev) {
1093 if (buffer_locked(bh)) {
1094 get_bh(bh);
1095 spin_unlock(&lru_list_lock);
1096 wait_on_buffer(bh);
1097 if (!buffer_uptodate(bh))
1098 err = -EIO;
1099 brelse(bh);
1100 spin_lock(&lru_list_lock);
1101 goto repeat;
1102 }
1103 }
1104
1105 spin_unlock(&lru_list_lock);
1106 return err;
1107 }
1108
1109
1110 /*
1111 * Invalidate any and all dirty buffers on a given inode. We are
1112 * probably unmounting the fs, but that doesn't mean we have already
1113 * done a sync(). Just drop the buffers from the inode list.
1114 */
1115 void invalidate_inode_buffers(struct inode *inode)
1116 {
1117 struct list_head * entry;
1118
1119 spin_lock(&lru_list_lock);
1120 while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
1121 remove_inode_queue(BH_ENTRY(entry));
1122 while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
1123 remove_inode_queue(BH_ENTRY(entry));
1124 spin_unlock(&lru_list_lock);
1125 }
1126
1127
1128 /*
1129 * Ok, this is getblk, and it isn't very clear, again to hinder
1130 * race-conditions. Most of the code is seldom used, (ie repeating),
1131 * so it should be much more efficient than it looks.
1132 *
1133 * The algorithm is changed: hopefully better, and an elusive bug removed.
1134 *
1135 * 14.02.92: changed it to sync dirty buffers a bit: better performance
1136 * when the filesystem starts to get full of dirty blocks (I hope).
1137 */
1138 struct buffer_head * getblk(kdev_t dev, int block, int size)
1139 {
1140 struct buffer_head * bh;
1141 int isize;
1142
1143 repeat:
1144 spin_lock(&lru_list_lock);
1145 write_lock(&hash_table_lock);
1146 bh = __get_hash_table(dev, block, size);
1147 if (bh)
1148 goto out;
1149
1150 isize = BUFSIZE_INDEX(size);
1151 spin_lock(&free_list[isize].lock);
1152 bh = free_list[isize].list;
1153 if (bh) {
1154 __remove_from_free_list(bh, isize);
1155 atomic_set(&bh->b_count, 1);
1156 }
1157 spin_unlock(&free_list[isize].lock);
1158
1159 /*
1160 * OK, FINALLY we know that this buffer is the only one of
1161 * its kind, we hold a reference (b_count>0), it is unlocked,
1162 * and it is clean.
1163 */
1164 if (bh) {
1165 init_buffer(bh, NULL, NULL);
1166 bh->b_dev = dev;
1167 bh->b_blocknr = block;
1168 bh->b_state = 1 << BH_Mapped;
1169
1170 /* Insert the buffer into the regular lists */
1171 __insert_into_queues(bh);
1172 out:
1173 write_unlock(&hash_table_lock);
1174 spin_unlock(&lru_list_lock);
1175 touch_buffer(bh);
1176 return bh;
1177 }
1178
1179 /*
1180 * If we block while refilling the free list, somebody may
1181 * create the buffer first ... search the hashes again.
1182 */
1183 write_unlock(&hash_table_lock);
1184 spin_unlock(&lru_list_lock);
1185
1186 if (!grow_buffers(size))
1187 free_more_memory();
1188
1189 /* FIXME: getblk should fail if there's no enough memory */
1190 goto repeat;
1191 }
1192
1193 /* -1 -> no need to flush
1194 0 -> async flush
1195 1 -> sync flush (wait for I/O completion) */
1196 static int balance_dirty_state(void)
1197 {
1198 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1199
1200 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1201 tot = nr_free_buffer_pages();
1202
1203 dirty *= 100;
1204 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1205 hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1206
1207 /* First, check for the "real" dirty limit. */
1208 if (dirty > soft_dirty_limit) {
1209 if (dirty > hard_dirty_limit)
1210 return 1;
1211 return 0;
1212 }
1213
1214 return -1;
1215 }
1216
1217 /*
1218 * if a new dirty buffer is created we need to balance bdflush.
1219 *
1220 * in the future we might want to make bdflush aware of different
1221 * pressures on different devices - thus the (currently unused)
1222 * 'dev' parameter.
1223 */
1224 void balance_dirty(void)
1225 {
1226 int state = balance_dirty_state();
1227
1228 if (state < 0)
1229 return;
1230
1231 /* If we're getting into imbalance, start write-out */
1232 spin_lock(&lru_list_lock);
1233 write_some_buffers(NODEV);
1234
1235 /*
1236 * And if we're _really_ out of balance, wait for
1237 * some of the dirty/locked buffers ourselves and
1238 * start bdflush.
1239 * This will throttle heavy writers.
1240 */
1241 if (state > 0) {
1242 wait_for_some_buffers(NODEV);
1243 wakeup_bdflush();
1244 }
1245 }
1246
1247 inline void __mark_dirty(struct buffer_head *bh)
1248 {
1249 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1250 refile_buffer(bh);
1251 }
1252
/*
 * Atomic version of mark_buffer_dirty(): it does not call
 * balance_dirty(), so the user must call balance_dirty() by hand as
 * soon as it becomes possible to block.
 *
 * Nothing is done if the dirty bit was already set.
 */
void __mark_buffer_dirty(struct buffer_head *bh)
{
	if (atomic_set_buffer_dirty(bh))
		return;		/* was dirty already */

	__mark_dirty(bh);
}
1260
/*
 * Mark a buffer dirty.  If the dirty bit was previously clear the
 * buffer is timestamped and refiled, and balance_dirty() is run to
 * keep the dirty-buffer load in check.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	if (atomic_set_buffer_dirty(bh))
		return;		/* was dirty already */

	__mark_dirty(bh);
	balance_dirty();
}
1268
1269 /*
1270 * A buffer may need to be moved from one buffer list to another
1271 * (e.g. in case it is not shared any more). Handle this.
1272 */
1273 static void __refile_buffer(struct buffer_head *bh)
1274 {
1275 int dispose = BUF_CLEAN;
1276 if (buffer_locked(bh))
1277 dispose = BUF_LOCKED;
1278 if (buffer_dirty(bh))
1279 dispose = BUF_DIRTY;
1280 if (dispose != bh->b_list) {
1281 __remove_from_lru_list(bh, bh->b_list);
1282 bh->b_list = dispose;
1283 if (dispose == BUF_CLEAN)
1284 remove_inode_queue(bh);
1285 __insert_into_lru_list(bh, dispose);
1286 }
1287 }
1288
1289 void refile_buffer(struct buffer_head *bh)
1290 {
1291 spin_lock(&lru_list_lock);
1292 __refile_buffer(bh);
1293 spin_unlock(&lru_list_lock);
1294 }
1295
1296 /*
1297 * Release a buffer head
1298 */
1299 void __brelse(struct buffer_head * buf)
1300 {
1301 if (atomic_read(&buf->b_count)) {
1302 put_bh(buf);
1303 return;
1304 }
1305 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1306 }
1307
1308 /*
1309 * bforget() is like brelse(), except it puts the buffer on the
1310 * free list if it can.. We can NOT free the buffer if:
1311 * - there are other users of it
1312 * - it is locked and thus can have active IO
1313 */
1314 void __bforget(struct buffer_head * buf)
1315 {
1316 /* grab the lru lock here to block bdflush. */
1317 spin_lock(&lru_list_lock);
1318 write_lock(&hash_table_lock);
1319 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1320 goto in_use;
1321 __hash_unlink(buf);
1322 write_unlock(&hash_table_lock);
1323 remove_inode_queue(buf);
1324 __remove_from_lru_list(buf, buf->b_list);
1325 spin_unlock(&lru_list_lock);
1326 put_last_free(buf);
1327 return;
1328
1329 in_use:
1330 write_unlock(&hash_table_lock);
1331 spin_unlock(&lru_list_lock);
1332 }
1333
1334 /**
1335 * bread() - reads a specified block and returns the bh
1336 * @block: number of block
1337 * @size: size (in bytes) to read
1338 *
1339 * Reads a specified block, and returns buffer head that
1340 * contains it. It returns NULL if the block was unreadable.
1341 */
1342 struct buffer_head * bread(kdev_t dev, int block, int size)
1343 {
1344 struct buffer_head * bh;
1345
1346 bh = getblk(dev, block, size);
1347 if (buffer_uptodate(bh))
1348 return bh;
1349 ll_rw_block(READ, 1, &bh);
1350 wait_on_buffer(bh);
1351 if (buffer_uptodate(bh))
1352 return bh;
1353 brelse(bh);
1354 return NULL;
1355 }
1356
1357 /*
1358 * Note: the caller should wake up the buffer_wait list if needed.
1359 */
1360 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1361 {
1362 if (bh->b_inode)
1363 BUG();
1364 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1365 kmem_cache_free(bh_cachep, bh);
1366 } else {
1367 bh->b_blocknr = -1;
1368 bh->b_this_page = NULL;
1369
1370 nr_unused_buffer_heads++;
1371 bh->b_next_free = unused_list;
1372 unused_list = bh;
1373 }
1374 }
1375
1376 /*
1377 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1378 * no-buffer-head deadlock. Return NULL on failure; waiting for
1379 * buffer heads is now handled in create_buffers().
1380 */
1381 static struct buffer_head * get_unused_buffer_head(int async)
1382 {
1383 struct buffer_head * bh;
1384
1385 spin_lock(&unused_list_lock);
1386 if (nr_unused_buffer_heads > NR_RESERVED) {
1387 bh = unused_list;
1388 unused_list = bh->b_next_free;
1389 nr_unused_buffer_heads--;
1390 spin_unlock(&unused_list_lock);
1391 return bh;
1392 }
1393 spin_unlock(&unused_list_lock);
1394
1395 /* This is critical. We can't call out to the FS
1396 * to get more buffer heads, because the FS may need
1397 * more buffer-heads itself. Thus SLAB_NOFS.
1398 */
1399 if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1400 bh->b_blocknr = -1;
1401 bh->b_this_page = NULL;
1402 return bh;
1403 }
1404
1405 /*
1406 * If we need an async buffer, use the reserved buffer heads.
1407 */
1408 if (async) {
1409 spin_lock(&unused_list_lock);
1410 if (unused_list) {
1411 bh = unused_list;
1412 unused_list = bh->b_next_free;
1413 nr_unused_buffer_heads--;
1414 spin_unlock(&unused_list_lock);
1415 return bh;
1416 }
1417 spin_unlock(&unused_list_lock);
1418 }
1419 #if 0
1420 /*
1421 * (Pending further analysis ...)
1422 * Ordinary (non-async) requests can use a different memory priority
1423 * to free up pages. Any swapping thus generated will use async
1424 * buffer heads.
1425 */
1426 if(!async &&
1427 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1428 memset(bh, 0, sizeof(*bh));
1429 init_waitqueue_head(&bh->b_wait);
1430 return bh;
1431 }
1432 #endif
1433
1434 return NULL;
1435 }
1436
1437 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1438 {
1439 bh->b_page = page;
1440 if (offset >= PAGE_SIZE)
1441 BUG();
1442 if (PageHighMem(page))
1443 /*
1444 * This catches illegal uses and preserves the offset:
1445 */
1446 bh->b_data = (char *)(0 + offset);
1447 else
1448 bh->b_data = page_address(page) + offset;
1449 }
1450
1451 /*
1452 * Create the appropriate buffers when given a page for data area and
1453 * the size of each buffer.. Use the bh->b_this_page linked list to
1454 * follow the buffers created. Return NULL if unable to create more
1455 * buffers.
1456 * The async flag is used to differentiate async IO (paging, swapping)
1457 * from ordinary buffer allocations, and only async requests are allowed
1458 * to sleep waiting for buffer heads.
1459 */
1460 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1461 {
1462 struct buffer_head *bh, *head;
1463 long offset;
1464
1465 try_again:
1466 head = NULL;
1467 offset = PAGE_SIZE;
1468 while ((offset -= size) >= 0) {
1469 bh = get_unused_buffer_head(async);
1470 if (!bh)
1471 goto no_grow;
1472
1473 bh->b_dev = B_FREE; /* Flag as unused */
1474 bh->b_this_page = head;
1475 head = bh;
1476
1477 bh->b_state = 0;
1478 bh->b_next_free = NULL;
1479 bh->b_pprev = NULL;
1480 atomic_set(&bh->b_count, 0);
1481 bh->b_size = size;
1482
1483 set_bh_page(bh, page, offset);
1484
1485 bh->b_list = BUF_CLEAN;
1486 bh->b_end_io = NULL;
1487 }
1488 return head;
1489 /*
1490 * In case anything failed, we just free everything we got.
1491 */
1492 no_grow:
1493 if (head) {
1494 spin_lock(&unused_list_lock);
1495 do {
1496 bh = head;
1497 head = head->b_this_page;
1498 __put_unused_buffer_head(bh);
1499 } while (head);
1500 spin_unlock(&unused_list_lock);
1501
1502 /* Wake up any waiters ... */
1503 wake_up(&buffer_wait);
1504 }
1505
1506 /*
1507 * Return failure for non-async IO requests. Async IO requests
1508 * are not allowed to fail, so we have to wait until buffer heads
1509 * become available. But we don't want tasks sleeping with
1510 * partially complete buffers, so all were released above.
1511 */
1512 if (!async)
1513 return NULL;
1514
1515 /* We're _really_ low on memory. Now we just
1516 * wait for old buffer heads to become free due to
1517 * finishing IO. Since this is an async request and
1518 * the reserve list is empty, we're sure there are
1519 * async buffer heads in use.
1520 */
1521 run_task_queue(&tq_disk);
1522
1523 free_more_memory();
1524 goto try_again;
1525 }
1526
1527 static void unmap_buffer(struct buffer_head * bh)
1528 {
1529 if (buffer_mapped(bh)) {
1530 mark_buffer_clean(bh);
1531 lock_buffer(bh);
1532 clear_bit(BH_Uptodate, &bh->b_state);
1533 clear_bit(BH_Mapped, &bh->b_state);
1534 clear_bit(BH_Req, &bh->b_state);
1535 clear_bit(BH_New, &bh->b_state);
1536 unlock_buffer(bh);
1537 }
1538 }
1539
1540 /*
1541 * We don't have to release all buffers here, but
1542 * we have to be sure that no dirty buffer is left
1543 * and no IO is going on (no buffer is locked), because
1544 * we have truncated the file and are going to free the
1545 * blocks on-disk..
1546 */
1547 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1548 {
1549 struct buffer_head *head, *bh, *next;
1550 unsigned int curr_off = 0;
1551
1552 if (!PageLocked(page))
1553 BUG();
1554 if (!page->buffers)
1555 return 1;
1556
1557 head = page->buffers;
1558 bh = head;
1559 do {
1560 unsigned int next_off = curr_off + bh->b_size;
1561 next = bh->b_this_page;
1562
1563 /*
1564 * is this block fully flushed?
1565 */
1566 if (offset <= curr_off)
1567 unmap_buffer(bh);
1568 curr_off = next_off;
1569 bh = next;
1570 } while (bh != head);
1571
1572 /*
1573 * subtle. We release buffer-heads only if this is
1574 * the 'final' flushpage. We have invalidated the get_block
1575 * cached value unconditionally, so real IO is not
1576 * possible anymore.
1577 *
1578 * If the free doesn't work out, the buffers can be
1579 * left around - they just turn into anonymous buffers
1580 * instead.
1581 */
1582 if (!offset) {
1583 if (!try_to_free_buffers(page, 0)) {
1584 if (drop_pagecache)
1585 atomic_inc(&buffermem_pages);
1586 return 0;
1587 }
1588 }
1589
1590 return 1;
1591 }
1592
1593 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1594 {
1595 struct buffer_head *bh, *head, *tail;
1596
1597 /* FIXME: create_buffers should fail if there's no enough memory */
1598 head = create_buffers(page, blocksize, 1);
1599 if (page->buffers)
1600 BUG();
1601
1602 bh = head;
1603 do {
1604 bh->b_dev = dev;
1605 bh->b_blocknr = 0;
1606 bh->b_end_io = NULL;
1607 tail = bh;
1608 bh = bh->b_this_page;
1609 } while (bh);
1610 tail->b_this_page = head;
1611 page->buffers = head;
1612 page_cache_get(page);
1613 }
1614
1615 /*
1616 * We are taking a block for data and we don't want any output from any
1617 * buffer-cache aliases starting from return from that function and
1618 * until the moment when something will explicitly mark the buffer
1619 * dirty (hopefully that will not happen until we will free that block ;-)
1620 * We don't even need to mark it not-uptodate - nobody can expect
1621 * anything from a newly allocated buffer anyway. We used to used
1622 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1623 * don't want to mark the alias unmapped, for example - it would confuse
1624 * anyone who might pick it with bread() afterwards...
1625 */
1626
1627 static void unmap_underlying_metadata(struct buffer_head * bh)
1628 {
1629 struct buffer_head *old_bh;
1630
1631 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1632 if (old_bh) {
1633 mark_buffer_clean(old_bh);
1634 wait_on_buffer(old_bh);
1635 clear_bit(BH_Req, &old_bh->b_state);
1636 /* Here we could run brelse or bforget. We use
1637 bforget because it will try to put the buffer
1638 in the freelist. */
1639 __bforget(old_bh);
1640 }
1641 }
1642
1643 /*
1644 * NOTE! All mapped/uptodate combinations are valid:
1645 *
1646 * Mapped Uptodate Meaning
1647 *
1648 * No No "unknown" - must do get_block()
1649 * No Yes "hole" - zero-filled
1650 * Yes No "allocated" - allocated on disk, not read in
1651 * Yes Yes "valid" - allocated and up-to-date in memory.
1652 *
1653 * "Dirty" is valid only with the last case (mapped+uptodate).
1654 */
1655
1656 /*
1657 * block_write_full_page() is SMP-safe - currently it's still
1658 * being called with the kernel lock held, but the code is ready.
1659 */
1660 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1661 {
1662 int err, i;
1663 unsigned long block;
1664 struct buffer_head *bh, *head;
1665
1666 if (!PageLocked(page))
1667 BUG();
1668
1669 if (!page->buffers)
1670 create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize);
1671 head = page->buffers;
1672
1673 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1674
1675 bh = head;
1676 i = 0;
1677
1678 /* Stage 1: make sure we have all the buffers mapped! */
1679 do {
1680 /*
1681 * If the buffer isn't up-to-date, we can't be sure
1682 * that the buffer has been initialized with the proper
1683 * block number information etc..
1684 *
1685 * Leave it to the low-level FS to make all those
1686 * decisions (block #0 may actually be a valid block)
1687 */
1688 if (!buffer_mapped(bh)) {
1689 err = get_block(inode, block, bh, 1);
1690 if (err)
1691 goto out;
1692 if (buffer_new(bh))
1693 unmap_underlying_metadata(bh);
1694 }
1695 bh = bh->b_this_page;
1696 block++;
1697 } while (bh != head);
1698
1699 /* Stage 2: lock the buffers, mark them clean */
1700 do {
1701 lock_buffer(bh);
1702 set_buffer_async_io(bh);
1703 set_bit(BH_Uptodate, &bh->b_state);
1704 clear_bit(BH_Dirty, &bh->b_state);
1705 bh = bh->b_this_page;
1706 } while (bh != head);
1707
1708 /* Stage 3: submit the IO */
1709 do {
1710 struct buffer_head *next = bh->b_this_page;
1711 submit_bh(WRITE, bh);
1712 bh = next;
1713 } while (bh != head);
1714
1715 /* Done - end_buffer_io_async will unlock */
1716 SetPageUptodate(page);
1717 return 0;
1718
1719 out:
1720 ClearPageUptodate(page);
1721 UnlockPage(page);
1722 return err;
1723 }
1724
1725 static int __block_prepare_write(struct inode *inode, struct page *page,
1726 unsigned from, unsigned to, get_block_t *get_block)
1727 {
1728 unsigned block_start, block_end;
1729 unsigned long block;
1730 int err = 0;
1731 unsigned blocksize, bbits;
1732 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1733 char *kaddr = kmap(page);
1734
1735 blocksize = inode->i_sb->s_blocksize;
1736 if (!page->buffers)
1737 create_empty_buffers(page, inode->i_dev, blocksize);
1738 head = page->buffers;
1739
1740 bbits = inode->i_sb->s_blocksize_bits;
1741 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1742
1743 for(bh = head, block_start = 0; bh != head || !block_start;
1744 block++, block_start=block_end, bh = bh->b_this_page) {
1745 if (!bh)
1746 BUG();
1747 block_end = block_start+blocksize;
1748 if (block_end <= from)
1749 continue;
1750 if (block_start >= to)
1751 break;
1752 if (!buffer_mapped(bh)) {
1753 err = get_block(inode, block, bh, 1);
1754 if (err)
1755 goto out;
1756 if (buffer_new(bh)) {
1757 unmap_underlying_metadata(bh);
1758 if (Page_Uptodate(page)) {
1759 set_bit(BH_Uptodate, &bh->b_state);
1760 continue;
1761 }
1762 if (block_end > to)
1763 memset(kaddr+to, 0, block_end-to);
1764 if (block_start < from)
1765 memset(kaddr+block_start, 0, from-block_start);
1766 if (block_end > to || block_start < from)
1767 flush_dcache_page(page);
1768 continue;
1769 }
1770 }
1771 if (Page_Uptodate(page)) {
1772 set_bit(BH_Uptodate, &bh->b_state);
1773 continue;
1774 }
1775 if (!buffer_uptodate(bh) &&
1776 (block_start < from || block_end > to)) {
1777 ll_rw_block(READ, 1, &bh);
1778 *wait_bh++=bh;
1779 }
1780 }
1781 /*
1782 * If we issued read requests - let them complete.
1783 */
1784 while(wait_bh > wait) {
1785 wait_on_buffer(*--wait_bh);
1786 err = -EIO;
1787 if (!buffer_uptodate(*wait_bh))
1788 goto out;
1789 }
1790 return 0;
1791 out:
1792 return err;
1793 }
1794
1795 static int __block_commit_write(struct inode *inode, struct page *page,
1796 unsigned from, unsigned to)
1797 {
1798 unsigned block_start, block_end;
1799 int partial = 0, need_balance_dirty = 0;
1800 unsigned blocksize;
1801 struct buffer_head *bh, *head;
1802
1803 blocksize = inode->i_sb->s_blocksize;
1804
1805 for(bh = head = page->buffers, block_start = 0;
1806 bh != head || !block_start;
1807 block_start=block_end, bh = bh->b_this_page) {
1808 block_end = block_start + blocksize;
1809 if (block_end <= from || block_start >= to) {
1810 if (!buffer_uptodate(bh))
1811 partial = 1;
1812 } else {
1813 set_bit(BH_Uptodate, &bh->b_state);
1814 if (!atomic_set_buffer_dirty(bh)) {
1815 __mark_dirty(bh);
1816 buffer_insert_inode_data_queue(bh, inode);
1817 need_balance_dirty = 1;
1818 }
1819 }
1820 }
1821
1822 if (need_balance_dirty)
1823 balance_dirty();
1824 /*
1825 * is this a partial write that happened to make all buffers
1826 * uptodate then we can optimize away a bogus readpage() for
1827 * the next read(). Here we 'discover' wether the page went
1828 * uptodate as a result of this (potentially partial) write.
1829 */
1830 if (!partial)
1831 SetPageUptodate(page);
1832 return 0;
1833 }
1834
1835 /*
1836 * Generic "read page" function for block devices that have the normal
1837 * get_block functionality. This is most of the block device filesystems.
1838 * Reads the page asynchronously --- the unlock_buffer() and
1839 * mark_buffer_uptodate() functions propagate buffer state into the
1840 * page struct once IO has completed.
1841 */
1842 int block_read_full_page(struct page *page, get_block_t *get_block)
1843 {
1844 struct inode *inode = page->mapping->host;
1845 unsigned long iblock, lblock;
1846 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1847 unsigned int blocksize, blocks;
1848 int nr, i;
1849
1850 if (!PageLocked(page))
1851 PAGE_BUG(page);
1852 blocksize = inode->i_sb->s_blocksize;
1853 if (!page->buffers)
1854 create_empty_buffers(page, inode->i_dev, blocksize);
1855 head = page->buffers;
1856
1857 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1858 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1859 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1860 bh = head;
1861 nr = 0;
1862 i = 0;
1863
1864 do {
1865 if (buffer_uptodate(bh))
1866 continue;
1867
1868 if (!buffer_mapped(bh)) {
1869 if (iblock < lblock) {
1870 if (get_block(inode, iblock, bh, 0))
1871 continue;
1872 }
1873 if (!buffer_mapped(bh)) {
1874 memset(kmap(page) + i*blocksize, 0, blocksize);
1875 flush_dcache_page(page);
1876 kunmap(page);
1877 set_bit(BH_Uptodate, &bh->b_state);
1878 continue;
1879 }
1880 /* get_block() might have updated the buffer synchronously */
1881 if (buffer_uptodate(bh))
1882 continue;
1883 }
1884
1885 arr[nr] = bh;
1886 nr++;
1887 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1888
1889 if (!nr) {
1890 /*
1891 * all buffers are uptodate - we can set the page
1892 * uptodate as well.
1893 */
1894 SetPageUptodate(page);
1895 UnlockPage(page);
1896 return 0;
1897 }
1898
1899 /* Stage two: lock the buffers */
1900 for (i = 0; i < nr; i++) {
1901 struct buffer_head * bh = arr[i];
1902 lock_buffer(bh);
1903 set_buffer_async_io(bh);
1904 }
1905
1906 /* Stage 3: start the IO */
1907 for (i = 0; i < nr; i++)
1908 submit_bh(READ, arr[i]);
1909
1910 return 0;
1911 }
1912
1913 /*
1914 * For moronic filesystems that do not allow holes in file.
1915 * We may have to extend the file.
1916 */
1917
1918 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1919 {
1920 struct address_space *mapping = page->mapping;
1921 struct inode *inode = mapping->host;
1922 struct page *new_page;
1923 unsigned long pgpos;
1924 long status;
1925 unsigned zerofrom;
1926 unsigned blocksize = inode->i_sb->s_blocksize;
1927 char *kaddr;
1928
1929 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1930 status = -ENOMEM;
1931 new_page = grab_cache_page(mapping, pgpos);
1932 if (!new_page)
1933 goto out;
1934 /* we might sleep */
1935 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1936 UnlockPage(new_page);
1937 page_cache_release(new_page);
1938 continue;
1939 }
1940 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1941 if (zerofrom & (blocksize-1)) {
1942 *bytes |= (blocksize-1);
1943 (*bytes)++;
1944 }
1945 status = __block_prepare_write(inode, new_page, zerofrom,
1946 PAGE_CACHE_SIZE, get_block);
1947 if (status)
1948 goto out_unmap;
1949 kaddr = page_address(new_page);
1950 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1951 flush_dcache_page(new_page);
1952 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1953 kunmap(new_page);
1954 UnlockPage(new_page);
1955 page_cache_release(new_page);
1956 }
1957
1958 if (page->index < pgpos) {
1959 /* completely inside the area */
1960 zerofrom = offset;
1961 } else {
1962 /* page covers the boundary, find the boundary offset */
1963 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1964
1965 /* if we will expand the thing last block will be filled */
1966 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1967 *bytes |= (blocksize-1);
1968 (*bytes)++;
1969 }
1970
1971 /* starting below the boundary? Nothing to zero out */
1972 if (offset <= zerofrom)
1973 zerofrom = offset;
1974 }
1975 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1976 if (status)
1977 goto out1;
1978 kaddr = page_address(page);
1979 if (zerofrom < offset) {
1980 memset(kaddr+zerofrom, 0, offset-zerofrom);
1981 flush_dcache_page(page);
1982 __block_commit_write(inode, page, zerofrom, offset);
1983 }
1984 return 0;
1985 out1:
1986 ClearPageUptodate(page);
1987 kunmap(page);
1988 return status;
1989
1990 out_unmap:
1991 ClearPageUptodate(new_page);
1992 kunmap(new_page);
1993 UnlockPage(new_page);
1994 page_cache_release(new_page);
1995 out:
1996 return status;
1997 }
1998
1999 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2000 get_block_t *get_block)
2001 {
2002 struct inode *inode = page->mapping->host;
2003 int err = __block_prepare_write(inode, page, from, to, get_block);
2004 if (err) {
2005 ClearPageUptodate(page);
2006 kunmap(page);
2007 }
2008 return err;
2009 }
2010
2011 int generic_commit_write(struct file *file, struct page *page,
2012 unsigned from, unsigned to)
2013 {
2014 struct inode *inode = page->mapping->host;
2015 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2016 __block_commit_write(inode,page,from,to);
2017 kunmap(page);
2018 if (pos > inode->i_size) {
2019 inode->i_size = pos;
2020 mark_inode_dirty(inode);
2021 }
2022 return 0;
2023 }
2024
2025 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2026 {
2027 unsigned long index = from >> PAGE_CACHE_SHIFT;
2028 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2029 unsigned blocksize, iblock, length, pos;
2030 struct inode *inode = mapping->host;
2031 struct page *page;
2032 struct buffer_head *bh;
2033 int err;
2034
2035 blocksize = inode->i_sb->s_blocksize;
2036 length = offset & (blocksize - 1);
2037
2038 /* Block boundary? Nothing to do */
2039 if (!length)
2040 return 0;
2041
2042 length = blocksize - length;
2043 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
2044
2045 page = grab_cache_page(mapping, index);
2046 err = -ENOMEM;
2047 if (!page)
2048 goto out;
2049
2050 if (!page->buffers)
2051 create_empty_buffers(page, inode->i_dev, blocksize);
2052
2053 /* Find the buffer that contains "offset" */
2054 bh = page->buffers;
2055 pos = blocksize;
2056 while (offset >= pos) {
2057 bh = bh->b_this_page;
2058 iblock++;
2059 pos += blocksize;
2060 }
2061
2062 err = 0;
2063 if (!buffer_mapped(bh)) {
2064 /* Hole? Nothing to do */
2065 if (buffer_uptodate(bh))
2066 goto unlock;
2067 get_block(inode, iblock, bh, 0);
2068 /* Still unmapped? Nothing to do */
2069 if (!buffer_mapped(bh))
2070 goto unlock;
2071 }
2072
2073 /* Ok, it's mapped. Make sure it's up-to-date */
2074 if (Page_Uptodate(page))
2075 set_bit(BH_Uptodate, &bh->b_state);
2076
2077 if (!buffer_uptodate(bh)) {
2078 err = -EIO;
2079 ll_rw_block(READ, 1, &bh);
2080 wait_on_buffer(bh);
2081 /* Uhhuh. Read error. Complain and punt. */
2082 if (!buffer_uptodate(bh))
2083 goto unlock;
2084 }
2085
2086 memset(kmap(page) + offset, 0, length);
2087 flush_dcache_page(page);
2088 kunmap(page);
2089
2090 __mark_buffer_dirty(bh);
2091 err = 0;
2092
2093 unlock:
2094 UnlockPage(page);
2095 page_cache_release(page);
2096 out:
2097 return err;
2098 }
2099
2100 int block_write_full_page(struct page *page, get_block_t *get_block)
2101 {
2102 struct inode *inode = page->mapping->host;
2103 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2104 unsigned offset;
2105 int err;
2106
2107 /* easy case */
2108 if (page->index < end_index)
2109 return __block_write_full_page(inode, page, get_block);
2110
2111 /* things got complicated... */
2112 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2113 /* OK, are we completely out? */
2114 if (page->index >= end_index+1 || !offset) {
2115 UnlockPage(page);
2116 return -EIO;
2117 }
2118
2119 /* Sigh... will have to work, then... */
2120 err = __block_prepare_write(inode, page, 0, offset, get_block);
2121 if (!err) {
2122 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2123 flush_dcache_page(page);
2124 __block_commit_write(inode,page,0,offset);
2125 done:
2126 kunmap(page);
2127 UnlockPage(page);
2128 return err;
2129 }
2130 ClearPageUptodate(page);
2131 goto done;
2132 }
2133
2134 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2135 {
2136 struct buffer_head tmp;
2137 struct inode *inode = mapping->host;
2138 tmp.b_state = 0;
2139 tmp.b_blocknr = 0;
2140 get_block(inode, block, &tmp, 0);
2141 return tmp.b_blocknr;
2142 }
2143
2144 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2145 {
2146 int i, nr_blocks, retval;
2147 unsigned long * blocks = iobuf->blocks;
2148
2149 nr_blocks = iobuf->length / blocksize;
2150 /* build the blocklist */
2151 for (i = 0; i < nr_blocks; i++, blocknr++) {
2152 struct buffer_head bh;
2153
2154 bh.b_state = 0;
2155 bh.b_dev = inode->i_dev;
2156 bh.b_size = blocksize;
2157
2158 retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
2159 if (retval)
2160 goto out;
2161
2162 if (rw == READ) {
2163 if (buffer_new(&bh))
2164 BUG();
2165 if (!buffer_mapped(&bh)) {
2166 /* there was an hole in the filesystem */
2167 blocks[i] = -1UL;
2168 continue;
2169 }
2170 } else {
2171 if (buffer_new(&bh))
2172 unmap_underlying_metadata(&bh);
2173 if (!buffer_mapped(&bh))
2174 BUG();
2175 }
2176 blocks[i] = bh.b_blocknr;
2177 }
2178
2179 retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2180
2181 out:
2182 return retval;
2183 }
2184
2185 /*
2186 * IO completion routine for a buffer_head being used for kiobuf IO: we
2187 * can't dispatch the kiobuf callback until io_count reaches 0.
2188 */
2189
2190 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2191 {
2192 struct kiobuf *kiobuf;
2193
2194 mark_buffer_uptodate(bh, uptodate);
2195
2196 kiobuf = bh->b_private;
2197 unlock_buffer(bh);
2198 end_kio_request(kiobuf, uptodate);
2199 }
2200
2201 /*
2202 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2203 * for them to complete. Clean up the buffer_heads afterwards.
2204 */
2205
2206 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2207 {
2208 int iosize, err;
2209 int i;
2210 struct buffer_head *tmp;
2211
2212 iosize = 0;
2213 err = 0;
2214
2215 for (i = nr; --i >= 0; ) {
2216 iosize += size;
2217 tmp = bh[i];
2218 if (buffer_locked(tmp)) {
2219 wait_on_buffer(tmp);
2220 }
2221
2222 if (!buffer_uptodate(tmp)) {
2223 /* We are traversing bh'es in reverse order so
2224 clearing iosize on error calculates the
2225 amount of IO before the first error. */
2226 iosize = 0;
2227 err = -EIO;
2228 }
2229 }
2230
2231 if (iosize)
2232 return iosize;
2233 return err;
2234 }
2235
2236 /*
2237 * Start I/O on a physical range of kernel memory, defined by a vector
2238 * of kiobuf structs (much like a user-space iovec list).
2239 *
2240 * The kiobuf must already be locked for IO. IO is submitted
2241 * asynchronously: you need to check page->locked, page->uptodate, and
2242 * maybe wait on page->wait.
2243 *
2244 * It is up to the caller to make sure that there are enough blocks
2245 * passed in to completely map the iobufs to disk.
2246 */
2247
2248 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2249 kdev_t dev, unsigned long b[], int size)
2250 {
2251 int err;
2252 int length;
2253 int transferred;
2254 int i;
2255 int bufind;
2256 int pageind;
2257 int bhind;
2258 int offset;
2259 unsigned long blocknr;
2260 struct kiobuf * iobuf = NULL;
2261 struct page * map;
2262 struct buffer_head *tmp, **bhs = NULL;
2263
2264 if (!nr)
2265 return 0;
2266
2267 /*
2268 * First, do some alignment and validity checks
2269 */
2270 for (i = 0; i < nr; i++) {
2271 iobuf = iovec[i];
2272 if ((iobuf->offset & (size-1)) ||
2273 (iobuf->length & (size-1)))
2274 return -EINVAL;
2275 if (!iobuf->nr_pages)
2276 panic("brw_kiovec: iobuf not initialised");
2277 }
2278
2279 /*
2280 * OK to walk down the iovec doing page IO on each page we find.
2281 */
2282 bufind = bhind = transferred = err = 0;
2283 for (i = 0; i < nr; i++) {
2284 iobuf = iovec[i];
2285 offset = iobuf->offset;
2286 length = iobuf->length;
2287 iobuf->errno = 0;
2288 if (!bhs)
2289 bhs = iobuf->bh;
2290
2291 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2292 map = iobuf->maplist[pageind];
2293 if (!map) {
2294 err = -EFAULT;
2295 goto finished;
2296 }
2297
2298 while (length > 0) {
2299 blocknr = b[bufind++];
2300 if (blocknr == -1UL) {
2301 if (rw == READ) {
2302 /* there was an hole in the filesystem */
2303 memset(kmap(map) + offset, 0, size);
2304 flush_dcache_page(map);
2305 kunmap(map);
2306
2307 transferred += size;
2308 goto skip_block;
2309 } else
2310 BUG();
2311 }
2312 tmp = bhs[bhind++];
2313
2314 tmp->b_dev = B_FREE;
2315 tmp->b_size = size;
2316 set_bh_page(tmp, map, offset);
2317 tmp->b_this_page = tmp;
2318
2319 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2320 tmp->b_dev = dev;
2321 tmp->b_blocknr = blocknr;
2322 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2323
2324 if (rw == WRITE) {
2325 set_bit(BH_Uptodate, &tmp->b_state);
2326 clear_bit(BH_Dirty, &tmp->b_state);
2327 } else
2328 set_bit(BH_Uptodate, &tmp->b_state);
2329
2330 atomic_inc(&iobuf->io_count);
2331 submit_bh(rw, tmp);
2332 /*
2333 * Wait for IO if we have got too much
2334 */
2335 if (bhind >= KIO_MAX_SECTORS) {
2336 kiobuf_wait_for_io(iobuf); /* wake-one */
2337 err = wait_kio(rw, bhind, bhs, size);
2338 if (err >= 0)
2339 transferred += err;
2340 else
2341 goto finished;
2342 bhind = 0;
2343 }
2344
2345 skip_block:
2346 length -= size;
2347 offset += size;
2348
2349 if (offset >= PAGE_SIZE) {
2350 offset = 0;
2351 break;
2352 }
2353 } /* End of block loop */
2354 } /* End of page loop */
2355 } /* End of iovec loop */
2356
2357 /* Is there any IO still left to submit? */
2358 if (bhind) {
2359 kiobuf_wait_for_io(iobuf); /* wake-one */
2360 err = wait_kio(rw, bhind, bhs, size);
2361 if (err >= 0)
2362 transferred += err;
2363 else
2364 goto finished;
2365 }
2366
2367 finished:
2368 if (transferred)
2369 return transferred;
2370 return err;
2371 }
2372
2373 /*
2374 * Start I/O on a page.
2375 * This function expects the page to be locked and may return
2376 * before I/O is complete. You then have to check page->locked,
2377 * page->uptodate, and maybe wait on page->wait.
2378 *
2379 * brw_page() is SMP-safe, although it's being called with the
2380 * kernel lock held - but the code is ready.
2381 *
2382 * FIXME: we need a swapper_inode->get_block function to remove
2383 * some of the bmap kludges and interface ugliness here.
2384 */
2385 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2386 {
2387 struct buffer_head *head, *bh;
2388
2389 if (!PageLocked(page))
2390 panic("brw_page: page not locked for I/O");
2391
2392 if (!page->buffers)
2393 create_empty_buffers(page, dev, size);
2394 head = bh = page->buffers;
2395
2396 /* Stage 1: lock all the buffers */
2397 do {
2398 lock_buffer(bh);
2399 bh->b_blocknr = *(b++);
2400 set_bit(BH_Mapped, &bh->b_state);
2401 set_buffer_async_io(bh);
2402 bh = bh->b_this_page;
2403 } while (bh != head);
2404
2405 /* Stage 2: start the IO */
2406 do {
2407 struct buffer_head *next = bh->b_this_page;
2408 submit_bh(rw, bh);
2409 bh = next;
2410 } while (bh != head);
2411 return 0;
2412 }
2413
2414 int block_symlink(struct inode *inode, const char *symname, int len)
2415 {
2416 struct address_space *mapping = inode->i_mapping;
2417 struct page *page = grab_cache_page(mapping, 0);
2418 int err = -ENOMEM;
2419 char *kaddr;
2420
2421 if (!page)
2422 goto fail;
2423 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2424 if (err)
2425 goto fail_map;
2426 kaddr = page_address(page);
2427 memcpy(kaddr, symname, len-1);
2428 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2429 /*
2430 * Notice that we are _not_ going to block here - end of page is
2431 * unmapped, so this will only try to map the rest of page, see
2432 * that it is unmapped (typically even will not look into inode -
2433 * ->i_size will be enough for everything) and zero it out.
2434 * OTOH it's obviously correct and should make the page up-to-date.
2435 */
2436 err = mapping->a_ops->readpage(NULL, page);
2437 wait_on_page(page);
2438 page_cache_release(page);
2439 if (err < 0)
2440 goto fail;
2441 mark_inode_dirty(inode);
2442 return 0;
2443 fail_map:
2444 UnlockPage(page);
2445 page_cache_release(page);
2446 fail:
2447 return err;
2448 }
2449
2450 /*
2451 * Try to increase the number of buffers available: the size argument
2452 * is used to determine what kind of buffers we want.
2453 */
2454 static int grow_buffers(int size)
2455 {
2456 struct page * page;
2457 struct buffer_head *bh, *tmp;
2458 struct buffer_head * insert_point;
2459 int isize;
2460
2461 if ((size & 511) || (size > PAGE_SIZE)) {
2462 printk(KERN_ERR "VFS: grow_buffers: size = %d\n",size);
2463 return 0;
2464 }
2465
2466 page = alloc_page(GFP_NOFS);
2467 if (!page)
2468 goto out;
2469 LockPage(page);
2470 bh = create_buffers(page, size, 0);
2471 if (!bh)
2472 goto no_buffer_head;
2473
2474 isize = BUFSIZE_INDEX(size);
2475
2476 spin_lock(&free_list[isize].lock);
2477 insert_point = free_list[isize].list;
2478 tmp = bh;
2479 while (1) {
2480 if (insert_point) {
2481 tmp->b_next_free = insert_point->b_next_free;
2482 tmp->b_prev_free = insert_point;
2483 insert_point->b_next_free->b_prev_free = tmp;
2484 insert_point->b_next_free = tmp;
2485 } else {
2486 tmp->b_prev_free = tmp;
2487 tmp->b_next_free = tmp;
2488 }
2489 insert_point = tmp;
2490 if (tmp->b_this_page)
2491 tmp = tmp->b_this_page;
2492 else
2493 break;
2494 }
2495 tmp->b_this_page = bh;
2496 free_list[isize].list = bh;
2497 spin_unlock(&free_list[isize].lock);
2498
2499 page->buffers = bh;
2500 page->flags &= ~(1 << PG_referenced);
2501 lru_cache_add(page);
2502 UnlockPage(page);
2503 atomic_inc(&buffermem_pages);
2504 return 1;
2505
2506 no_buffer_head:
2507 UnlockPage(page);
2508 page_cache_release(page);
2509 out:
2510 return 0;
2511 }
2512
2513 static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
2514 {
2515 struct buffer_head * p = bh;
2516 int tryagain = 1;
2517
2518 do {
2519 if (buffer_dirty(p) || buffer_locked(p)) {
2520 if (test_and_set_bit(BH_Wait_IO, &p->b_state)) {
2521 if (buffer_dirty(p)) {
2522 ll_rw_block(WRITE, 1, &p);
2523 tryagain = 0;
2524 } else if (buffer_locked(p)) {
2525 if (gfp_mask & __GFP_WAIT) {
2526 wait_on_buffer(p);
2527 tryagain = 1;
2528 } else
2529 tryagain = 0;
2530 }
2531 } else
2532 tryagain = 0;
2533 }
2534 p = p->b_this_page;
2535 } while (p != bh);
2536
2537 return tryagain;
2538 }
2539
2540 /*
2541 * Can the buffer be thrown out?
2542 */
2543 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
2544 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2545
2546 /*
2547 * try_to_free_buffers() checks if all the buffers on this particular page
2548 * are unused, and free's the page if so.
2549 *
2550 * Wake up bdflush() if this fails - if we're running low on memory due
2551 * to dirty buffers, we need to flush them out as quickly as possible.
2552 *
2553 * NOTE: There are quite a number of ways that threads of control can
2554 * obtain a reference to a buffer head within a page. So we must
2555 * lock out all of these paths to cleanly toss the page.
2556 */
2557 int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2558 {
2559 struct buffer_head * tmp, * bh = page->buffers;
2560 int index = BUFSIZE_INDEX(bh->b_size);
2561
2562 cleaned_buffers_try_again:
2563 spin_lock(&lru_list_lock);
2564 write_lock(&hash_table_lock);
2565 spin_lock(&free_list[index].lock);
2566 tmp = bh;
2567 do {
2568 if (buffer_busy(tmp))
2569 goto busy_buffer_page;
2570 tmp = tmp->b_this_page;
2571 } while (tmp != bh);
2572
2573 spin_lock(&unused_list_lock);
2574 tmp = bh;
2575 do {
2576 struct buffer_head * p = tmp;
2577 tmp = tmp->b_this_page;
2578
2579 /* The buffer can be either on the regular
2580 * queues or on the free list..
2581 */
2582 if (p->b_dev != B_FREE) {
2583 remove_inode_queue(p);
2584 __remove_from_queues(p);
2585 } else
2586 __remove_from_free_list(p, index);
2587 __put_unused_buffer_head(p);
2588 } while (tmp != bh);
2589 spin_unlock(&unused_list_lock);
2590
2591 /* Wake up anyone waiting for buffer heads */
2592 wake_up(&buffer_wait);
2593
2594 /* And free the page */
2595 page->buffers = NULL;
2596 page_cache_release(page);
2597 spin_unlock(&free_list[index].lock);
2598 write_unlock(&hash_table_lock);
2599 spin_unlock(&lru_list_lock);
2600 return 1;
2601
2602 busy_buffer_page:
2603 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2604 spin_unlock(&free_list[index].lock);
2605 write_unlock(&hash_table_lock);
2606 spin_unlock(&lru_list_lock);
2607 if (gfp_mask & __GFP_IO) {
2608 if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2609 if (sync_page_buffers(bh, gfp_mask)) {
2610 /* no IO or waiting next time */
2611 gfp_mask = 0;
2612 goto cleaned_buffers_try_again;
2613 }
2614 }
2615 }
2616 if (balance_dirty_state() >= 0)
2617 wakeup_bdflush();
2618 return 0;
2619 }
2620
2621 /* ================== Debugging =================== */
2622
2623 void show_buffers(void)
2624 {
2625 #ifdef CONFIG_SMP
2626 struct buffer_head * bh;
2627 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2628 int nlist;
2629 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2630 #endif
2631
2632 printk("Buffer memory: %6dkB\n",
2633 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2634
2635 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2636 if (!spin_trylock(&lru_list_lock))
2637 return;
2638 for(nlist = 0; nlist < NR_LIST; nlist++) {
2639 found = locked = dirty = used = lastused = 0;
2640 bh = lru_list[nlist];
2641 if(!bh) continue;
2642
2643 do {
2644 found++;
2645 if (buffer_locked(bh))
2646 locked++;
2647 if (buffer_dirty(bh))
2648 dirty++;
2649 if (atomic_read(&bh->b_count))
2650 used++, lastused = found;
2651 bh = bh->b_next_free;
2652 } while (bh != lru_list[nlist]);
2653 {
2654 int tmp = nr_buffers_type[nlist];
2655 if (found != tmp)
2656 printk("%9s: BUG -> found %d, reported %d\n",
2657 buf_types[nlist], found, tmp);
2658 }
2659 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2660 "%d locked, %d dirty\n",
2661 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2662 used, lastused, locked, dirty);
2663 }
2664 spin_unlock(&lru_list_lock);
2665 #endif
2666 }
2667
2668 /* ===================== Init ======================= */
2669
2670 /*
2671 * allocate the hash table and init the free list
2672 * Use gfp() for the hash table to decrease TLB misses, use
2673 * SLAB cache for buffer heads.
2674 */
2675 void __init buffer_init(unsigned long mempages)
2676 {
2677 int order, i;
2678 unsigned int nr_hash;
2679
2680 /* The buffer cache hash table is less important these days,
2681 * trim it a bit.
2682 */
2683 mempages >>= 14;
2684
2685 mempages *= sizeof(struct buffer_head *);
2686
2687 for (order = 0; (1 << order) < mempages; order++)
2688 ;
2689
2690 /* try to allocate something until we get it or we're asking
2691 for something that is really too small */
2692
2693 do {
2694 unsigned long tmp;
2695
2696 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2697 bh_hash_mask = (nr_hash - 1);
2698
2699 tmp = nr_hash;
2700 bh_hash_shift = 0;
2701 while((tmp >>= 1UL) != 0UL)
2702 bh_hash_shift++;
2703
2704 hash_table = (struct buffer_head **)
2705 __get_free_pages(GFP_ATOMIC, order);
2706 } while (hash_table == NULL && --order > 0);
2707 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2708 nr_hash, order, (PAGE_SIZE << order));
2709
2710 if (!hash_table)
2711 panic("Failed to allocate buffer hash table\n");
2712
2713 /* Setup hash chains. */
2714 for(i = 0; i < nr_hash; i++)
2715 hash_table[i] = NULL;
2716
2717 /* Setup free lists. */
2718 for(i = 0; i < NR_SIZES; i++) {
2719 free_list[i].list = NULL;
2720 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2721 }
2722
2723 /* Setup lru lists. */
2724 for(i = 0; i < NR_LIST; i++)
2725 lru_list[i] = NULL;
2726
2727 }
2728
2729
2730 /* ====================== bdflush support =================== */
2731
2732 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2733 * response to dirty buffers. Once this process is activated, we write back
2734 * a limited number of buffers to the disks and then go back to sleep again.
2735 */
2736
2737 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2738
2739 void wakeup_bdflush(void)
2740 {
2741 wake_up_interruptible(&bdflush_wait);
2742 }
2743
2744 /*
2745 * Here we attempt to write back old buffers. We also try to flush inodes
2746 * and supers as well, since this function is essentially "update", and
2747 * otherwise there would be no way of ensuring that these quantities ever
2748 * get written back. Ideally, we would have a timestamp on the inodes
2749 * and superblocks so that we could write back only the old ones as well
2750 */
2751
2752 static int sync_old_buffers(void)
2753 {
2754 lock_kernel();
2755 sync_unlocked_inodes();
2756 sync_supers(0);
2757 unlock_kernel();
2758
2759 for (;;) {
2760 struct buffer_head *bh;
2761
2762 spin_lock(&lru_list_lock);
2763 bh = lru_list[BUF_DIRTY];
2764 if (!bh || time_before(jiffies, bh->b_flushtime))
2765 break;
2766 if (write_some_buffers(NODEV))
2767 continue;
2768 return 0;
2769 }
2770 spin_unlock(&lru_list_lock);
2771 return 0;
2772 }
2773
2774 int block_sync_page(struct page *page)
2775 {
2776 run_task_queue(&tq_disk);
2777 return 0;
2778 }
2779
2780 /* This is the interface to bdflush. As we get more sophisticated, we can
2781 * pass tuning parameters to this "process", to adjust how it behaves.
2782 * We would want to verify each parameter, however, to make sure that it
2783 * is reasonable. */
2784
2785 asmlinkage long sys_bdflush(int func, long data)
2786 {
2787 if (!capable(CAP_SYS_ADMIN))
2788 return -EPERM;
2789
2790 if (func == 1) {
2791 /* do_exit directly and let kupdate to do its work alone. */
2792 do_exit(0);
2793 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2794 a syscall that doesn't care about the current mm context. */
2795 int error;
2796 struct mm_struct *user_mm;
2797
2798 /*
2799 * bdflush will spend all of it's time in kernel-space,
2800 * without touching user-space, so we can switch it into
2801 * 'lazy TLB mode' to reduce the cost of context-switches
2802 * to and from bdflush.
2803 */
2804 user_mm = start_lazy_tlb();
2805 error = sync_old_buffers();
2806 end_lazy_tlb(user_mm);
2807 return error;
2808 #endif
2809 }
2810
2811 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2812 if (func >= 2) {
2813 int i = (func-2) >> 1;
2814 if (i >= 0 && i < N_PARAM) {
2815 if ((func & 1) == 0)
2816 return put_user(bdf_prm.data[i], (int*)data);
2817
2818 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2819 bdf_prm.data[i] = data;
2820 return 0;
2821 }
2822 }
2823 return -EINVAL;
2824 }
2825
2826 /* Having func 0 used to launch the actual bdflush and then never
2827 * return (unless explicitly killed). We return zero here to
2828 * remain semi-compatible with present update(8) programs.
2829 */
2830 return 0;
2831 }
2832
2833 /*
2834 * This is the actual bdflush daemon itself. It used to be started from
2835 * the syscall above, but now we launch it ourselves internally with
2836 * kernel_thread(...) directly after the first thread in init/main.c
2837 */
2838 int bdflush(void *startup)
2839 {
2840 struct task_struct *tsk = current;
2841
2842 /*
2843 * We have a bare-bones task_struct, and really should fill
2844 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2845 * display semi-sane things. Not real crucial though...
2846 */
2847
2848 tsk->session = 1;
2849 tsk->pgrp = 1;
2850 strcpy(tsk->comm, "bdflush");
2851
2852 /* avoid getting signals */
2853 spin_lock_irq(&tsk->sigmask_lock);
2854 flush_signals(tsk);
2855 sigfillset(&tsk->blocked);
2856 recalc_sigpending(tsk);
2857 spin_unlock_irq(&tsk->sigmask_lock);
2858
2859 complete((struct completion *)startup);
2860
2861 for (;;) {
2862 CHECK_EMERGENCY_SYNC
2863
2864 spin_lock(&lru_list_lock);
2865 if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
2866 wait_for_some_buffers(NODEV);
2867 interruptible_sleep_on(&bdflush_wait);
2868 }
2869 }
2870 }
2871
2872 /*
2873 * This is the kernel update daemon. It was used to live in userspace
2874 * but since it's need to run safely we want it unkillable by mistake.
2875 * You don't need to change your userspace configuration since
2876 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2877 */
2878 int kupdate(void *startup)
2879 {
2880 struct task_struct * tsk = current;
2881 int interval;
2882
2883 tsk->session = 1;
2884 tsk->pgrp = 1;
2885 strcpy(tsk->comm, "kupdated");
2886
2887 /* sigstop and sigcont will stop and wakeup kupdate */
2888 spin_lock_irq(&tsk->sigmask_lock);
2889 sigfillset(&tsk->blocked);
2890 siginitsetinv(¤t->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2891 recalc_sigpending(tsk);
2892 spin_unlock_irq(&tsk->sigmask_lock);
2893
2894 complete((struct completion *)startup);
2895
2896 for (;;) {
2897 wait_for_some_buffers(NODEV);
2898
2899 /* update interval */
2900 interval = bdf_prm.b_un.interval;
2901 if (interval) {
2902 tsk->state = TASK_INTERRUPTIBLE;
2903 schedule_timeout(interval);
2904 } else {
2905 stop_kupdate:
2906 tsk->state = TASK_STOPPED;
2907 schedule(); /* wait for SIGCONT */
2908 }
2909 /* check for sigstop */
2910 if (signal_pending(tsk)) {
2911 int stopped = 0;
2912 spin_lock_irq(&tsk->sigmask_lock);
2913 if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2914 sigdelset(&tsk->pending.signal, SIGSTOP);
2915 stopped = 1;
2916 }
2917 recalc_sigpending(tsk);
2918 spin_unlock_irq(&tsk->sigmask_lock);
2919 if (stopped)
2920 goto stop_kupdate;
2921 }
2922 #ifdef DEBUG
2923 printk(KERN_DEBUG "kupdate() activated...\n");
2924 #endif
2925 sync_old_buffers();
2926 }
2927 }
2928
2929 static int __init bdflush_init(void)
2930 {
2931 static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
2932
2933 kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2934 wait_for_completion(&startup);
2935 kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2936 wait_for_completion(&startup);
2937 return 0;
2938 }
2939
2940 module_init(bdflush_init)
2941
2942